├── .gitignore ├── .nojekyll ├── LICENSE ├── README.md ├── check_env.ipynb ├── images ├── check_env-1.png ├── check_env-2.png └── download-repo.png ├── notebooks ├── 01 - Data Loading.ipynb ├── 02 - Supervised Learning.ipynb ├── 03 - Preprocessing.ipynb ├── data │ ├── adult.csv │ └── ram_price.csv └── solutions │ ├── load_adult.py │ ├── load_iris.py │ └── train_iris.py ├── slides ├── 01-introduction.html ├── 02-supervised-learning.html ├── 03-preprocessing.html ├── 04-missing_values.html ├── images │ ├── PDSH.png │ ├── alpha_go.png │ ├── amazon1.png │ ├── amazon2.png │ ├── amazon_explanations.png │ ├── api-table.png │ ├── apm.png │ ├── boro_ordinal.png │ ├── boro_ordinal_classification.png │ ├── column_transformer_schematic.png │ ├── cross_validation_new.png │ ├── esl.png │ ├── exoplanet.png │ ├── facebook_gael.png │ ├── fancy_impute_comparison.png │ ├── fb1.png │ ├── fb2.png │ ├── fb3.png │ ├── grid_search_cross_validation_new.png │ ├── house_price_boxplot.png │ ├── house_price_scaled_box.png │ ├── house_price_scatter.png │ ├── imlp.png │ ├── imputation-median-schema.png │ ├── imputation-schema.png │ ├── information_leak_preprocessing.png │ ├── knn_boundary_dataset.png │ ├── knn_boundary_k1.png │ ├── knn_boundary_k3.png │ ├── knn_boundary_test_points.png │ ├── knn_boundary_varying_k.png │ ├── knn_imputation.png │ ├── knn_model_complexity.png │ ├── knn_scaling.png │ ├── knn_scaling2.png │ ├── knn_vs_nearest_centroid.png │ ├── matrix-representation.png │ ├── mean_knn_rf_comparison.png │ ├── med_knn_rf_comparison.png │ ├── median_imputation.png │ ├── missing_values_img_17.png │ ├── missing_values_img_19.png │ ├── missing_values_img_20.png │ ├── missing_values_img_22.png │ ├── missing_values_img_23.png │ ├── missing_values_img_24.png │ ├── missing_values_img_27.png │ ├── nao.png │ ├── no_information_leak_preprocessing.png │ ├── no_separate_scaling.png │ ├── overfitting_underfitting_cartoon_full.png │ ├── overfitting_underfitting_cartoon_generalization.png │ ├── overfitting_underfitting_cartoon_train.png │ ├── pipeline.png │ ├── propublica_compas.png │ ├── reinforcement_cycle.png │ ├── row_nan_col_nan.png │ ├── scaler_comparison_scatter.png │ ├── shuffle_split_cv.png │ ├── sklearn-docs.png │ ├── sklearn_logo.png │ ├── spotify.png │ ├── stratified_cv.png │ ├── supervised-ml-api.png │ ├── supervised-ml-workflow.png │ ├── threefold_split.png │ ├── time_series_cv.png │ ├── train-test-split.png │ ├── train_test_set_2d_classification.png │ ├── train_test_split_new.png │ ├── train_test_validation_split.png │ ├── unsupervised-ml-workflow.png │ └── unsupervised_ml_api.png ├── sklearn_logo.png └── style.css └── todo.rst /.gitignore: -------------------------------------------------------------------------------- 1 | # exclude datasets and externals 2 | notebooks/datasets 3 | notebooks/joblib/ 4 | 5 | # exclude temporary files 6 | .ipynb_checkpoints 7 | .DS_Store 8 | gmon.out 9 | __pycache__ 10 | *.pyc 11 | *.o 12 | *.so 13 | *.gcno 14 | *.swp 15 | *.egg-info 16 | *.egg 17 | *~ 18 | build 19 | dist 20 | lib/test 21 | doc/_build 22 | *env 23 | *ENV 24 | .idea 25 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-1-of-4/917850cf6e751c36edb6d9692c48878b42c7f263/.nojekyll -------------------------------------------------------------------------------- /LICENSE:
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andreas Mueller 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Introduction to Machine Learning with scikit-learn 2 | ======================================================== 3 | 4 | Part 1 of 4 5 | ----------- 6 | Other parts: 7 | - [Part 2](https://github.com/amueller/ml-workshop-2-of-4) 8 | - [Part 3](https://github.com/amueller/ml-workshop-3-of-4) 9 | - [Part 4](https://github.com/amueller/ml-workshop-4-of-4) 10 | 11 | Content 12 | ------- 13 | - [What is machine learning and what can it do for you?](https://amueller.github.io/ml-workshop-1-of-4/slides/01-introduction.html) 14 | - [Data loading and basic API of scikit-learn](https://amueller.github.io/ml-workshop-1-of-4/slides/02-supervised-learning.html) 15 | - [Fundamentals of Data Preprocessing: scaling and categorical data](https://amueller.github.io/ml-workshop-1-of-4/slides/03-preprocessing.html) 16 | - [Imputation: dealing with missing values](https://amueller.github.io/ml-workshop-1-of-4/slides/04-missing_values.html) 17 | 18 | Instructor 19 | ----------- 20 | 21 | - [Andreas Mueller](http://amueller.github.io) [@amuellerml](https://twitter.com/amuellerml) - Columbia University; [Book: Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do) 22 | 23 | --- 24 | 25 | This repository will contain the teaching material and other info associated 26 | with the "Introduction to Machine Learning with scikit-learn" course. 27 | 28 | About the workshop 29 | ------------------ 30 | Machine learning has become an indispensable tool across many areas of research and commercial applications. From text-to-speech for your phone to detecting the Higgs boson, machine learning excels at extracting knowledge from large amounts of data. This workshop will give a general introduction to machine learning, as well as introduce practical tools for you to apply machine learning in your research. We will focus on one particularly important subfield of machine learning, supervised learning. The goal of supervised learning is to "learn" a function that maps inputs x to an output y, by using a collection of training data consisting of input-output pairs.
We will walk through formulating a problem as a supervised machine learning problem, creating the necessary training data, and applying and evaluating a machine learning algorithm. This workshop should give you all the necessary background to start using machine learning yourself. 31 | 32 | Prerequisites 33 | ------------- 34 | This workshop assumes familiarity with Jupyter notebooks and with the basics of pandas, matplotlib and numpy. 35 | 36 | 37 | Obtaining the Tutorial Material 38 | -------------------------------- 39 | 40 | 41 | If you are familiar with git, it is most convenient to clone the GitHub repository. This 42 | is highly encouraged, as it allows you to easily synchronize any changes to the material. 43 | 44 | ``` 45 | git clone https://github.com/amueller/ml-workshop-1-of-4.git 46 | ``` 47 | 48 | If you are not familiar with git, you can download the repository as a .zip file by heading over to the GitHub repository (https://github.com/amueller/ml-workshop-1-of-4) in your browser and clicking the green “Download” button in the upper right. 49 | 50 |  51 | 52 | Please note that I may add to and improve the material until shortly before the tutorial session, and we recommend that you update your copy of the materials one day before the tutorial. If you have a GitHub account and have forked/cloned the repository via GitHub, you can sync your existing fork via the following command: 53 | 54 | ``` 55 | git pull origin master 56 | ``` 57 | 58 | 59 | Installation Notes 60 | ------------------ 61 | 62 | This tutorial will require recent installations of 63 | 64 | - [NumPy](http://www.numpy.org) 65 | - [SciPy](http://www.scipy.org) 66 | - [matplotlib](http://matplotlib.org) 67 | - [pillow](https://python-pillow.org) 68 | - [pandas](http://pandas.pydata.org) 69 | - [scikit-learn](http://scikit-learn.org/stable/) (>=0.22.1) 70 | - [IPython](http://ipython.readthedocs.org/en/stable/) 71 | - [Jupyter Notebook](http://jupyter.org) 72 | 73 | The last one is important: you should be able to type 74 | 75 | jupyter notebook 76 | 77 | in your terminal window and see the notebook panel load in your web browser. 78 | Try opening and running a notebook from the material to check that it works. 79 | 80 | For users who do not yet have these packages installed, a relatively 81 | painless way to install all the requirements is to use a Python distribution 82 | such as [Anaconda](https://www.continuum.io/downloads), which includes 83 | the most relevant Python packages for science, math, engineering, and 84 | data analysis; Anaconda can be downloaded and installed for free, 85 | including for commercial use and redistribution. 86 | The code examples in this tutorial require Python 3.5 or later. 87 | 88 | After obtaining the material, we **strongly recommend** that you open and execute 89 | the Jupyter notebook `check_env.ipynb` that is located at the 90 | top level of this repository. You can open the notebook 91 | by executing 92 | 93 | ```bash 94 | jupyter notebook check_env.ipynb 95 | ``` 96 | 97 | inside the repository.
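If you prefer to run the check non-interactively, you can also execute the notebook from the command line with nbconvert, which ships with Jupyter. This is a sketch using standard Jupyter tooling; the output filename is only an example:

```bash
# Execute the environment check headlessly; the executed copy is written to
# check_env_run.ipynb, whose output cells then contain the [ OK ]/[FAIL] lines.
jupyter nbconvert --to notebook --execute check_env.ipynb --output check_env_run.ipynb
```
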
Inside the notebook, you can run the code cell by 98 | clicking on the "Run Cells" button as illustrated in the figure below: 99 | 100 |  101 | 102 | 103 | Finally, if your environment satisfies the requirements for the tutorials, the executed code cell will produce an output message as shown below: 104 | 105 |  106 | -------------------------------------------------------------------------------- /check_env.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from distutils.version import LooseVersion as Version\n", 10 | "import sys\n", 11 | "\n", 12 | "\n", 13 | "OK = '\x1b[42m[ OK ]\x1b[0m'\n", 14 | "FAIL = \"\x1b[41m[FAIL]\x1b[0m\"\n", 15 | "\n", 16 | "try:\n", 17 | " import importlib\n", 18 | "except ImportError:\n", 19 | " print(FAIL, \"Python version 3.5 is required,\"\n", 20 | " \" but %s is installed.\" % sys.version)\n", 21 | "\n", 22 | " \n", 23 | "def import_version(pkg, min_ver, fail_msg=\"\"):\n", 24 | " mod = None\n", 25 | " try:\n", 26 | " mod = importlib.import_module(pkg)\n", 27 | " ver = mod.__version__\n", 28 | " if Version(ver) < min_ver:\n", 29 | " print(FAIL, \"%s version %s or higher required, but %s installed.\"\n", 30 | " % (pkg, min_ver, ver))\n", 31 | " else:\n", 32 | " print(OK, '%s version %s' % (pkg, ver))\n", 33 | " except ImportError:\n", 34 | " print(FAIL, '%s not installed. %s' % (pkg, fail_msg))\n", 35 | " return mod\n", 36 | "\n", 37 | "\n", 38 | "# first check the python version\n", 39 | "print('Using python in', sys.prefix)\n", 40 | "print(sys.version)\n", 41 | "pyversion = Version(sys.version)\n", 42 | "if pyversion < \"3.5\":\n", 43 | " print(FAIL, \"Python version 3.5 is required,\"\n", 44 | " \" but %s is installed.\" % sys.version)\n", 45 | "print()\n", 46 | "requirements = {'numpy': \"1.6.1\", 'scipy': \"1.0\", 'matplotlib': \"2.0\",\n", 47 | " 'IPython': \"3.0\", 'sklearn': \"0.22.1\", 'pandas': \"0.18\"}\n", 48 | "\n", 49 | "# now the dependencies\n", 50 | "for lib, required_version in list(requirements.items()):\n", 51 | " import_version(lib, required_version)" 52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "anaconda-cloud": {}, 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.7.3" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 4 77 | } 78 | -------------------------------------------------------------------------------- /images/check_env-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-1-of-4/917850cf6e751c36edb6d9692c48878b42c7f263/images/check_env-1.png -------------------------------------------------------------------------------- /images/check_env-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-1-of-4/917850cf6e751c36edb6d9692c48878b42c7f263/images/check_env-2.png -------------------------------------------------------------------------------- /images/download-repo.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/ml-workshop-1-of-4/917850cf6e751c36edb6d9692c48878b42c7f263/images/download-repo.png -------------------------------------------------------------------------------- /notebooks/01 - Data Loading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Loading" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Get some data to play with" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from sklearn.datasets import fetch_openml\n", 24 | "blood = fetch_openml('blood-transfusion-service-center')\n", 25 | "print(blood.DESCR)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "blood.data.shape" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "blood.data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import pandas as pd\n", 53 | "X = pd.DataFrame(blood.data, columns=['recency', 'frequency', 'total_amount', 'since_first'])" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "blood.target.shape" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "blood.target" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "y = pd.Series(blood.target)\n", 81 | "y.value_counts()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import matplotlib.pyplot as plt\n", 91 | "pd.plotting.scatter_matrix(X, c=y=='2', cmap='Paired', figsize=(10, 10));" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "Split the data to get going" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "from sklearn.model_selection import train_test_split\n", 115 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "X.shape" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "X_train.shape" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "X_test.shape" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Exercises\n", 150 | "\n", 151 | "## Exercise 1\n", 152 | "\n", 153 | "Load the iris dataset
from the ``sklearn.datasets`` module using the ``load_iris`` function.\n", 154 | "The function returns a dictionary-like object that has the same attributes as ``blood``.\n", 155 | "\n", 156 | "What is the number of classes, features and data points in this dataset?\n", 157 | "Use a scatterplot to visualize the dataset.\n", 158 | "\n", 159 | "You can look at the ``DESCR`` attribute to learn more about the dataset.\n", 160 | "``print(iris.DESCR)``\n", 161 | "\n", 162 | "Split the data into training and test set.\n", 163 | "\n", 164 | "## Exercise 2\n", 165 | "\n", 166 | "Usually, data doesn't come in such a nice format. You can find the csv file that contains the iris dataset at the following path:\n", 167 | "\n", 168 | "```python\n", 169 | "import sklearn.datasets\n", 170 | "import os\n", 171 | "iris_path = os.path.join(sklearn.datasets.__path__[0], 'data', 'iris.csv')\n", 172 | "```\n", 173 | "Load the data from there using the pandas ``pd.read_csv`` function and bring it into the same format as before, with the data in a variable ``X`` and the labels in a variable ``y``. The first few lines of the ``iris.csv`` file look like:\n", 174 | "\n", 175 | "```\n", 176 | "150,4,setosa,versicolor,virginica\n", 177 | "5.1,3.5,1.4,0.2,0\n", 178 | "4.9,3.0,1.4,0.2,0\n", 179 | "4.7,3.2,1.3,0.2,0\n", 180 | "4.6,3.1,1.5,0.2,0\n", 181 | "```" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# http://github.com/amueller/ml-workshop-1-of-4" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# %load solutions/load_iris.py" 198 | ] 199 | } 200 | ], 201 | "metadata": { 202 | "anaconda-cloud": {}, 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.7.3" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 4 223 | } 224 | -------------------------------------------------------------------------------- /notebooks/02 - Supervised Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Scikit-learn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import matplotlib.pyplot as plt\n", 17 | "import numpy as np\n", 18 | "import sklearn\n", 19 | "sklearn.set_config(print_changed_only=True)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from sklearn.datasets import fetch_openml\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "blood = fetch_openml('blood-transfusion-service-center')\n", 31 | "\n", 32 | "X_train, X_test, y_train, y_test = train_test_split(\n", 33 | " blood.data, blood.target, random_state=0)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "X_train.shape" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs":
[], 50 | "source": [ 51 | "import pandas as pd\n", 52 | "pd.Series(y_train).value_counts()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "pd.Series(y_train).value_counts(normalize=True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Really Simple API\n", 69 | "-------------------\n", 70 | "0) Import your model class" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from sklearn.svm import LinearSVC" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "1) Instantiate an object and set the parameters" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "svm = LinearSVC()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "2) Fit the model" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "svm.fit(X_train, y_train)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "3) Apply / evaluate" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "print(svm.predict(X_train))\n", 128 | "print(y_train)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "svm.score(X_train, y_train)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "svm.score(X_test, y_test)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "And again\n", 154 | "---------" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "from sklearn.ensemble import RandomForestClassifier" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "rf = RandomForestClassifier()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "rf.fit(X_train, y_train)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "rf.score(X_train, y_train)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "rf.score(X_test, y_test)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "# Materials: https://github.com/amueller/ml-workshop-1-of-4" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "# Exercises\n", 214 | "\n", 215 | "## Exercise 1\n", 216 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n", 217 | "\n", 218 | "Split it into training and test set using ``train_test_split``.\n", 219 | "\n", 220 | "## Exercise 2\n", 221 | "Then train and
evaluate ``sklearn.neighbors.KNeighborsClassifier``, the RandomForestClassifier and ``sklearn.linear_model.LogisticRegression`` on the iris dataset.\n", 222 | "How do these perform on the training set vs the test set? Which one is the best on the training set, which one is the best on the test set?\n", 223 | "\n", 224 | "## Exercise 3 (extra)\n", 225 | "Can you construct a binary classification dataset (using np.random for example) on which ``sklearn.linear_model.LogisticRegression`` achieves an accuracy of 1? Can you construct a binary classification dataset on which it achieves accuracy 0.5?" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# %load solutions/train_iris.py" 235 | ] 236 | } 237 | ], 238 | "metadata": { 239 | "anaconda-cloud": {}, 240 | "kernelspec": { 241 | "display_name": "Python 3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.7.3" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 4 260 | } 261 | -------------------------------------------------------------------------------- /notebooks/03 - Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Preprocessing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import sklearn\n", 19 | "sklearn.set_config(print_changed_only=True)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from sklearn.datasets import load_boston\n", 29 | "boston = load_boston()\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "X, y = boston.data, boston.target\n", 32 | "X_train, X_test, y_train, y_test = train_test_split(\n", 33 | " X, y, random_state=0)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "print(boston.DESCR)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "fig, axes = plt.subplots(3, 5, figsize=(20, 10))\n", 52 | "for i, ax in enumerate(axes.ravel()):\n", 53 | " if i > 12:\n", 54 | " ax.set_visible(False)\n", 55 | " continue\n", 56 | " ax.plot(X[:, i], y, 'o', alpha=.5)\n", 57 | " ax.set_title(\"{}: {}\".format(i, boston.feature_names[i]))\n", 58 | " ax.set_ylabel(\"MEDV\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "plt.boxplot(X)\n", 68 | "plt.xticks(np.arange(1, X.shape[1] + 1),\n", 69 | " boston.feature_names, rotation=30, ha=\"right\");" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from sklearn.preprocessing import StandardScaler\n", 79 | "scaler = StandardScaler()\n", 80 | "X_train_scaled = scaler.fit_transform(X_train)" 81 | ] 
82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "plt.boxplot(X_train_scaled)\n", 90 | "plt.xticks(np.arange(1, X.shape[1] + 1),\n", 91 | " boston.feature_names, rotation=30, ha=\"right\");" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.neighbors import KNeighborsRegressor\n", 101 | "knr = KNeighborsRegressor().fit(X_train, y_train)\n", 102 | "knr.score(X_train, y_train)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "knr.score(X_test, y_test)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "knr_scaled = KNeighborsRegressor()\n", 121 | "knr_scaled.fit(X_train_scaled, y_train)\n", 122 | "knr_scaled.score(X_train_scaled, y_train)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "X_test_scaled = scaler.transform(X_test)\n", 132 | "knr_scaled.score(X_test_scaled, y_test)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.ensemble import RandomForestRegressor\n", 142 | "rf = RandomForestRegressor(random_state=0)\n", 143 | "rf.fit(X_train, y_train)\n", 144 | "rf.score(X_test, y_test)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "rf_scaled = RandomForestRegressor(random_state=0)\n", 154 | "rf_scaled.fit(X_train_scaled, y_train)\n", 155 | "rf_scaled.score(X_test_scaled, y_test)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "# Categorical Variables" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "import pandas as pd\n", 172 | "df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],\n", 173 | " 'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})\n", 174 | "df" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "pd.get_dummies(df)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "from sklearn.compose import make_column_transformer\n", 193 | "from sklearn.preprocessing import OneHotEncoder\n", 194 | "categorical = df.dtypes == object\n", 195 | "categorical" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "~categorical" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "ct = make_column_transformer((OneHotEncoder(), categorical),\n", 214 | " (StandardScaler(), ~categorical))\n", 215 | "ct.fit_transform(df)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "ct =
make_column_transformer((OneHotEncoder(sparse=False), categorical))\n", 225 | "ct.fit_transform(df)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "ct = make_column_transformer((OneHotEncoder(), categorical),\n", 235 | " remainder='passthrough')\n", 236 | "ct.fit_transform(df)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "ct = make_column_transformer((OneHotEncoder(), categorical),\n", 246 | " remainder=StandardScaler())\n", 247 | "ct.fit_transform(df)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "# Exercises\n", 255 | "\n", 256 | "## Exercise 1\n", 257 | "Load the \"adult\" dataset, consisting of income data from the census, including information on whether someone has a salary of less than \\$50k or more. Look at the data using the ``head`` method. Our final goal in Exercise 4 will be to classify entries into those making less than \\$50k and those making more.\n", 258 | "\n", 259 | "## Exercise 2\n", 260 | "Experiment with visualizing the data. Can you find out which features influence the income the most?\n", 261 | "\n", 262 | "## Exercise 3\n", 263 | "Separate the target variable from the features.\n", 264 | "Split the data into training and test set.\n", 265 | "Apply dummy encoding and scaling.\n", 266 | "How did this change the number of variables?\n", 267 | "\n", 268 | "## Exercise 4\n", 269 | "Build and evaluate a LogisticRegression model on the data.\n", 270 | "\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "data = pd.read_csv(\"data/adult.csv\", index_col=0)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "# %load solutions/load_adult.py" 289 | ] 290 | } 291 | ], 292 | "metadata": { 293 | "anaconda-cloud": {}, 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.7.3" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 4 314 | } 315 | -------------------------------------------------------------------------------- /notebooks/data/ram_price.csv: -------------------------------------------------------------------------------- 1 | ,date,price 2 | 0,1957.0,411041792.0 3 | 1,1959.0,67947725.0 4 | 2,1960.0,5242880.0 5 | 3,1965.0,2642412.0 6 | 4,1970.0,734003.0 7 | 5,1973.0,399360.0 8 | 6,1974.0,314573.0 9 | 7,1975.0,421888.0 10 | 8,1975.08,180224.0 11 | 9,1975.25,67584.0 12 | 10,1975.75,49920.0 13 | 11,1976.0,40704.0 14 | 12,1976.17,48960.0 15 | 13,1976.42,23040.0 16 | 14,1976.58,32000.0 17 | 15,1977.08,36800.0 18 | 16,1978.17,28000.0 19 | 17,1978.25,29440.0 20 | 18,1978.33,19200.0 21 | 19,1978.5,24000.0 22 | 20,1978.58,16000.0 23 | 21,1978.75,15200.0 24 | 22,1979.0,10528.0 25 | 23,1979.75,6704.0 26 | 24,1980.0,6480.0 27 | 25,1981.0,8800.0 28 | 26,1981.58,4479.0 29 | 27,1982.0,3520.0 30 | 28,1982.17,4464.0 31 | 29,1982.67,1980.0 32 | 30,1983.0,2396.0 33 |
31,1983.67,1980.0 34 | 32,1984.0,1379.0 35 | 33,1984.58,1331.0 36 | 34,1985.0,880.0 37 | 35,1985.33,720.0 38 | 36,1985.42,550.0 39 | 37,1985.5,420.0 40 | 38,1985.58,350.0 41 | 39,1985.67,300.0 42 | 40,1985.83,300.0 43 | 41,1985.92,300.0 44 | 42,1986.0,300.0 45 | 43,1986.08,300.0 46 | 44,1986.17,300.0 47 | 45,1986.25,300.0 48 | 46,1986.33,190.0 49 | 47,1986.42,190.0 50 | 48,1986.5,190.0 51 | 49,1986.58,190.0 52 | 50,1986.67,190.0 53 | 51,1986.75,190.0 54 | 52,1986.92,190.0 55 | 53,1987.0,176.0 56 | 54,1987.08,176.0 57 | 55,1987.17,157.0 58 | 56,1987.25,154.0 59 | 57,1987.33,154.0 60 | 58,1987.42,154.0 61 | 59,1987.5,154.0 62 | 60,1987.58,154.0 63 | 61,1987.67,163.0 64 | 62,1987.75,133.0 65 | 63,1987.83,163.0 66 | 64,1987.92,163.0 67 | 65,1988.0,163.0 68 | 66,1988.08,182.0 69 | 67,1988.17,199.0 70 | 68,1988.33,199.0 71 | 69,1988.42,199.0 72 | 70,1988.5,505.0 73 | 71,1988.58,505.0 74 | 72,1988.67,505.0 75 | 73,1988.75,505.0 76 | 74,1988.83,505.0 77 | 75,1988.92,505.0 78 | 76,1989.0,505.0 79 | 77,1989.08,505.0 80 | 78,1989.17,505.0 81 | 79,1989.25,505.0 82 | 80,1989.42,344.0 83 | 81,1989.5,197.0 84 | 82,1989.58,188.0 85 | 83,1989.67,188.0 86 | 84,1989.75,128.0 87 | 85,1989.83,117.0 88 | 86,1989.92,113.0 89 | 87,1990.0,106.0 90 | 88,1990.17,98.3 91 | 89,1990.33,98.3 92 | 90,1990.42,89.5 93 | 91,1990.5,82.8 94 | 92,1990.58,81.1 95 | 93,1990.67,71.5 96 | 94,1990.75,59.0 97 | 95,1990.83,51.0 98 | 96,1990.92,45.5 99 | 97,1991.0,44.5 100 | 98,1991.08,44.5 101 | 99,1991.17,45.0 102 | 100,1991.25,45.0 103 | 101,1991.33,45.0 104 | 102,1991.42,43.8 105 | 103,1991.5,43.8 106 | 104,1991.58,41.3 107 | 105,1991.67,46.3 108 | 106,1991.75,45.0 109 | 107,1991.83,39.8 110 | 108,1991.92,39.8 111 | 109,1992.0,36.3 112 | 110,1992.08,36.3 113 | 111,1992.17,36.3 114 | 112,1992.25,34.8 115 | 113,1992.33,30.0 116 | 114,1992.42,32.5 117 | 115,1992.5,33.5 118 | 116,1992.58,31.0 119 | 117,1992.67,27.5 120 | 118,1992.75,26.3 121 | 119,1992.83,26.3 122 | 120,1992.92,26.3 123 | 121,1993.0,33.1 124 | 122,1993.08,27.5 125 | 123,1993.17,27.5 126 | 124,1993.25,27.5 127 | 125,1993.33,27.5 128 | 126,1993.42,30.0 129 | 127,1993.5,30.0 130 | 128,1993.58,30.0 131 | 129,1993.67,30.0 132 | 130,1993.75,36.0 133 | 131,1993.83,39.8 134 | 132,1993.92,35.8 135 | 133,1994.0,35.8 136 | 134,1994.08,35.8 137 | 135,1994.17,36.0 138 | 136,1994.25,37.3 139 | 137,1994.33,37.3 140 | 138,1994.42,37.3 141 | 139,1994.5,38.5 142 | 140,1994.58,37.0 143 | 141,1994.67,34.0 144 | 142,1994.75,33.5 145 | 143,1994.83,32.3 146 | 144,1994.92,32.3 147 | 145,1995.0,32.3 148 | 146,1995.08,32.0 149 | 147,1995.17,32.0 150 | 148,1995.25,31.2 151 | 149,1995.33,31.2 152 | 150,1995.42,31.1 153 | 151,1995.5,31.2 154 | 152,1995.58,30.6 155 | 153,1995.67,33.1 156 | 154,1995.75,33.1 157 | 155,1995.83,30.9 158 | 156,1995.92,30.9 159 | 157,1996.0,29.9 160 | 158,1996.08,28.8 161 | 159,1996.17,26.1 162 | 160,1996.25,24.7 163 | 161,1996.33,17.2 164 | 162,1996.42,14.9 165 | 163,1996.5,11.3 166 | 164,1996.58,9.06 167 | 165,1996.67,8.44 168 | 166,1996.75,8.0 169 | 167,1996.83,5.25 170 | 168,1996.92,5.25 171 | 169,1997.0,4.63 172 | 170,1997.08,3.63 173 | 171,1997.17,3.0 174 | 172,1997.25,3.0 175 | 173,1997.33,3.0 176 | 174,1997.42,3.69 177 | 175,1997.5,4.0 178 | 176,1997.58,4.13 179 | 177,1997.67,3.63 180 | 178,1997.75,3.41 181 | 179,1997.83,3.25 182 | 180,1997.92,2.16 183 | 181,1998.0,2.16 184 | 182,1998.08,0.91 185 | 183,1998.17,0.97 186 | 184,1998.25,1.22 187 | 185,1998.33,1.19 188 | 186,1998.42,0.97 189 | 187,1998.58,1.03 190 | 188,1998.67,0.97 191 | 189,1998.75,1.16 192 | 
190,1998.83,0.84 193 | 191,1998.92,0.84 194 | 192,1999.08,1.44 195 | 193,1999.13,0.84 196 | 194,1999.17,1.25 197 | 195,1999.25,1.25 198 | 196,1999.33,0.86 199 | 197,1999.5,0.78 200 | 198,1999.67,0.87 201 | 199,1999.75,1.04 202 | 200,1999.83,1.34 203 | 201,1999.92,2.35 204 | 202,2000.0,1.56 205 | 203,2000.08,1.48 206 | 204,2000.17,1.08 207 | 205,2000.25,0.84 208 | 206,2000.33,0.7 209 | 207,2000.42,0.9 210 | 208,2000.5,0.77 211 | 209,2000.58,0.84 212 | 210,2000.67,1.07 213 | 211,2000.75,1.12 214 | 212,2000.83,1.12 215 | 213,2000.92,0.9 216 | 214,2001.0,0.75 217 | 215,2001.08,0.464 218 | 216,2001.17,0.464 219 | 217,2001.25,0.383 220 | 218,2001.33,0.387 221 | 219,2001.42,0.305 222 | 220,2001.5,0.352 223 | 221,2001.5,0.27 224 | 222,2001.58,0.191 225 | 223,2001.67,0.191 226 | 224,2001.75,0.169 227 | 225,2001.77,0.148 228 | 226,2002.08,0.134 229 | 227,2002.08,0.207 230 | 228,2002.25,0.193 231 | 229,2002.33,0.193 232 | 230,2002.42,0.33 233 | 231,2002.58,0.193 234 | 232,2002.75,0.193 235 | 233,2003.17,0.176 236 | 234,2003.25,0.076 237 | 235,2003.33,0.126 238 | 236,2003.42,0.115 239 | 237,2003.5,0.133 240 | 238,2003.58,0.129 241 | 239,2003.67,0.143 242 | 240,2003.75,0.148 243 | 241,2003.83,0.16 244 | 242,2003.99,0.166 245 | 243,2004.0,0.174 246 | 244,2004.08,0.148 247 | 245,2004.17,0.146 248 | 246,2004.33,0.156 249 | 247,2004.42,0.203 250 | 248,2004.5,0.176 251 | 249,2005.25,0.185 252 | 250,2005.42,0.149 253 | 251,2005.83,0.116 254 | 252,2005.92,0.185 255 | 253,2006.17,0.112 256 | 254,2006.33,0.073 257 | 255,2006.5,0.082 258 | 256,2006.67,0.073 259 | 257,2006.75,0.088 260 | 258,2006.83,0.098 261 | 259,2006.99,0.092 262 | 260,2007.0,0.082 263 | 261,2007.08,0.078 264 | 262,2007.17,0.066 265 | 263,2007.33,0.0464 266 | 264,2007.5,0.0386 267 | 265,2007.67,0.0351 268 | 266,2007.75,0.0322 269 | 267,2007.83,0.0244 270 | 268,2007.92,0.0244 271 | 269,2008.0,0.0232 272 | 270,2008.08,0.022 273 | 271,2008.33,0.022 274 | 272,2008.5,0.0207 275 | 273,2008.58,0.0176 276 | 274,2008.67,0.0146 277 | 275,2008.83,0.011 278 | 276,2008.92,0.0098 279 | 277,2009.0,0.0098 280 | 278,2009.08,0.0107 281 | 279,2009.25,0.0105 282 | 280,2009.42,0.0115 283 | 281,2009.5,0.011 284 | 282,2009.58,0.0127 285 | 283,2009.75,0.0183 286 | 284,2009.92,0.0205 287 | 285,2010.0,0.019 288 | 286,2010.08,0.0202 289 | 287,2010.17,0.0195 290 | 288,2010.33,0.0242 291 | 289,2010.5,0.021 292 | 290,2010.58,0.022 293 | 291,2010.75,0.0171 294 | 292,2010.83,0.0146 295 | 293,2010.92,0.0122 296 | 294,2011.0,0.01 297 | 295,2011.08,0.0103 298 | 296,2011.33,0.01 299 | 297,2011.42,0.0085 300 | 298,2011.67,0.0054 301 | 299,2011.75,0.0051 302 | 300,2012.0,0.0049 303 | 301,2012.08,0.0049 304 | 302,2012.25,0.005 305 | 303,2012.33,0.0049 306 | 304,2012.58,0.0048 307 | 305,2012.67,0.004 308 | 306,2012.83,0.0037 309 | 307,2013.0,0.0043 310 | 308,2013.08,0.0054 311 | 309,2013.33,0.0067 312 | 310,2013.42,0.0061 313 | 311,2013.58,0.0073 314 | 312,2013.67,0.0065 315 | 313,2013.75,0.0082 316 | 314,2013.83,0.0085 317 | 315,2013.92,0.0079 318 | 316,2014.08,0.0095 319 | 317,2014.17,0.0079 320 | 318,2014.25,0.0073 321 | 319,2014.42,0.0079 322 | 320,2014.58,0.0085 323 | 321,2014.67,0.0085 324 | 322,2014.83,0.0085 325 | 323,2015.0,0.0078 326 | 324,2015.08,0.0073 327 | 325,2015.25,0.0061 328 | 326,2015.33,0.0056 329 | 327,2015.5,0.0049 330 | 328,2015.58,0.0045 331 | 329,2015.67,0.0043 332 | 330,2015.75,0.0042 333 | 331,2015.83,0.0038 334 | 332,2015.92,0.0037 335 | -------------------------------------------------------------------------------- /notebooks/solutions/load_adult.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import train_test_split 4 | import numpy as np 5 | from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler 6 | from sklearn.compose import make_column_transformer 7 | # display lets us do multiple nice renderings of dataframes in Jupyter 8 | from IPython.display import display 9 | 10 | # Exercise 1 11 | data = pd.read_csv("data/adult.csv", index_col=0) 12 | display(data.head()) 13 | 14 | income = data.income 15 | data_features = data.drop("income", axis=1) 16 | 17 | display(data_features.head()) 18 | 19 | # Exercise 2 20 | 21 | data.age.hist() 22 | 23 | # plot by gender 24 | data['income_bin'] = data.income == " >50K" 25 | plt.figure() 26 | plt.title("By gender") 27 | grouped = data.groupby("gender") 28 | grouped.income_bin.mean().plot.barh() 29 | 30 | # plot by education 31 | plt.figure() 32 | plt.title("By education") 33 | data.groupby("education").income_bin.mean().sort_values().plot.barh() 34 | 35 | plt.figure() 36 | plt.title("By race") 37 | data.groupby("race").income_bin.mean().sort_values().plot.barh() 38 | 39 | 40 | # Exercise 3 41 | # using pd.get_dummies 42 | data_one_hot = pd.get_dummies(data_features) 43 | X_train, X_test, y_train, y_test = train_test_split(data_one_hot, income) 44 | 45 | scaler = MinMaxScaler().fit(X_train) 46 | X_train_scaled = scaler.transform(X_train) 47 | X_test_scaled = scaler.transform(X_test) 48 | 49 | # using OneHotEncoder 50 | cont_features = data_features.dtypes == "int64" 51 | ct = make_column_transformer((OneHotEncoder(), ~cont_features), 52 |  (StandardScaler(), cont_features)) 53 | X_train, X_test, y_train, y_test = train_test_split(data_features, income) 54 | X_train_scaled = ct.fit_transform(X_train) 55 | X_test_scaled = ct.transform(X_test) 56 | 57 | 58 | # Exercise 4 59 | from sklearn.linear_model import LogisticRegression 60 | logreg = LogisticRegression(C=0.1) 61 | logreg.fit(X_train_scaled, y_train) 62 | print("Training score:", logreg.score(X_train_scaled, y_train)) 63 | 64 | print("Test score:", logreg.score(X_test_scaled, y_test)) 65 | 66 | print("Fraction <=50K:", (y_train.values == " <=50K").mean()) 67 | -------------------------------------------------------------------------------- /notebooks/solutions/load_iris.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from IPython.display import display 4 | from sklearn.datasets import load_iris 5 | from sklearn.model_selection import train_test_split 6 | 7 | iris = load_iris() 8 | X, y = iris.data, iris.target 9 | 10 | print("Dataset size: %d number of features: %d number of classes: %d" 11 |  % (X.shape[0], X.shape[1], len(np.unique(y)))) 12 | 13 | X_train, X_test, y_train, y_test = train_test_split(X, y) 14 | 15 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train) 16 | plt.xlabel(iris.feature_names[0]) 17 | plt.ylabel(iris.feature_names[1]) 18 | 19 | plt.figure() 20 | plt.scatter(X_train[:, 2], X_train[:, 3], c=y_train) 21 | plt.xlabel(iris.feature_names[2]) 22 | plt.ylabel(iris.feature_names[3]) 23 | 24 | import sklearn.datasets 25 | import os 26 | import pandas as pd 27 | iris_path = os.path.join(sklearn.datasets.__path__[0], 'data', 'iris.csv') 28 | iris_df = pd.read_csv(iris_path, header=None) 29 | display(iris_df.head()) 30 | 31 | iris_df = pd.read_csv(iris_path, skiprows=1, header=None) 32 | display(iris_df.head()) 33 | 34 | features = iris_df.iloc[:, :4] 35 | target = iris_df.iloc[:, 4]
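36 | 
37 | # A sketch added beyond the original solution: bring the CSV-loaded data into
38 | # the same X / y form as the sklearn loader and finish with a train/test split.
39 | # The *_csv variable names here are illustrative, not part of the original file.
40 | X_csv, y_csv = features.values, target.values
41 | X_train_csv, X_test_csv, y_train_csv, y_test_csv = train_test_split(X_csv, y_csv)
42 | print("CSV route: %d training and %d test samples"
43 |  % (X_train_csv.shape[0], X_test_csv.shape[0]))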
-------------------------------------------------------------------------------- /notebooks/solutions/train_iris.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import load_iris 3 | from sklearn.ensemble import RandomForestClassifier 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.model_selection import train_test_split 6 | # Exercise 1, loading data 7 | iris = load_iris() 8 | X, y = iris.data, iris.target 9 | 10 | X_train, X_test, y_train, y_test = train_test_split(X, y) 11 | 12 | # Exercise 2 13 | # Training KNN 14 | knn = KNeighborsClassifier(n_neighbors=3) 15 | knn.fit(X_train, y_train) 16 | 17 | print("test set score of knn: %f" % knn.score(X_test, y_test)) 18 | 19 | # Training RandomForest 20 | rf = RandomForestClassifier() 21 | rf.fit(X_train, y_train) 22 | print("training set score of random forest: %f" % rf.score(X_train, y_train)) 23 | print("test set score of random forest: %f" % rf.score(X_test, y_test)) 24 | 25 | # Exercise 3 26 | 27 | # Perfect classification (accuracy=1) on an easy, well-separated dataset 28 | from sklearn.linear_model import LogisticRegression 29 | X = np.random.uniform(size=(1000, 3)) 30 | X[::2] += 1000 31 | y = X[:, 0] > 500 32 | X_train, X_test, y_train, y_test = train_test_split(X, y) 33 | logreg = LogisticRegression() 34 | logreg.fit(X_train, y_train) 35 | print("score on trivial data: ", logreg.score(X_test, y_test)) 36 | 37 | # Random classification (accuracy around .5): labels independent of the data 38 | y = np.random.normal(size=1000) > 0 39 | X_train, X_test, y_train, y_test = train_test_split(X, y) 40 | logreg = LogisticRegression() 41 | logreg.fit(X_train, y_train) 42 | print("score on random data: ", logreg.score(X_test, y_test)) 43 | -------------------------------------------------------------------------------- /slides/01-introduction.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |