├── README.md
├── Chapter 1 - Machine Learning.ipynb
├── Chapter 7 - Writing Fast Code.ipynb
├── Chapter 3 - Model Selection; Overfitting and Generalization.ipynb
├── Chapter 2.5 - Data visualization .ipynb
├── Chapter 4 - Cross-validation.ipynb
├── Chapter 0 - Hello World.ipynb
├── Chapter 5 - Model Selection And Pipelines.ipynb
├── Chapter 6 - Working With Text Data..ipynb
└── Chapter 2 - Introduction to Scikit-learn.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | scikit-learn-interactive-tutorial
2 | =================================
3 | 
4 | IPython notebooks and data for a scikit-learn tutorial.
5 | 
6 | You can view the notebooks online [on nbviewer](http://nbviewer.ipython.org/github/amueller/scikit-learn-interactive-tutorial/tree/master/)
7 | 


--------------------------------------------------------------------------------
/Chapter 1 - Machine Learning.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "metadata": {
 3 |   "name": ""
 4 |  },
 5 |  "nbformat": 3,
 6 |  "nbformat_minor": 0,
 7 |  "worksheets": [
 8 |   {
 9 |    "cells": [
10 |     {
11 |      "cell_type": "markdown",
12 |      "metadata": {},
13 |      "source": [
14 |       "What is Machine Learning?\n",
15 |       "==========================\n",
16 |       "* Data + Algorithm -> predictive program.\n",
17 |       "* Learn from past data, predict quantities on new, unseen data."
18 |      ]
19 |     },
20 |     {
21 |      "cell_type": "markdown",
22 |      "metadata": {},
23 |      "source": [
24 |       "Three kinds of Learning\n",
25 |       "========================\n",
26 |       "* Supervised\n",
27 |       "* Unsupervised\n",
28 |       "* Reinforcement\n"
29 |      ]
30 |     },
31 |     {
32 |      "cell_type": "markdown",
33 |      "metadata": {},
34 |      "source": [
35 |       "Supervised learning\n",
36 |       "====================\n",
37 |       "\n",
38 |       "Training: Examples X_train together with labels y_train.\n",
39 |       "\n",
40 |       "Testing: Given X_test, predict y_test.\n",
41 |       "\n",
42 |       "Examples\n",
43 |       "---------\n",
44 |       "\n",
45 |       "* Classification (spam, sentiment analysis, ...)\n",
46 |       "* Regression (stocks, sales, ...)\n",
47 |       "* Ranking (retrieval, search, ...)"
48 |      ]
49 |     },
50 |     {
51 |      "cell_type": "markdown",
52 |      "metadata": {},
53 |      "source": [
54 |       "Unsupervised Learning\n",
55 |       "=====================\n",
56 |       "\n",
57 |       "Examples X.\n",
58 |       "Learn something about X.\n",
59 |       "\n",
60 |       "Examples\n",
61 |       "--------\n",
62 |       "* Dimensionality reduction\n",
63 |       "* Clustering\n",
64 |       "* Manifold learning\n"
65 |      ]
66 |     },
67 |     {
68 |      "cell_type": "markdown",
69 |      "metadata": {},
70 |      "source": [
71 |       "Reinforcement Learning\n",
72 |       "========================\n",
73 |       "* Learn from experience (not covered)\n",
74 |       "\n",
75 |       "Examples\n",
76 |       "---------\n",
77 |       "* A robot learning to walk.\n",
78 |       "* A car learning to park itself."
79 |      ]
80 |     }
81 |    ],
82 |    "metadata": {}
83 |   }
84 |  ]
85 | }


--------------------------------------------------------------------------------
/Chapter 7 - Writing Fast Code.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "markdown",
 12 |      "metadata": {},
 13 |      "source": [
 14 |       "Vectorize\n",
 15 |       "============"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "code",
 20 |      "collapsed": false,
 21 |      "input": [
 22 |       "import numpy as np\n",
 23 |       "a = np.random.uniform(size=10000)\n",
 24 |       "b = np.random.uniform(size=10000)\n",
 25 |       "%time c = a + b\n"
 26 |      ],
 27 |      "language": "python",
 28 |      "metadata": {},
 29 |      "outputs": []
 30 |     },
 31 |     {
 32 |      "cell_type": "code",
 33 |      "collapsed": false,
 34 |      "input": [
 35 |       "def add_vectors(first, second):\n",
 36 |       "    a = np.empty(first.shape)  # pre-allocate the result array\n",
 37 |       "    for i in xrange(len(first)):  # deliberately slow element-wise Python loop\n",
 38 |       "        a[i] = first[i] + second[i]\n",
 39 |       "    return a"
 40 |      ],
 41 |      "language": "python",
 42 |      "metadata": {},
 43 |      "outputs": []
 44 |     },
 45 |     {
 46 |      "cell_type": "code",
 47 |      "collapsed": false,
 48 |      "input": [
 49 |       "%time c = add_vectors(a, b)"
 50 |      ],
 51 |      "language": "python",
 52 |      "metadata": {},
 53 |      "outputs": []
 54 |     },
 55 |     {
 56 |      "cell_type": "markdown",
 57 |      "metadata": {},
 58 |      "source": [
 59 |       "Don't write for-loops!\n",
 60 |       "------------------------\n",
 61 |       "\n",
 62 |       "If you can't avoid it (e.g. when writing custom algorithms):\n\n",
 63 |       "Use Cython\n",
 64 |       "==========="
 65 |      ]
 66 |     },
 67 |     {
 68 |      "cell_type": "code",
 69 |      "collapsed": false,
 70 |      "input": [
 71 |       "%load_ext cythonmagic"
 72 |      ],
 73 |      "language": "python",
 74 |      "metadata": {},
 75 |      "outputs": []
 76 |     },
 77 |     {
 78 |      "cell_type": "code",
 79 |      "collapsed": false,
 80 |      "input": [
 81 |       "%%cython\n",
 82 |       "\n",
 83 |       "import numpy as np\n",
 84 |       "cimport numpy as np\n",
 85 |       "\n",
 86 |       "def add_vectors_fast(first, second):\n",
 87 |       "    a = np.empty(first.shape)\n",
 88 |       "    for i in xrange(len(first)):\n",
 89 |       "        a[i] = first[i] + second[i]\n",
 90 |       "    return a\n"
 91 |      ],
 92 |      "language": "python",
 93 |      "metadata": {},
 94 |      "outputs": []
 95 |     },
 96 |     {
 97 |      "cell_type": "code",
 98 |      "collapsed": false,
 99 |      "input": [
100 |       "%time c = add_vectors_fast(a, b)"
101 |      ],
102 |      "language": "python",
103 |      "metadata": {},
104 |      "outputs": []
105 |     },
106 |     {
107 |      "cell_type": "code",
108 |      "collapsed": false,
109 |      "input": [
110 |       "%%cython\n",
111 |       "\n",
112 |       "cimport cython\n",
113 |       "cimport numpy as np\n",
114 |       "\n",
115 |       "#@cython.boundscheck(False)\n",
116 |       "#@cython.wraparound(False)\n",
117 |       "def add_vectors_fast2(double[:] first, double[:] second):\n",
118 |       "    cdef np.ndarray[double, ndim = 1, mode = \"c\"] a = np.ndarray(first.shape[0])\n",
119 |       "    #cdef int i\n",
120 |       "    for i in range(len(first)):\n",
121 |       "        a[i] = first[i] + second[i]\n",
122 |       "    return a\n"
123 |      ],
124 |      "language": "python",
125 |      "metadata": {},
126 |      "outputs": []
127 |     },
128 |     {
129 |      "cell_type": "code",
130 |      "collapsed": false,
131 |      "input": [
132 |       "%time c = add_vectors_fast2(a, b)"
133 |      ],
134 |      "language": "python",
135 |      "metadata": {},
136 |      "outputs": []
137 |     },
138 |     {
139 |      "cell_type": "code",
140 |      "collapsed": false,
141 |      "input": [],
142 |      "language": "python",
143 |      "metadata": {},
144 |      "outputs": []
145 |     }
146 |    ],
147 |    "metadata": {}
148 |   }
149 |  ]
150 | }


--------------------------------------------------------------------------------
/Chapter 3 - Model Selection; Overfitting and Generalization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "markdown",
 12 |      "metadata": {},
 13 |      "source": [
 14 |       "Choosing the right complexity for a model\n",
 15 |       "==============================================="
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "code",
 20 |      "collapsed": false,
 21 |      "input": [
 22 |       "\n",
 23 |       "import numpy as np\n",
 24 |       "import matplotlib.pyplot as plt\n",
 25 |       "\n",
 26 |       "from sklearn.datasets import load_iris\n",
 27 |       "from sklearn.cross_validation import  train_test_split\n",
 28 |       "\n",
 29 |       "\n",
 30 |       "iris = load_iris()\n",
 31 |       "X = iris.data\n",
 32 |       "y = iris.target\n",
 33 |       "\n",
 34 |       "\n",
 35 |       "# dataset for decision function visualization\n",
 36 |       "X_2d = X[:, :2]\n",
 37 |       "X_2d = X_2d[y > 0]\n",
 38 |       "y_2d = y[y > 0]\n",
 39 |       "y_2d -= 1\n",
 40 |       "\n",
 41 |       "X_train, X_test, y_train, y_test = train_test_split(X_2d, y_2d)"
 42 |      ],
 43 |      "language": "python",
 44 |      "metadata": {},
 45 |      "outputs": []
 46 |     },
 47 |     {
 48 |      "cell_type": "code",
 49 |      "collapsed": false,
 50 |      "input": [
 51 |       "%matplotlib inline\n",
 52 |       "plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)"
 53 |      ],
 54 |      "language": "python",
 55 |      "metadata": {},
 56 |      "outputs": []
 57 |     },
 58 |     {
 59 |      "cell_type": "code",
 60 |      "collapsed": false,
 61 |      "input": [
 62 |       "def show_decision_function(clf, ax):\n",
 63 |       "    xx, yy = np.meshgrid(np.linspace(4.5, 8, 200), np.linspace(1.5, 4.0, 200))  # evaluation grid\n",
 64 |       "    try:\n",
 65 |       "        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n",
 66 |       "    except AttributeError:  # classifier has no decision_function\n",
 67 |       "        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]  # P(second class): same orientation as decision_function\n",
 68 |       "\n",
 69 |       "    Z = Z.reshape(xx.shape)\n",
 70 |       "    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.jet)\n",
 71 |       "    ax.set_xlim(4.5, 8)\n",
 72 |       "    ax.set_ylim(1.5, 4.0)\n",
 73 |       "    ax.set_xticks(())\n",
 74 |       "    ax.set_yticks(())\n",
 75 |       "    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)  # overlay training points (notebook globals)"
 76 |      ],
 77 |      "language": "python",
 78 |      "metadata": {},
 79 |      "outputs": []
 80 |     },
 81 |     {
 82 |      "cell_type": "code",
 83 |      "collapsed": false,
 84 |      "input": [
 85 |       "from sklearn.svm import SVC\n",
 86 |       "\n",
 87 |       "training_scores = []\n",
 88 |       "test_scores = []\n",
 89 |       "fig, axes = plt.subplots(2, 3, figsize=(20, 10))\n",
 90 |       "Cs = [0.01, 0.1, 1, 10, 100, 1000]\n",
 91 |       "\n",
 92 |       "for C, ax in zip(Cs, axes.ravel()):\n",
 93 |       "    clf = SVC(gamma=10, C=C)\n",
 94 |       "    clf.fit(X_train, y_train)\n",
 95 |       "    training_scores.append(clf.score(X_train, y_train))\n",
 96 |       "    test_scores.append(clf.score(X_test, y_test))\n",
 97 |       "    show_decision_function(clf, ax)"
 98 |      ],
 99 |      "language": "python",
100 |      "metadata": {},
101 |      "outputs": []
102 |     },
103 |     {
104 |      "cell_type": "code",
105 |      "collapsed": false,
106 |      "input": [
107 |       "plt.figure(figsize=(20, 10))\n",
108 |       "plt.plot(training_scores, label=\"training scores\")\n",
109 |       "plt.plot(test_scores, label=\"test scores\")\n",
110 |       "plt.legend(loc=\"best\")\n",
111 |       "plt.xticks(range(6), Cs)"
112 |      ],
113 |      "language": "python",
114 |      "metadata": {},
115 |      "outputs": []
116 |     },
117 |     {
118 |      "cell_type": "markdown",
119 |      "metadata": {},
120 |      "source": [
121 |       "Tasks\n",
122 |       "======\n",
123 |       "1. Play with the ``n_neighbors`` parameter of ``KNeighborsClassifier`` on the digits dataset. Compare training set and test set performance to see how it is related to complexity."
124 |      ]
125 |     }
126 |    ],
127 |    "metadata": {}
128 |   }
129 |  ]
130 | }


--------------------------------------------------------------------------------
/Chapter 2.5 - Data visualization .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "code",
 12 |      "collapsed": false,
 13 |      "input": [
 14 |       "from sklearn.datasets import load_digits\n",
 15 |       "digits = load_digits()\n",
 16 |       "import matplotlib.pyplot as plt\n",
 17 |       "%matplotlib inline"
 18 |      ],
 19 |      "language": "python",
 20 |      "metadata": {},
 21 |      "outputs": []
 22 |     },
 23 |     {
 24 |      "cell_type": "markdown",
 25 |      "metadata": {},
 26 |      "source": [
 27 |       "Dimensionality reduction and manifold learning\n",
 28 |       "=================================================="
 29 |      ]
 30 |     },
 31 |     {
 32 |      "cell_type": "code",
 33 |      "collapsed": false,
 34 |      "input": [
 35 |       "from sklearn.decomposition import PCA"
 36 |      ],
 37 |      "language": "python",
 38 |      "metadata": {},
 39 |      "outputs": []
 40 |     },
 41 |     {
 42 |      "cell_type": "code",
 43 |      "collapsed": false,
 44 |      "input": [
 45 |       "pca = PCA(n_components=2)"
 46 |      ],
 47 |      "language": "python",
 48 |      "metadata": {},
 49 |      "outputs": []
 50 |     },
 51 |     {
 52 |      "cell_type": "code",
 53 |      "collapsed": false,
 54 |      "input": [
 55 |       "pca.fit(digits.data)"
 56 |      ],
 57 |      "language": "python",
 58 |      "metadata": {},
 59 |      "outputs": []
 60 |     },
 61 |     {
 62 |      "cell_type": "code",
 63 |      "collapsed": false,
 64 |      "input": [
 65 |       "digits_pca = pca.transform(digits.data)"
 66 |      ],
 67 |      "language": "python",
 68 |      "metadata": {},
 69 |      "outputs": []
 70 |     },
 71 |     {
 72 |      "cell_type": "code",
 73 |      "collapsed": false,
 74 |      "input": [
 75 |       "digits_pca.shape"
 76 |      ],
 77 |      "language": "python",
 78 |      "metadata": {},
 79 |      "outputs": []
 80 |     },
 81 |     {
 82 |      "cell_type": "code",
 83 |      "collapsed": false,
 84 |      "input": [
 85 |       "plt.scatter(digits_pca[:, 0], digits_pca[:, 1], c=digits.target)"
 86 |      ],
 87 |      "language": "python",
 88 |      "metadata": {},
 89 |      "outputs": []
 90 |     },
 91 |     {
 92 |      "cell_type": "code",
 93 |      "collapsed": false,
 94 |      "input": [
 95 |       "plt.matshow(pca.mean_.reshape(8, 8))\n",
 96 |       "plt.matshow(pca.components_[0].reshape(8, 8))\n",
 97 |       "plt.matshow(pca.components_[1].reshape(8, 8))"
 98 |      ],
 99 |      "language": "python",
100 |      "metadata": {},
101 |      "outputs": []
102 |     },
103 |     {
104 |      "cell_type": "markdown",
105 |      "metadata": {},
106 |      "source": [
107 |       "Manifold Learning\n",
108 |       "=================="
109 |      ]
110 |     },
111 |     {
112 |      "cell_type": "code",
113 |      "collapsed": false,
114 |      "input": [
115 |       "from sklearn.manifold import SpectralEmbedding"
116 |      ],
117 |      "language": "python",
118 |      "metadata": {},
119 |      "outputs": []
120 |     },
121 |     {
122 |      "cell_type": "code",
123 |      "collapsed": false,
124 |      "input": [
125 |       "se = SpectralEmbedding()\n",
126 |       "digits_se = se.fit_transform(digits.data)"
127 |      ],
128 |      "language": "python",
129 |      "metadata": {},
130 |      "outputs": []
131 |     },
132 |     {
133 |      "cell_type": "code",
134 |      "collapsed": false,
135 |      "input": [
136 |       "plt.scatter(digits_se[:, 0], digits_se[:, 1], c=digits.target)"
137 |      ],
138 |      "language": "python",
139 |      "metadata": {},
140 |      "outputs": []
141 |     },
142 |     {
143 |      "cell_type": "markdown",
144 |      "metadata": {},
145 |      "source": [
146 |       "Tasks\n",
147 |       "======\n",
148 |       "1. Compare the projection to the principal components to projecting to two rows of the original data.\n",
149 |       "2. Play with the ``n_neighbors`` parameter of Spectral Embedding. How does that change the outcome?\n",
150 |       "3. Extract more components from the digits data using PCA. How can you visualize them?"
151 |      ]
152 |     },
153 |     {
154 |      "cell_type": "code",
155 |      "collapsed": false,
156 |      "input": [],
157 |      "language": "python",
158 |      "metadata": {},
159 |      "outputs": []
160 |     }
161 |    ],
162 |    "metadata": {}
163 |   }
164 |  ]
165 | }


--------------------------------------------------------------------------------
/Chapter 4 - Cross-validation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "markdown",
 12 |      "metadata": {},
 13 |      "source": [
 14 |       "\n",
 15 |       "Cross-validation\n",
 16 |       "===================\n",
 17 |       "What is cross-validation?\n",
 18 |       "--------------------------\n",
 19 |       "* A robust way to evaluate predictive accuracy.\n",
 20 |       "* Gives mean and standard deviation.\n",
 21 |       "* Makes good use of all the data."
 22 |      ]
 23 |     },
 24 |     {
 25 |      "cell_type": "code",
 26 |      "collapsed": false,
 27 |      "input": [
 28 |       "from sklearn.cross_validation import KFold\n",
 29 |       "n_samples = 200\n",
 30 |       "cv = KFold(n=n_samples, n_folds=5)"
 31 |      ],
 32 |      "language": "python",
 33 |      "metadata": {},
 34 |      "outputs": []
 35 |     },
 36 |     {
 37 |      "cell_type": "code",
 38 |      "collapsed": false,
 39 |      "input": [
 40 |       "%matplotlib inline\n",
 41 |       "import numpy as np\n",
 42 |       "import matplotlib.pyplot as plt\n",
 43 |       "\n",
 44 |       "for training_set, test_set in cv:\n",
 45 |       "    plt.figure(figsize=(20,1))\n",
 46 |       "    plt.plot(training_set, np.ones(len(training_set)), \"o\", color='blue', label=\"training set\")\n",
 47 |       "    plt.plot(test_set, np.ones(len(test_set)), \"o\", color='red', label=\"test set\")\n",
 48 |       "    plt.legend(loc=\"best\")\n",
 49 |       "    plt.axis(\"off\")"
 50 |      ],
 51 |      "language": "python",
 52 |      "metadata": {},
 53 |      "outputs": []
 54 |     },
 55 |     {
 56 |      "cell_type": "markdown",
 57 |      "metadata": {},
 58 |      "source": [
 59 |       "Using cross-validation in scikit-learn\n",
 60 |       "----------------------------------------"
 61 |      ]
 62 |     },
 63 |     {
 64 |      "cell_type": "code",
 65 |      "collapsed": false,
 66 |      "input": [
 67 |       "from sklearn.cross_validation import cross_val_score, train_test_split"
 68 |      ],
 69 |      "language": "python",
 70 |      "metadata": {},
 71 |      "outputs": []
 72 |     },
 73 |     {
 74 |      "cell_type": "code",
 75 |      "collapsed": false,
 76 |      "input": [
 77 |       "from sklearn.datasets import load_digits"
 78 |      ],
 79 |      "language": "python",
 80 |      "metadata": {},
 81 |      "outputs": []
 82 |     },
 83 |     {
 84 |      "cell_type": "code",
 85 |      "collapsed": false,
 86 |      "input": [
 87 |       "digits = load_digits()"
 88 |      ],
 89 |      "language": "python",
 90 |      "metadata": {},
 91 |      "outputs": []
 92 |     },
 93 |     {
 94 |      "cell_type": "code",
 95 |      "collapsed": false,
 96 |      "input": [
 97 |       "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
 98 |      ],
 99 |      "language": "python",
100 |      "metadata": {},
101 |      "outputs": []
102 |     },
103 |     {
104 |      "cell_type": "code",
105 |      "collapsed": false,
106 |      "input": [
107 |       "from sklearn.svm import SVC"
108 |      ],
109 |      "language": "python",
110 |      "metadata": {},
111 |      "outputs": []
112 |     },
113 |     {
114 |      "cell_type": "code",
115 |      "collapsed": false,
116 |      "input": [
117 |       "cross_val_score(SVC(C=1), X_train, y_train, cv=3)"
118 |      ],
119 |      "language": "python",
120 |      "metadata": {},
121 |      "outputs": []
122 |     },
123 |     {
124 |      "cell_type": "code",
125 |      "collapsed": false,
126 |      "input": [
127 |       "cross_val_score(SVC(C=10), X_train, y_train, cv=3, scoring=\"f1\")"
128 |      ],
129 |      "language": "python",
130 |      "metadata": {},
131 |      "outputs": []
132 |     },
133 |     {
134 |      "cell_type": "markdown",
135 |      "metadata": {},
136 |      "source": [
137 |       "Let's go to a binary task for a moment (even vs. odd digits)"
138 |      ]
139 |     },
140 |     {
141 |      "cell_type": "code",
142 |      "collapsed": false,
143 |      "input": [
144 |       "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3)"
145 |      ],
146 |      "language": "python",
147 |      "metadata": {},
148 |      "outputs": []
149 |     },
150 |     {
151 |      "cell_type": "code",
152 |      "collapsed": false,
153 |      "input": [
154 |       "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"average_precision\")"
155 |      ],
156 |      "language": "python",
157 |      "metadata": {},
158 |      "outputs": []
159 |     },
160 |     {
161 |      "cell_type": "code",
162 |      "collapsed": false,
163 |      "input": [
164 |       "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"roc_auc\")"
165 |      ],
166 |      "language": "python",
167 |      "metadata": {},
168 |      "outputs": []
169 |     },
170 |     {
171 |      "cell_type": "markdown",
172 |      "metadata": {},
173 |      "source": [
174 |       "There are other ways to do cross-validation"
175 |      ]
176 |     },
177 |     {
178 |      "cell_type": "code",
179 |      "collapsed": false,
180 |      "input": [
181 |       "from sklearn.cross_validation import ShuffleSplit\n",
182 |       "cross_val_score(SVC(C=10), X_train, y_train, cv=ShuffleSplit(len(X_train), 10, test_size=.4))"
183 |      ],
184 |      "language": "python",
185 |      "metadata": {},
186 |      "outputs": []
187 |     },
188 |     {
189 |      "cell_type": "markdown",
190 |      "metadata": {},
191 |      "source": [
192 |       "Tasks\n",
193 |       "======\n",
194 |       "1. Select a good ``gamma`` and ``C`` for SVC on ``digits`` using cross-validation.\n",
195 |       "2. Validate your findings on the test set."
196 |      ]
197 |     }
198 |    ],
199 |    "metadata": {}
200 |   }
201 |  ]
202 | }


--------------------------------------------------------------------------------
/Chapter 0 - Hello World.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "markdown",
 12 |      "metadata": {},
 13 |      "source": [
 14 |       "Get me at http://tinyurl.com/sklcds\n",
 15 |       "========================================"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "The IPython Notebook!\n",
 23 |       "========================\n",
 24 |       "Press shift + enter to run a cell.\n",
 25 |       "\n",
 26 |       "You can go back to previous cells, change them and re-run them."
 27 |      ]
 28 |     },
 29 |     {
 30 |      "cell_type": "code",
 31 |      "collapsed": false,
 32 |      "input": [
 33 |       "print(\"Hello World\")"
 34 |      ],
 35 |      "language": "python",
 36 |      "metadata": {},
 37 |      "outputs": []
 38 |     },
 39 |     {
 40 |      "cell_type": "code",
 41 |      "collapsed": false,
 42 |      "input": [
 43 |       "X = 112"
 44 |      ],
 45 |      "language": "python",
 46 |      "metadata": {},
 47 |      "outputs": []
 48 |     },
 49 |     {
 50 |      "cell_type": "code",
 51 |      "collapsed": false,
 52 |      "input": [
 53 |       "print(X)"
 54 |      ],
 55 |      "language": "python",
 56 |      "metadata": {},
 57 |      "outputs": []
 58 |     },
 59 |     {
 60 |      "cell_type": "code",
 61 |      "collapsed": false,
 62 |      "input": [
 63 |       "range(10)"
 64 |      ],
 65 |      "language": "python",
 66 |      "metadata": {},
 67 |      "outputs": []
 68 |     },
 69 |     {
 70 |      "cell_type": "markdown",
 71 |      "metadata": {},
 72 |      "source": [
 73 |       " IPython notebook allows tab-completion and shows docstrings (by pressing tab [shift-tab in latest versions] after the opening parenthesis), or using ?\n",
 74 |       " "
 75 |      ]
 76 |     },
 77 |     {
 78 |      "cell_type": "code",
 79 |      "collapsed": false,
 80 |      "input": [
 81 |       "range?"
 82 |      ],
 83 |      "language": "python",
 84 |      "metadata": {},
 85 |      "outputs": []
 86 |     },
 87 |     {
 88 |      "cell_type": "markdown",
 89 |      "metadata": {},
 90 |      "source": [
 91 |       "Cells can be arbitrarily long or short, and can define functions that will be available in other cells."
 92 |      ]
 93 |     },
 94 |     {
 95 |      "cell_type": "code",
 96 |      "collapsed": false,
 97 |      "input": [
 98 |       "def fib(n):\n",
 99 |       "    if n in [0, 1]:\n",
100 |       "        return n\n",
101 |       "    return fib(n - 1) + fib(n - 2)\n",
102 |       "\n",
103 |       "for x in range(5):\n",
104 |       "    print(fib(x))"
105 |      ],
106 |      "language": "python",
107 |      "metadata": {},
108 |      "outputs": []
109 |     },
110 |     {
111 |      "cell_type": "markdown",
112 |      "metadata": {},
113 |      "source": [
114 |       "Numpy\n",
115 |       "======\n",
116 |       "Numpy arrays are the most common numeric data type.\n",
117 |       "\n",
118 |       "As in other environments, it is very beneficial to vectorize your code (array-based computing) to make use of fast C and Fortran implementations."
119 |      ]
120 |     },
121 |     {
122 |      "cell_type": "code",
123 |      "collapsed": false,
124 |      "input": [
125 |       "import numpy as np\n",
126 |       "np.ones(10)"
127 |      ],
128 |      "language": "python",
129 |      "metadata": {},
130 |      "outputs": []
131 |     },
132 |     {
133 |      "cell_type": "code",
134 |      "collapsed": false,
135 |      "input": [
136 |       "np.ones((10, 10))"
137 |      ],
138 |      "language": "python",
139 |      "metadata": {},
140 |      "outputs": []
141 |     },
142 |     {
143 |      "cell_type": "code",
144 |      "collapsed": false,
145 |      "input": [
146 |       "np.arange(10)"
147 |      ],
148 |      "language": "python",
149 |      "metadata": {},
150 |      "outputs": []
151 |     },
152 |     {
153 |      "cell_type": "markdown",
154 |      "metadata": {},
155 |      "source": [
156 |       "Numpy allows *broadcasting* over rows, leading to practical short-hand notations."
157 |      ]
158 |     },
159 |     {
160 |      "cell_type": "code",
161 |      "collapsed": false,
162 |      "input": [
163 |       "X = np.ones((10, 10)) + np.array([3, 5, 1, 10, 6, 12, 98, 1, 0, 3])\n",
164 |       "print(X)"
165 |      ],
166 |      "language": "python",
167 |      "metadata": {},
168 |      "outputs": []
169 |     },
170 |     {
171 |      "cell_type": "markdown",
172 |      "metadata": {},
173 |      "source": [
174 |       "Most libraries in Python use object oriented interfaces."
175 |      ]
176 |     },
177 |     {
178 |      "cell_type": "code",
179 |      "collapsed": false,
180 |      "input": [
181 |       "X.mean(axis=0)"
182 |      ],
183 |      "language": "python",
184 |      "metadata": {},
185 |      "outputs": []
186 |     },
187 |     {
188 |      "cell_type": "markdown",
189 |      "metadata": {},
190 |      "source": [
191 |       "Numpy has all standard array functions, linear algebra, and *fancy indexing*."
192 |      ]
193 |     },
194 |     {
195 |      "cell_type": "code",
196 |      "collapsed": false,
197 |      "input": [
198 |       "X[:3, 1:4]"
199 |      ],
200 |      "language": "python",
201 |      "metadata": {},
202 |      "outputs": []
203 |     },
204 |     {
205 |      "cell_type": "code",
206 |      "collapsed": false,
207 |      "input": [
208 |       "X[:, ::2]"
209 |      ],
210 |      "language": "python",
211 |      "metadata": {},
212 |      "outputs": []
213 |     },
214 |     {
215 |      "cell_type": "code",
216 |      "collapsed": false,
217 |      "input": [
218 |       "X = np.random.randint(10, size=(32, 103))\n",
219 |       "X"
220 |      ],
221 |      "language": "python",
222 |      "metadata": {},
223 |      "outputs": []
224 |     },
225 |     {
226 |      "cell_type": "code",
227 |      "collapsed": false,
228 |      "input": [
229 |       "X[np.random.randint(32, size=10)]"
230 |      ],
231 |      "language": "python",
232 |      "metadata": {},
233 |      "outputs": []
234 |     },
235 |     {
236 |      "cell_type": "code",
237 |      "collapsed": false,
238 |      "input": [
239 |       "X[np.random.randint(32, size=10)].shape"
240 |      ],
241 |      "language": "python",
242 |      "metadata": {},
243 |      "outputs": []
244 |     },
245 |     {
246 |      "cell_type": "markdown",
247 |      "metadata": {},
248 |      "source": [
249 | 
250 |       "Matplotlib\n",
251 |       "=============\n",
252 |       "For all of your plotting needs!"
253 |      ]
254 |     },
255 |     {
256 |      "cell_type": "markdown",
257 |      "metadata": {},
258 |      "source": [
259 |       "Enable in-line plotting (can be done in config file)"
260 |      ]
261 |     },
262 |     {
263 |      "cell_type": "code",
264 |      "collapsed": false,
265 |      "input": [
266 |       "%matplotlib inline\n",
267 |       "import matplotlib.pyplot as plt\n",
268 |       "import numpy as np"
269 |      ],
270 |      "language": "python",
271 |      "metadata": {},
272 |      "outputs": []
273 |     },
274 |     {
275 |      "cell_type": "code",
276 |      "collapsed": false,
277 |      "input": [
278 |       "plt.plot(np.random.uniform(size=10))"
279 |      ],
280 |      "language": "python",
281 |      "metadata": {},
282 |      "outputs": []
283 |     },
284 |     {
285 |      "cell_type": "code",
286 |      "collapsed": false,
287 |      "input": [
288 |       "plt.bar(np.arange(10), np.random.uniform(size=10))"
289 |      ],
290 |      "language": "python",
291 |      "metadata": {},
292 |      "outputs": []
293 |     },
294 |     {
295 |      "cell_type": "code",
296 |      "collapsed": false,
297 |      "input": [
298 |       "plt.hist(np.random.normal(size=1000))"
299 |      ],
300 |      "language": "python",
301 |      "metadata": {},
302 |      "outputs": []
303 |     },
304 |     {
305 |      "cell_type": "code",
306 |      "collapsed": false,
307 |      "input": [
308 |       "x, y = np.random.uniform(size=(2, 10))\n",
309 |       "plt.scatter(x, y, marker=\"x\")"
310 |      ],
311 |      "language": "python",
312 |      "metadata": {},
313 |      "outputs": []
314 |     },
315 |     {
316 |      "cell_type": "code",
317 |      "collapsed": false,
318 |      "input": [
319 |       "print(np.eye(5))\n",
320 |       "plt.matshow(np.eye(5))"
321 |      ],
322 |      "language": "python",
323 |      "metadata": {},
324 |      "outputs": []
325 |     },
326 |     {
327 |      "cell_type": "markdown",
328 |      "metadata": {},
329 |      "source": [
330 |       "Tasks\n",
331 |       "======\n",
332 |       "\n",
333 |       "1. Plot the function ``f(x) = x ** 2`` using lines. How can you get a smooth plot?\n",
334 |       "2. Visualize a two-dimensional Gaussian distribution from samples. How does your approach look with a very large number of samples?"
335 |      ]
336 |     },
337 |     {
338 |      "cell_type": "code",
339 |      "collapsed": false,
340 |      "input": [],
341 |      "language": "python",
342 |      "metadata": {},
343 |      "outputs": []
344 |     }
345 |    ],
346 |    "metadata": {}
347 |   }
348 |  ]
349 | }
350 | 


--------------------------------------------------------------------------------
/Chapter 5 - Model Selection And Pipelines.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "markdown",
 12 |      "metadata": {},
 13 |      "source": [
 14 |       "Model Selection\n",
 15 |       "================="
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "Grid-Search with built-in cross-validation"
 23 |      ]
 24 |     },
 25 |     {
 26 |      "cell_type": "code",
 27 |      "collapsed": false,
 28 |      "input": [
 29 |       "from sklearn.grid_search import GridSearchCV"
 30 |      ],
 31 |      "language": "python",
 32 |      "metadata": {},
 33 |      "outputs": []
 34 |     },
 35 |     {
 36 |      "cell_type": "markdown",
 37 |      "metadata": {},
 38 |      "source": [
 39 |       "Define parameter grid:"
 40 |      ]
 41 |     },
 42 |     {
 43 |      "cell_type": "code",
 44 |      "collapsed": false,
 45 |      "input": [
 46 |       "import numpy as np\n",
 47 |       "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-3, 3)}\n",
 48 |       "print(param_grid)"
 49 |      ],
 50 |      "language": "python",
 51 |      "metadata": {},
 52 |      "outputs": []
 53 |     },
 54 |     {
 55 |      "cell_type": "code",
 56 |      "collapsed": false,
 57 |      "input": [
 58 |       "from sklearn.svm import SVC\n",
 59 |       "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)"
 60 |      ],
 61 |      "language": "python",
 62 |      "metadata": {},
 63 |      "outputs": []
 64 |     },
 65 |     {
 66 |      "cell_type": "markdown",
 67 |      "metadata": {},
 68 |      "source": [
 69 |       "A GridSearchCV object behaves just like a normal classifier."
 70 |      ]
 71 |     },
 72 |     {
 73 |      "cell_type": "code",
 74 |      "collapsed": false,
 75 |      "input": [
 76 |       "from sklearn.datasets import load_digits\n",
 77 |       "from sklearn.cross_validation import train_test_split\n",
 78 |       "digits = load_digits()\n",
 79 |       "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
 80 |      ],
 81 |      "language": "python",
 82 |      "metadata": {},
 83 |      "outputs": []
 84 |     },
 85 |     {
 86 |      "cell_type": "code",
 87 |      "collapsed": false,
 88 |      "input": [
 89 |       "grid_search.fit(X_train, y_train)"
 90 |      ],
 91 |      "language": "python",
 92 |      "metadata": {},
 93 |      "outputs": []
 94 |     },
 95 |     {
 96 |      "cell_type": "code",
 97 |      "collapsed": false,
 98 |      "input": [
 99 |       "# We extract just the scores\n",
100 |       "%matplotlib inline\n",
101 |       "import matplotlib.pyplot as plt\n",
102 |       "\n",
103 |       "scores = [x[1] for x in grid_search.grid_scores_]\n",
104 |       "scores = np.array(scores).reshape(6, 6)\n",
105 |       "\n",
106 |       "plt.matshow(scores)\n",
107 |       "plt.xlabel('gamma')\n",
108 |       "plt.ylabel('C')\n",
109 |       "plt.colorbar()\n",
110 |       "plt.xticks(np.arange(6), param_grid['gamma'])\n",
111 |       "plt.yticks(np.arange(6), param_grid['C'])"
112 |      ],
113 |      "language": "python",
114 |      "metadata": {},
115 |      "outputs": []
116 |     },
117 |     {
118 |      "cell_type": "code",
119 |      "collapsed": false,
120 |      "input": [
121 |       "grid_search.best_params_"
122 |      ],
123 |      "language": "python",
124 |      "metadata": {},
125 |      "outputs": []
126 |     },
127 |     {
128 |      "cell_type": "code",
129 |      "collapsed": false,
130 |      "input": [
131 |       "grid_search.predict(X_test)"
132 |      ],
133 |      "language": "python",
134 |      "metadata": {},
135 |      "outputs": []
136 |     },
137 |     {
138 |      "cell_type": "code",
139 |      "collapsed": false,
140 |      "input": [
141 |       "grid_search.score(X_test, y_test)"
142 |      ],
143 |      "language": "python",
144 |      "metadata": {},
145 |      "outputs": []
146 |     },
147 |     {
148 |      "cell_type": "markdown",
149 |      "metadata": {},
150 |      "source": [
151 |       "Preprocessing and Pipelines\n",
152 |       "============================="
153 |      ]
154 |     },
155 |     {
156 |      "cell_type": "code",
157 |      "collapsed": false,
158 |      "input": [
159 |       "from sklearn.preprocessing import StandardScaler"
160 |      ],
161 |      "language": "python",
162 |      "metadata": {},
163 |      "outputs": []
164 |     },
165 |     {
166 |      "cell_type": "markdown",
167 |      "metadata": {},
168 |      "source": [
169 |       "Same interface as always."
170 |      ]
171 |     },
172 |     {
173 |      "cell_type": "code",
174 |      "collapsed": false,
175 |      "input": [
176 |       "scaler = StandardScaler()"
177 |      ],
178 |      "language": "python",
179 |      "metadata": {},
180 |      "outputs": []
181 |     },
182 |     {
183 |      "cell_type": "code",
184 |      "collapsed": false,
185 |      "input": [
186 |       "scaler.fit(X_train)"
187 |      ],
188 |      "language": "python",
189 |      "metadata": {},
190 |      "outputs": []
191 |     },
192 |     {
193 |      "cell_type": "code",
194 |      "collapsed": false,
195 |      "input": [
196 |       "scaler.transform(X_train).mean(axis=0)"
197 |      ],
198 |      "language": "python",
199 |      "metadata": {},
200 |      "outputs": []
201 |     },
202 |     {
203 |      "cell_type": "code",
204 |      "collapsed": false,
205 |      "input": [
206 |       "scaler.transform(X_train).std(axis=0)"
207 |      ],
208 |      "language": "python",
209 |      "metadata": {},
210 |      "outputs": []
211 |     },
212 |     {
213 |      "cell_type": "markdown",
214 |      "metadata": {},
215 |      "source": [
216 |       "For cross-validation, we need to estimate the mean and standard deviation separately for each fold, using only that fold's training data.\n",
217 |       "To do that, we build a pipeline."
218 |      ]
219 |     },
220 |     {
221 |      "cell_type": "code",
222 |      "collapsed": false,
223 |      "input": [
224 |       "from sklearn.pipeline import Pipeline"
225 |      ],
226 |      "language": "python",
227 |      "metadata": {},
228 |      "outputs": []
229 |     },
230 |     {
231 |      "cell_type": "code",
232 |      "collapsed": false,
233 |      "input": [
234 |       "pipeline = Pipeline([(\"scaler\", scaler), (\"svm\", SVC())])"
235 |      ],
236 |      "language": "python",
237 |      "metadata": {},
238 |      "outputs": []
239 |     },
240 |     {
241 |      "cell_type": "code",
242 |      "collapsed": false,
243 |      "input": [
244 |       "pipeline.fit(X_train, y_train)"
245 |      ],
246 |      "language": "python",
247 |      "metadata": {},
248 |      "outputs": []
249 |     },
250 |     {
251 |      "cell_type": "code",
252 |      "collapsed": false,
253 |      "input": [
254 |       "pipeline.predict(X_train)"
255 |      ],
256 |      "language": "python",
257 |      "metadata": {},
258 |      "outputs": []
259 |     },
260 |     {
261 |      "cell_type": "markdown",
262 |      "metadata": {},
263 |      "source": [
264 |       "Cross-validation with a pipeline\n",
265 |       "---------------------------------"
266 |      ]
267 |     },
268 |     {
269 |      "cell_type": "code",
270 |      "collapsed": false,
271 |      "input": [
272 |       "from sklearn.cross_validation import cross_val_score\n",
273 |       "cross_val_score(pipeline, X_train, y_train)"
274 |      ],
275 |      "language": "python",
276 |      "metadata": {},
277 |      "outputs": []
278 |     },
279 |     {
280 |      "cell_type": "markdown",
281 |      "metadata": {},
282 |      "source": [
283 |       "So, yeah, don't forget the preprocessing."
284 |      ]
285 |     },
286 |     {
287 |      "cell_type": "code",
288 |      "collapsed": false,
289 |      "input": [
290 |       "param_grid_pipeline = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma' : 10. ** np.arange(-3, 3)}\n",
291 |       "\n",
292 |       "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid_pipeline, verbose=3)"
293 |      ],
294 |      "language": "python",
295 |      "metadata": {},
296 |      "outputs": []
297 |     },
298 |     {
299 |      "cell_type": "code",
300 |      "collapsed": false,
301 |      "input": [
302 |       "grid_pipeline.fit(X_train, y_train)"
303 |      ],
304 |      "language": "python",
305 |      "metadata": {},
306 |      "outputs": []
307 |     },
308 |     {
309 |      "cell_type": "code",
310 |      "collapsed": false,
311 |      "input": [
312 |       "# We extract just the scores\n",
313 |       "scores = [x[1] for x in grid_pipeline.grid_scores_]\n",
314 |       "scores = np.array(scores).reshape(6, 6)\n",
315 |       "\n",
316 |       "plt.matshow(scores)\n",
317 |       "plt.xlabel('gamma')\n",
318 |       "plt.ylabel('C')\n",
319 |       "plt.colorbar()\n",
320 |       "plt.xticks(np.arange(6), param_grid['gamma'])\n",
321 |       "plt.yticks(np.arange(6), param_grid['C'])"
322 |      ],
323 |      "language": "python",
324 |      "metadata": {},
325 |      "outputs": []
326 |     },
327 |     {
328 |      "cell_type": "code",
329 |      "collapsed": false,
330 |      "input": [
331 |       "grid_pipeline.score(X_test, y_test)"
332 |      ],
333 |      "language": "python",
334 |      "metadata": {},
335 |      "outputs": []
336 |     },
337 |     {
338 |      "cell_type": "markdown",
339 |      "metadata": {},
340 |      "source": [
341 |       "Randomized Searching\n",
342 |       "======================"
343 |      ]
344 |     },
345 |     {
346 |      "cell_type": "code",
347 |      "collapsed": false,
348 |      "input": [
349 |       "from sklearn.grid_search import RandomizedSearchCV"
350 |      ],
351 |      "language": "python",
352 |      "metadata": {},
353 |      "outputs": []
354 |     },
355 |     {
356 |      "cell_type": "code",
357 |      "collapsed": false,
358 |      "input": [
359 |       "from scipy.stats import expon"
360 |      ],
361 |      "language": "python",
362 |      "metadata": {},
363 |      "outputs": []
364 |     },
365 |     {
366 |      "cell_type": "code",
367 |      "collapsed": false,
368 |      "input": [
369 |       "plt.hist([expon.rvs() for x in xrange(1000)])"
370 |      ],
371 |      "language": "python",
372 |      "metadata": {},
373 |      "outputs": []
374 |     },
375 |     {
376 |      "cell_type": "code",
377 |      "collapsed": false,
378 |      "input": [
379 |       "params = {'C': expon(), 'gamma': expon()}\n",
380 |       "rs = RandomizedSearchCV(SVC(), param_distributions=params, n_iter=50, verbose=3)"
381 |      ],
382 |      "language": "python",
383 |      "metadata": {},
384 |      "outputs": []
385 |     },
386 |     {
387 |      "cell_type": "code",
388 |      "collapsed": false,
389 |      "input": [
390 |       "rs.fit(X_train, y_train)"
391 |      ],
392 |      "language": "python",
393 |      "metadata": {},
394 |      "outputs": []
395 |     },
396 |     {
397 |      "cell_type": "code",
398 |      "collapsed": false,
399 |      "input": [
400 |       "rs.best_params_"
401 |      ],
402 |      "language": "python",
403 |      "metadata": {},
404 |      "outputs": []
405 |     },
406 |     {
407 |      "cell_type": "code",
408 |      "collapsed": false,
409 |      "input": [
410 |       "rs.best_score_"
411 |      ],
412 |      "language": "python",
413 |      "metadata": {},
414 |      "outputs": []
415 |     },
416 |     {
417 |      "cell_type": "code",
418 |      "collapsed": false,
419 |      "input": [
420 |       "scores, Cs, gammas = zip(*[(score.mean_validation_score, score.parameters['C'], score.parameters['gamma']) for score in rs.grid_scores_])"
421 |      ],
422 |      "language": "python",
423 |      "metadata": {},
424 |      "outputs": []
425 |     },
426 |     {
427 |      "cell_type": "code",
428 |      "collapsed": false,
429 |      "input": [
430 |       "plt.scatter(Cs, gammas, s=40, c=scores)\n",
431 |       "plt.xlabel(\"C\")\n",
432 |       "plt.ylabel(\"gamma\")"
433 |      ],
434 |      "language": "python",
435 |      "metadata": {},
436 |      "outputs": []
437 |     },
438 |     {
439 |      "cell_type": "markdown",
440 |      "metadata": {},
441 |      "source": [
442 |       "Tasks\n",
443 |       "=====\n",
444 |       "1. Do a grid search over a pipeline consisting of ``SelectKBest`` feature selection and an RBF SVM on the iris dataset."
445 |      ]
446 |     }
447 |    ],
448 |    "metadata": {}
449 |   }
450 |  ]
451 | }


--------------------------------------------------------------------------------
/Chapter 6 - Working With Text Data..ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "code",
 12 |      "collapsed": false,
 13 |      "input": [
 14 |       "import pandas as pd\n",
 15 |       "data = pd.read_csv(\"train.csv\")"
 16 |      ],
 17 |      "language": "python",
 18 |      "metadata": {},
 19 |      "outputs": []
 20 |     },
 21 |     {
 22 |      "cell_type": "code",
 23 |      "collapsed": false,
 24 |      "input": [
 25 |       "len(data)"
 26 |      ],
 27 |      "language": "python",
 28 |      "metadata": {},
 29 |      "outputs": []
 30 |     },
 31 |     {
 32 |      "cell_type": "code",
 33 |      "collapsed": false,
 34 |      "input": [
 35 |       "data.columns"
 36 |      ],
 37 |      "language": "python",
 38 |      "metadata": {},
 39 |      "outputs": []
 40 |     },
 41 |     {
 42 |      "cell_type": "code",
 43 |      "collapsed": false,
 44 |      "input": [
 45 |       "data.Insult.value_counts()"
 46 |      ],
 47 |      "language": "python",
 48 |      "metadata": {},
 49 |      "outputs": []
 50 |     },
 51 |     {
 52 |      "cell_type": "code",
 53 |      "collapsed": false,
 54 |      "input": [],
 55 |      "language": "python",
 56 |      "metadata": {},
 57 |      "outputs": []
 58 |     },
 59 |     {
 60 |      "cell_type": "code",
 61 |      "collapsed": false,
 62 |      "input": [
 63 |       "import numpy as np\n",
 64 |       "y_train = np.array(data.Insult)"
 65 |      ],
 66 |      "language": "python",
 67 |      "metadata": {},
 68 |      "outputs": []
 69 |     },
 70 |     {
 71 |      "cell_type": "code",
 72 |      "collapsed": false,
 73 |      "input": [
 74 |       "y_train"
 75 |      ],
 76 |      "language": "python",
 77 |      "metadata": {},
 78 |      "outputs": []
 79 |     },
 80 |     {
 81 |      "cell_type": "code",
 82 |      "collapsed": false,
 83 |      "input": [
 84 |       "text_train = data.Comment.tolist()"
 85 |      ],
 86 |      "language": "python",
 87 |      "metadata": {},
 88 |      "outputs": []
 89 |     },
 90 |     {
 91 |      "cell_type": "code",
 92 |      "collapsed": false,
 93 |      "input": [
 94 |       "data_test = pd.read_csv(\"test_with_solutions.csv\")"
 95 |      ],
 96 |      "language": "python",
 97 |      "metadata": {},
 98 |      "outputs": []
 99 |     },
100 |     {
101 |      "cell_type": "code",
102 |      "collapsed": false,
103 |      "input": [
104 |       "data_test"
105 |      ],
106 |      "language": "python",
107 |      "metadata": {},
108 |      "outputs": []
109 |     },
110 |     {
111 |      "cell_type": "code",
112 |      "collapsed": false,
113 |      "input": [
114 |       "text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)"
115 |      ],
116 |      "language": "python",
117 |      "metadata": {},
118 |      "outputs": []
119 |     },
120 |     {
121 |      "cell_type": "code",
122 |      "collapsed": false,
123 |      "input": [
124 |       "from sklearn.feature_extraction.text import CountVectorizer"
125 |      ],
126 |      "language": "python",
127 |      "metadata": {},
128 |      "outputs": []
129 |     },
130 |     {
131 |      "cell_type": "code",
132 |      "collapsed": false,
133 |      "input": [
134 |       "cv = CountVectorizer()\n",
135 |       "cv.fit(text_train)"
136 |      ],
137 |      "language": "python",
138 |      "metadata": {},
139 |      "outputs": []
140 |     },
141 |     {
142 |      "cell_type": "code",
143 |      "collapsed": false,
144 |      "input": [
145 |       "len(cv.vocabulary_)"
146 |      ],
147 |      "language": "python",
148 |      "metadata": {},
149 |      "outputs": []
150 |     },
151 |     {
152 |      "cell_type": "code",
153 |      "collapsed": true,
154 |      "input": [
155 |       "cv.vocabulary_"
156 |      ],
157 |      "language": "python",
158 |      "metadata": {},
159 |      "outputs": []
160 |     },
161 |     {
162 |      "cell_type": "code",
163 |      "collapsed": false,
164 |      "input": [
165 |       "X_train = cv.transform(text_train)"
166 |      ],
167 |      "language": "python",
168 |      "metadata": {},
169 |      "outputs": []
170 |     },
171 |     {
172 |      "cell_type": "code",
173 |      "collapsed": false,
174 |      "input": [
175 |       "X_train.shape"
176 |      ],
177 |      "language": "python",
178 |      "metadata": {},
179 |      "outputs": []
180 |     },
181 |     {
182 |      "cell_type": "code",
183 |      "collapsed": false,
184 |      "input": [
185 |       "text_train[6]"
186 |      ],
187 |      "language": "python",
188 |      "metadata": {},
189 |      "outputs": []
190 |     },
191 |     {
192 |      "cell_type": "code",
193 |      "collapsed": false,
194 |      "input": [
195 |       "X_train[6, :].nonzero()"
196 |      ],
197 |      "language": "python",
198 |      "metadata": {},
199 |      "outputs": []
200 |     },
201 |     {
202 |      "cell_type": "code",
203 |      "collapsed": false,
204 |      "input": [
205 |       "X_train[6]"
206 |      ],
207 |      "language": "python",
208 |      "metadata": {},
209 |      "outputs": []
210 |     },
211 |     {
212 |      "cell_type": "code",
213 |      "collapsed": false,
214 |      "input": [
215 |       "X_test = cv.transform(text_test)"
216 |      ],
217 |      "language": "python",
218 |      "metadata": {},
219 |      "outputs": []
220 |     },
221 |     {
222 |      "cell_type": "code",
223 |      "collapsed": false,
224 |      "input": [
225 |       "from sklearn.svm import LinearSVC\n",
226 |       "svm = LinearSVC(C=.01)"
227 |      ],
228 |      "language": "python",
229 |      "metadata": {},
230 |      "outputs": []
231 |     },
232 |     {
233 |      "cell_type": "code",
234 |      "collapsed": false,
235 |      "input": [
236 |       "svm.fit(X_train, y_train)"
237 |      ],
238 |      "language": "python",
239 |      "metadata": {},
240 |      "outputs": []
241 |     },
242 |     {
243 |      "cell_type": "code",
244 |      "collapsed": false,
245 |      "input": [
246 |       "svm.score(X_train, y_train)"
247 |      ],
248 |      "language": "python",
249 |      "metadata": {},
250 |      "outputs": []
251 |     },
252 |     {
253 |      "cell_type": "code",
254 |      "collapsed": false,
255 |      "input": [
256 |       "svm.score(X_test, y_test)"
257 |      ],
258 |      "language": "python",
259 |      "metadata": {},
260 |      "outputs": []
261 |     },
262 |     {
263 |      "cell_type": "code",
264 |      "collapsed": false,
265 |      "input": [
266 |       "y_test_pred = svm.predict(X_test)"
267 |      ],
268 |      "language": "python",
269 |      "metadata": {},
270 |      "outputs": []
271 |     },
272 |     {
273 |      "cell_type": "code",
274 |      "collapsed": false,
275 |      "input": [
276 |       "from sklearn.metrics import classification_report"
277 |      ],
278 |      "language": "python",
279 |      "metadata": {},
280 |      "outputs": []
281 |     },
282 |     {
283 |      "cell_type": "code",
284 |      "collapsed": false,
285 |      "input": [
286 |       "print(classification_report(y_test, y_test_pred))"
287 |      ],
288 |      "language": "python",
289 |      "metadata": {},
290 |      "outputs": []
291 |     },
292 |     {
293 |      "cell_type": "code",
294 |      "collapsed": false,
295 |      "input": [
296 |       "coef = svm.coef_.ravel()\n",
297 |       "positive_coefficients = np.argsort(coef)[-25:]\n",
298 |       "negative_coefficients = np.argsort(coef)[:25]\n",
299 |       "interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n"
300 |      ],
301 |      "language": "python",
302 |      "metadata": {},
303 |      "outputs": []
304 |     },
305 |     {
306 |      "cell_type": "code",
307 |      "collapsed": false,
308 |      "input": [
309 |       "%matplotlib inline\n",
310 |       "import matplotlib.pyplot as plt\n",
311 |       "\n",
312 |       "plt.figure(figsize=(15, 5))\n",
313 |       "plt.bar(np.arange(50), coef[interesting_coefficients], color=[\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]])\n",
314 |       "feature_names = np.array(cv.get_feature_names())\n",
315 |       "plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");"
316 |      ],
317 |      "language": "python",
318 |      "metadata": {},
319 |      "outputs": []
320 |     },
321 |     {
322 |      "cell_type": "code",
323 |      "collapsed": false,
324 |      "input": [
325 |       "from sklearn.pipeline import Pipeline"
326 |      ],
327 |      "language": "python",
328 |      "metadata": {},
329 |      "outputs": []
330 |     },
331 |     {
332 |      "cell_type": "code",
333 |      "collapsed": false,
334 |      "input": [
335 |       "pipeline = Pipeline([('vectorizer', cv), ('classifier', svm)])"
336 |      ],
337 |      "language": "python",
338 |      "metadata": {},
339 |      "outputs": []
340 |     },
341 |     {
342 |      "cell_type": "code",
343 |      "collapsed": false,
344 |      "input": [
345 |       "pipeline.fit(text_train, y_train)"
346 |      ],
347 |      "language": "python",
348 |      "metadata": {},
349 |      "outputs": []
350 |     },
351 |     {
352 |      "cell_type": "code",
353 |      "collapsed": false,
354 |      "input": [
355 |       "pipeline.score(text_train, y_train)"
356 |      ],
357 |      "language": "python",
358 |      "metadata": {},
359 |      "outputs": []
360 |     },
361 |     {
362 |      "cell_type": "code",
363 |      "collapsed": false,
364 |      "input": [
365 |       "pipeline.score(text_test, y_test)"
366 |      ],
367 |      "language": "python",
368 |      "metadata": {},
369 |      "outputs": []
370 |     },
371 |     {
372 |      "cell_type": "code",
373 |      "collapsed": false,
374 |      "input": [
375 |       "from sklearn.grid_search import GridSearchCV"
376 |      ],
377 |      "language": "python",
378 |      "metadata": {},
379 |      "outputs": []
380 |     },
381 |     {
382 |      "cell_type": "code",
383 |      "collapsed": false,
384 |      "input": [
385 |       "param_grid = {'classifier__C': 10. ** np.arange(-3, 3)}\n",
386 |       "grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)"
387 |      ],
388 |      "language": "python",
389 |      "metadata": {},
390 |      "outputs": []
391 |     },
392 |     {
393 |      "cell_type": "code",
394 |      "collapsed": false,
395 |      "input": [
396 |       "grid_search.fit(text_train, y_train)"
397 |      ],
398 |      "language": "python",
399 |      "metadata": {},
400 |      "outputs": []
401 |     },
402 |     {
403 |      "cell_type": "code",
404 |      "collapsed": false,
405 |      "input": [
406 |       "grid_search.best_score_"
407 |      ],
408 |      "language": "python",
409 |      "metadata": {},
410 |      "outputs": []
411 |     },
412 |     {
413 |      "cell_type": "code",
414 |      "collapsed": false,
415 |      "input": [
416 |       "grid_search.best_params_"
417 |      ],
418 |      "language": "python",
419 |      "metadata": {},
420 |      "outputs": []
421 |     },
422 |     {
423 |      "cell_type": "code",
424 |      "collapsed": false,
425 |      "input": [
426 |       "param_grid = {'classifier__C': 10. ** np.arange(-3, 3), \"vectorizer__ngram_range\": [(1, 1), (1, 2), (1, 3), (2, 3), (2, 2)]}\n",
427 |       "grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)"
428 |      ],
429 |      "language": "python",
430 |      "metadata": {},
431 |      "outputs": []
432 |     },
433 |     {
434 |      "cell_type": "code",
435 |      "collapsed": false,
436 |      "input": [
437 |       "grid_search.fit(text_train, y_train)"
438 |      ],
439 |      "language": "python",
440 |      "metadata": {},
441 |      "outputs": []
442 |     },
443 |     {
444 |      "cell_type": "code",
445 |      "collapsed": false,
446 |      "input": [
447 |       "grid_search.best_params_"
448 |      ],
449 |      "language": "python",
450 |      "metadata": {},
451 |      "outputs": []
452 |     },
453 |     {
454 |      "cell_type": "code",
455 |      "collapsed": false,
456 |      "input": [
457 |       "grid_search.best_score_"
458 |      ],
459 |      "language": "python",
460 |      "metadata": {},
461 |      "outputs": []
462 |     },
463 |     {
464 |      "cell_type": "markdown",
465 |      "metadata": {},
466 |      "source": [
467 |       "Tasks\n",
468 |       "======\n",
469 |       "1. Remove the above visualization code for the coefficients and try to recreate it.\n",
470 |       "2. Can you think of any other useful features for this task?"
471 |      ]
472 |     }
473 |    ],
474 |    "metadata": {}
475 |   }
476 |  ]
477 | }


--------------------------------------------------------------------------------
/Chapter 2 - Introduction to Scikit-learn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "markdown",
 12 |      "metadata": {},
 13 |      "source": [
 14 |       "Scikit-learn\n",
 15 |       "=============\n",
 16 |       "Machine learning for the masses!"
 17 |      ]
 18 |     },
 19 |     {
 20 |      "cell_type": "markdown",
 21 |      "metadata": {},
 22 |      "source": [
 23 |       "What?\n",
 24 |       "------\n",
 25 |       "**Algorithms**\n",
 26 |       "\n",
 27 |       "- Classification\n",
 28 |       "- Regression\n",
 29 |       "- Dimensionality reduction\n",
 30 |       "- Manifold learning\n",
 31 |       "- Feature selection\n",
 32 |       "- Semisupervised learning\n",
 33 |       "- Clustering\n",
 34 |       "\n",
 35 |       "**Tools**\n",
 36 |       "\n",
 37 |       "- Preprocessing\n",
 38 |       "- Pipelining\n",
 39 |       "- Model evaluation\n",
 40 |       "- Model selection\n",
 41 |       "\n",
 42 |       "**Features**\n",
 43 |       "\n",
 44 |       "- Sparse data\n",
 45 |       "- Dense data\n",
 46 |       "- Multi-core\n",
 47 |       "- Out-of-core\n",
 48 |       "- Cloud tools available"
 49 |      ]
 50 |     },
 51 |     {
 52 |      "cell_type": "markdown",
 53 |      "metadata": {},
 54 |      "source": [
 55 |       "Get some data to play with"
 56 |      ]
 57 |     },
 58 |     {
 59 |      "cell_type": "code",
 60 |      "collapsed": false,
 61 |      "input": [
 62 |       "from sklearn.datasets import load_digits\n",
 63 |       "digits = load_digits()\n",
 64 |       "digits.keys()"
 65 |      ],
 66 |      "language": "python",
 67 |      "metadata": {},
 68 |      "outputs": []
 69 |     },
 70 |     {
 71 |      "cell_type": "code",
 72 |      "collapsed": false,
 73 |      "input": [
 74 |       "digits.images.shape"
 75 |      ],
 76 |      "language": "python",
 77 |      "metadata": {},
 78 |      "outputs": []
 79 |     },
 80 |     {
 81 |      "cell_type": "code",
 82 |      "collapsed": false,
 83 |      "input": [
 84 |       "digits.data.shape"
 85 |      ],
 86 |      "language": "python",
 87 |      "metadata": {},
 88 |      "outputs": []
 89 |     },
 90 |     {
 91 |      "cell_type": "code",
 92 |      "collapsed": false,
 93 |      "input": [
 94 |       "digits.target.shape"
 95 |      ],
 96 |      "language": "python",
 97 |      "metadata": {},
 98 |      "outputs": []
 99 |     },
100 |     {
101 |      "cell_type": "markdown",
102 |      "metadata": {},
103 |      "source": [
104 |       "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**"
105 |      ]
106 |     },
107 |     {
108 |      "cell_type": "code",
109 |      "collapsed": false,
110 |      "input": [
111 |       "print(digits.images[0])\n"
112 |      ],
113 |      "language": "python",
114 |      "metadata": {},
115 |      "outputs": []
116 |     },
117 |     {
118 |      "cell_type": "code",
119 |      "collapsed": false,
120 |      "input": [
121 |       "print(digits.target[0])"
122 |      ],
123 |      "language": "python",
124 |      "metadata": {},
125 |      "outputs": []
126 |     },
127 |     {
128 |      "cell_type": "code",
129 |      "collapsed": false,
130 |      "input": [
131 |       "import matplotlib.pyplot as plt\n",
132 |       "%matplotlib inline\n",
133 |       "\n",
134 |       "plt.matshow(digits.images[0], cmap=plt.cm.Greys)"
135 |      ],
136 |      "language": "python",
137 |      "metadata": {},
138 |      "outputs": []
139 |     },
140 |     {
141 |      "cell_type": "markdown",
142 |      "metadata": {},
143 |      "source": [
144 |       "Split the data to get going"
145 |      ]
146 |     },
147 |     {
148 |      "cell_type": "code",
149 |      "collapsed": false,
150 |      "input": [
151 |       "from sklearn.cross_validation import train_test_split\n",
152 |       "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
153 |      ],
154 |      "language": "python",
155 |      "metadata": {},
156 |      "outputs": []
157 |     },
158 |     {
159 |      "cell_type": "markdown",
160 |      "metadata": {},
161 |      "source": [
162 |       "Really Simple API\n",
163 |       "-------------------\n",
164 |       "1) Import your model class"
165 |      ]
166 |     },
167 |     {
168 |      "cell_type": "code",
169 |      "collapsed": false,
170 |      "input": [
171 |       "from sklearn.svm import LinearSVC"
172 |      ],
173 |      "language": "python",
174 |      "metadata": {},
175 |      "outputs": []
176 |     },
177 |     {
178 |      "cell_type": "markdown",
179 |      "metadata": {},
180 |      "source": [
181 |       "2) Instantiate an object and set the parameters"
182 |      ]
183 |     },
184 |     {
185 |      "cell_type": "code",
186 |      "collapsed": false,
187 |      "input": [
188 |       "svm = LinearSVC(C=0.1)"
189 |      ],
190 |      "language": "python",
191 |      "metadata": {},
192 |      "outputs": []
193 |     },
194 |     {
195 |      "cell_type": "markdown",
196 |      "metadata": {},
197 |      "source": [
198 |       "3) Fit the model"
199 |      ]
200 |     },
201 |     {
202 |      "cell_type": "code",
203 |      "collapsed": false,
204 |      "input": [
205 |       "svm.fit(X_train, y_train)"
206 |      ],
207 |      "language": "python",
208 |      "metadata": {},
209 |      "outputs": []
210 |     },
211 |     {
212 |      "cell_type": "markdown",
213 |      "metadata": {},
214 |      "source": [
215 |       "4) Apply / evaluate"
216 |      ]
217 |     },
218 |     {
219 |      "cell_type": "code",
220 |      "collapsed": false,
221 |      "input": [
222 |       "print(svm.predict(X_train))\n",
223 |       "print(y_train)"
224 |      ],
225 |      "language": "python",
226 |      "metadata": {},
227 |      "outputs": []
228 |     },
229 |     {
230 |      "cell_type": "code",
231 |      "collapsed": false,
232 |      "input": [
233 |       "svm.score(X_train, y_train)"
234 |      ],
235 |      "language": "python",
236 |      "metadata": {},
237 |      "outputs": []
238 |     },
239 |     {
240 |      "cell_type": "code",
241 |      "collapsed": false,
242 |      "input": [
243 |       "svm.score(X_test, y_test)"
244 |      ],
245 |      "language": "python",
246 |      "metadata": {},
247 |      "outputs": []
248 |     },
249 |     {
250 |      "cell_type": "markdown",
251 |      "metadata": {},
252 |      "source": [
253 |       "And again\n",
254 |       "---------"
255 |      ]
256 |     },
257 |     {
258 |      "cell_type": "code",
259 |      "collapsed": false,
260 |      "input": [
261 |       "from sklearn.ensemble import RandomForestClassifier"
262 |      ],
263 |      "language": "python",
264 |      "metadata": {},
265 |      "outputs": []
266 |     },
267 |     {
268 |      "cell_type": "code",
269 |      "collapsed": false,
270 |      "input": [
271 |       "rf = RandomForestClassifier(n_estimators=50, random_state=0)  # seeded so the fitted forest is the same on every run"
272 |      ],
273 |      "language": "python",
274 |      "metadata": {},
275 |      "outputs": []
276 |     },
277 |     {
278 |      "cell_type": "code",
279 |      "collapsed": false,
280 |      "input": [
281 |       "rf.fit(X_train, y_train)"
282 |      ],
283 |      "language": "python",
284 |      "metadata": {},
285 |      "outputs": []
286 |     },
287 |     {
288 |      "cell_type": "code",
289 |      "collapsed": false,
290 |      "input": [
291 |       "rf.score(X_train, y_train)"
292 |      ],
293 |      "language": "python",
294 |      "metadata": {},
295 |      "outputs": []
296 |     },
297 |     {
298 |      "cell_type": "code",
299 |      "collapsed": false,
300 |      "input": [
301 |       "rf.score(X_test, y_test)"
302 |      ],
303 |      "language": "python",
304 |      "metadata": {},
305 |      "outputs": []
306 |     },
307 |     {
308 |      "cell_type": "code",
309 |      "collapsed": false,
310 |      "input": [
311 |       "#%load from github"
312 |      ],
313 |      "language": "python",
314 |      "metadata": {},
315 |      "outputs": []
316 |     },
317 |     {
318 |      "cell_type": "code",
319 |      "collapsed": false,
320 |      "input": [
321 |       "#!/usr/bin/python\n",
322 |       "\n",
323 |       "\"\"\"\n",
324 |       "=====================\n",
325 |       "Classifier comparison\n",
326 |       "=====================\n",
327 |       "\n",
328 |       "A comparison of several classifiers in scikit-learn on synthetic datasets.\n",
329 |       "The point of this example is to illustrate the nature of decision boundaries\n",
330 |       "of different classifiers.\n",
331 |       "This should be taken with a grain of salt, as the intuition conveyed by\n",
332 |       "these examples does not necessarily carry over to real datasets.\n",
333 |       "\n",
334 |       "Particularly in high-dimensional spaces, data can more easily be separated\n",
335 |       "linearly and the simplicity of classifiers such as naive Bayes and linear SVMs\n",
336 |       "might lead to better generalization than is achieved by other classifiers.\n",
337 |       "\n",
338 |       "The plots show training points in solid colors and testing points\n",
339 |       "semi-transparent. The lower right shows the classification accuracy on the test\n",
340 |       "set.\n",
341 |       "\"\"\"\n",
342 |       "print(__doc__)\n",
343 |       "\n",
344 |       "\n",
345 |       "# Code source: Ga\u00ebl Varoquaux\n",
346 |       "#              Andreas M\u00fcller\n",
347 |       "# Modified for documentation by Jaques Grobler\n",
348 |       "# License: BSD 3 clause\n",
349 |       "\n",
350 |       "import numpy as np\n",
351 |       "import pylab as pl\n",
352 |       "from matplotlib.colors import ListedColormap\n",
353 |       "from sklearn.cross_validation import train_test_split\n",
354 |       "from sklearn.preprocessing import StandardScaler\n",
355 |       "from sklearn.datasets import make_moons, make_circles, make_classification\n",
356 |       "from sklearn.neighbors import KNeighborsClassifier\n",
357 |       "from sklearn.svm import SVC\n",
358 |       "from sklearn.tree import DecisionTreeClassifier\n",
359 |       "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
360 |       "from sklearn.naive_bayes import GaussianNB\n",
361 |       "from sklearn.lda import LDA\n",
362 |       "from sklearn.qda import QDA\n",
363 |       "\n",
364 |       "h = .02  # step size in the mesh\n",
365 |       "\n",
366 |       "names = [\"Nearest Neighbors\", \"Linear SVM\", \"RBF SVM\", \"Decision Tree\",\n",
367 |       "         \"Random Forest\", \"AdaBoost\", \"Naive Bayes\", \"LDA\", \"QDA\"]\n",
368 |       "classifiers = [\n",
369 |       "    KNeighborsClassifier(3),\n",
370 |       "    SVC(kernel=\"linear\", C=0.025),\n",
371 |       "    SVC(gamma=2, C=1),\n",
372 |       "    DecisionTreeClassifier(max_depth=5),\n",
373 |       "    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n",
374 |       "    AdaBoostClassifier(),\n",
375 |       "    GaussianNB(),\n",
376 |       "    LDA(),\n",
377 |       "    QDA()]\n",
378 |       "\n",
379 |       "X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,\n",
380 |       "                           random_state=1, n_clusters_per_class=1)\n",
381 |       "rng = np.random.RandomState(2)\n",
382 |       "X += 2 * rng.uniform(size=X.shape)\n",
383 |       "linearly_separable = (X, y)\n",
384 |       "\n",
385 |       "datasets = [make_moons(noise=0.3, random_state=0),\n",
386 |       "            make_circles(noise=0.2, factor=0.5, random_state=1),\n",
387 |       "            linearly_separable\n",
388 |       "            ]\n",
389 |       "\n",
390 |       "figure = pl.figure(figsize=(27, 9))\n",
391 |       "i = 1\n",
392 |       "# iterate over datasets\n",
393 |       "for ds in datasets:\n",
394 |       "    # standardize the features, then hold out 40% as a test part (seeded split)\n",
395 |       "    X, y = ds\n",
396 |       "    X = StandardScaler().fit_transform(X)\n",
397 |       "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)\n",
398 |       "\n",
399 |       "    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n",
400 |       "    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n",
401 |       "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n",
402 |       "                         np.arange(y_min, y_max, h))\n",
403 |       "\n",
404 |       "    # first panel of the row: just the dataset, no classifier\n",
405 |       "    cm = pl.cm.RdBu\n",
406 |       "    cm_bright = ListedColormap(['#FF0000', '#0000FF'])\n",
407 |       "    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)\n",
408 |       "    # Plot the training points\n",
409 |       "    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)\n",
410 |       "    # and the testing points, semi-transparent\n",
411 |       "    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)\n",
412 |       "    ax.set_xlim(xx.min(), xx.max())\n",
413 |       "    ax.set_ylim(yy.min(), yy.max())\n",
414 |       "    ax.set_xticks(())\n",
415 |       "    ax.set_yticks(())\n",
416 |       "    i += 1\n",
417 |       "\n",
418 |       "    # one panel per classifier, each fitted on this dataset's training part\n",
419 |       "    for name, clf in zip(names, classifiers):\n",
420 |       "        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)\n",
421 |       "        clf.fit(X_train, y_train)\n",
422 |       "        score = clf.score(X_test, y_test)\n",
423 |       "\n",
424 |       "        # Plot the decision boundary. For that, we will assign a color to each\n",
425 |       "        # point in the mesh [x_min, x_max]x[y_min, y_max].\n",
426 |       "        if hasattr(clf, \"decision_function\"):\n",
427 |       "            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n",
428 |       "        else:\n",
429 |       "            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n",
430 |       "\n",
431 |       "        # Reshape the flat mesh values back to the grid and draw filled contours\n",
432 |       "        Z = Z.reshape(xx.shape)\n",
433 |       "        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)\n",
434 |       "\n",
435 |       "        # Plot also the training points\n",
436 |       "        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)\n",
437 |       "        # and the testing points, semi-transparent\n",
438 |       "        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,\n",
439 |       "                   alpha=0.6)\n",
440 |       "\n",
441 |       "        ax.set_xlim(xx.min(), xx.max())\n",
442 |       "        ax.set_ylim(yy.min(), yy.max())\n",
443 |       "        ax.set_xticks(())\n",
444 |       "        ax.set_yticks(())\n",
445 |       "        ax.set_title(name)\n",
446 |       "        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),\n",
447 |       "                size=15, horizontalalignment='right')\n",
448 |       "        i += 1\n",
449 |       "\n",
450 |       "figure.subplots_adjust(left=.02, right=.98)\n",
451 |       "pl.show()\n"
452 |      ],
453 |      "language": "python",
454 |      "metadata": {},
455 |      "outputs": []
456 |     },
457 |     {
458 |      "cell_type": "markdown",
459 |      "metadata": {},
460 |      "source": [
461 |       "Tasks\n",
462 |       "======\n",
463 |       "1. Train a KNeighbors classifier on the digits dataset and compute the test accuracy.\n",
464 |       "2. Visualize some of the mistakes."
465 |      ]
466 |     }
467 |    ],
468 |    "metadata": {}
469 |   }
470 |  ]
471 | }


--------------------------------------------------------------------------------