├── .DS_Store ├── 01 Linear Regression ├── Gradient_Descent_Demo.ipynb ├── Linear_Regression.ipynb └── Linear_Regression_with_Multiple_Features.ipynb ├── 02 Logistic Regression └── Logistic_Regression.ipynb ├── 03 Dimensionality Reduction └── PCA.ipynb ├── 04 PCA └── PCA.ipynb ├── 05 KNN └── K_Nearest_Neigbours.ipynb ├── 06 K-Means └── K_Means.ipynb ├── 07 Naive Bayes ├── Naive Bayes For Text Classification.ipynb └── Spam Classifier.ipynb ├── 08 Decision Trees ├── Decision Trees.ipynb └── Titanic Survival Prediction.ipynb └── 09 Ensemble Learning ├── Customer_Churn_Prediction.ipynb ├── Ensemble - Bagging.ipynb └── Ensemble Learning - Boosting.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prateek27/machine-learning-essentials/ead6bf8c01764b86b12e1471255bb1e6cf7064b1/.DS_Store -------------------------------------------------------------------------------- /01 Linear Regression/Gradient_Descent_Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "id": "PfFSSy_kpLSc" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np \n", 26 | "import matplotlib.pyplot as plt" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "# Generate data using a dummy function \n", 33 | "X = np.arange(10)\n", 34 | "Y = (X-5)**2 + 3 \n", 35 | "\n", 36 | "print(X,Y)" 37 | ], 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "cxRCoyRWp31i", 43 | "outputId": "1ac5d03b-7f79-4dd0-89b2-c98b65f1e8a9" 44 | }, 45 | "execution_count": null, 46 | 
"outputs": [ 47 | { 48 | "output_type": "stream", 49 | "name": "stdout", 50 | "text": [ 51 | "[0 1 2 3 4 5 6 7 8 9] [28 19 12 7 4 3 4 7 12 19]\n" 52 | ] 53 | } 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "source": [ 59 | "# Visualise \n", 60 | "plt.style.use(\"seaborn\")\n", 61 | "plt.plot(X,Y)\n", 62 | "plt.xlabel(\"X\")\n", 63 | "plt.ylabel(\"Y\")\n", 64 | "plt.show()" 65 | ], 66 | "metadata": { 67 | "colab": { 68 | "base_uri": "https://localhost:8080/", 69 | "height": 361 70 | }, 71 | "id": "L4q4bzPkqNno", 72 | "outputId": "da1944a4-4878-4301-8f68-08eeb8990813" 73 | }, 74 | "execution_count": null, 75 | "outputs": [ 76 | { 77 | "output_type": "display_data", 78 | "data": { 79 | "text/plain": [ 80 | "
" 81 | ], 82 | "image/png": "\n" 83 | }, 84 | "metadata": {} 85 | } 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "source": [ 91 | "# Gradient Descent \n", 92 | "# init x with any random value \n", 93 | "x = 9\n", 94 | "lr = 0.1 \n", 95 | "\n", 96 | "plt.plot(X,Y)\n", 97 | "for i in range(30):\n", 98 | " grad = 2*(x-5)\n", 99 | " x = x - lr * grad \n", 100 | " y = (x-5)**2 + 3 \n", 101 | " plt.scatter(x,y)\n", 102 | " print(x)\n", 103 | "\n", 104 | "plt.show()" 105 | ], 106 | "metadata": { 107 | "colab": { 108 | "base_uri": "https://localhost:8080/", 109 | "height": 883 110 | }, 111 | "id": "vRn--jojqnnC", 112 | "outputId": "f1bc9bf9-73a8-413b-b4e1-d901ded9dd61" 113 | }, 114 | "execution_count": null, 115 | "outputs": [ 116 | { 117 | "output_type": "stream", 118 | "name": "stdout", 119 | "text": [ 120 | "8.2\n", 121 | "7.56\n", 122 | "7.048\n", 123 | "6.6384\n", 124 | "6.31072\n", 125 | "6.048576\n", 126 | "5.8388608\n", 127 | "5.67108864\n", 128 | "5.5368709119999995\n", 129 | "5.429496729599999\n", 130 | "5.34359738368\n", 131 | "5.274877906944\n", 132 | "5.2199023255552\n", 133 | "5.17592186044416\n", 134 | "5.140737488355328\n", 135 | "5.1125899906842625\n", 136 | "5.09007199254741\n", 137 | "5.072057594037927\n", 138 | "5.057646075230342\n", 139 | "5.046116860184274\n", 140 | "5.0368934881474186\n", 141 | "5.0295147905179345\n", 142 | "5.023611832414348\n", 143 | "5.018889465931478\n", 144 | "5.015111572745182\n", 145 | "5.012089258196146\n", 146 | "5.009671406556917\n", 147 | "5.007737125245534\n", 148 | "5.006189700196427\n", 149 | "5.004951760157142\n" 150 | ] 151 | }, 152 | { 153 | "output_type": "display_data", 154 | "data": { 155 | "text/plain": [ 156 | "
" 157 | ], 158 | "image/png": "\n" 159 | }, 160 | "metadata": {} 161 | } 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "source": [], 167 | "metadata": { 168 | "id": "G1naw6EAsDpF" 169 | }, 170 | "execution_count": null, 171 | "outputs": [] 172 | } 173 | ] 174 | } -------------------------------------------------------------------------------- /03 Dimensionality Reduction /PCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 104, 6 | "id": "78ddcaf5", 7 | "metadata": { 8 | "slideshow": { 9 | "slide_type": "skip" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "# jupyter nbconvert ./PCA.ipynb --to slides --post serve" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "9d341bed", 20 | "metadata": { 21 | "slideshow": { 22 | "slide_type": "slide" 23 | } 24 | }, 25 | "source": [ 26 | "# What is PCA?\n", 27 | "- PCA is a dimensionality reduction technique.\n", 28 | "- It is a Feature Extraction algorithm, not a feature selection.\n", 29 | "\n", 30 | " convert n features to k, where k" 117 | ] 118 | }, 119 | "metadata": { 120 | "needs_background": "light" 121 | }, 122 | "output_type": "display_data" 123 | } 124 | ], 125 | "source": [ 126 | "\n", 127 | "x, u = [0,0.4],[0,0.4]\n", 128 | "plt.scatter(X, y)\n", 129 | "plt.xlabel(\"Age\")\n", 130 | "plt.ylabel(\"Salaries\")\n", 131 | "plt.arrow(0, 0, 0.5, 0.5, width = 0.05, color='red')\n", 132 | "# plt.show()\n", 133 | "plt.savefig(\"./imgs/frame_with_dir.jpg\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "5181b00d", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "**Now 2D data looks like this. Which feature would you choose?**\n", 146 | "\n", 147 | "\n", 148 | "\n", 149 | "\n", 150 | "**Idea:**\n", 151 | "1. Find the directions $f_1^{'}$ and $f_2^{'}$. 
such that spread on $f_1^{'}$ >> spread on $f_2^{'}$\n", 152 | "2. Drop $f_2^{'}$\n", 153 | "3. Project all data points onto $f_1^{'}$\n", 154 | "\n", 155 | " - Thus 2D -> 1D\n", 156 | " - It is like the rotation of axis\n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
\n", 161 | "\n", 162 | "**Note:** \n", 163 | "Find a direction such that the variance of $x^{i}$ projected onto the direction is maximized." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "2a55c5cd", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "slide" 172 | } 173 | }, 174 | "source": [ 175 | "# PCA Objective - Maximizing Variance" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "4ea98365", 181 | "metadata": { 182 | "slideshow": { 183 | "slide_type": "subslide" 184 | } 185 | }, 186 | "source": [ 187 | "\n", 188 | "\n", 189 | "$\\Large u_1 : unit vector $, $\\large ||u_1|| = 1$\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "\n", 194 | "$\\huge x_i^{'} = proj_{u1} (x_i) $\n", 195 | "\n", 196 | "$\\huge x_i^{'} = \\frac{u_1.x_i}{||u_1||} $\n", 197 | "\n", 198 | "$\\huge x_i^{'} = u_1^T.x_i $\n", 199 | "\n", 200 | "
\n", 201 | "Find $u_1$ such that var{$proj_{u1} (x_i)$} is maximum." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "c8c2b508", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "slide" 210 | } 211 | }, 212 | "source": [ 213 | "# PCA Objective - Minimizing Distances" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "5305afc2", 219 | "metadata": { 220 | "slideshow": { 221 | "slide_type": "subslide" 222 | } 223 | }, 224 | "source": [ 225 | "\n", 226 | "\n", 227 | "$\\Large min_{u_1} \\sum_{i=1}^{m} {d_i^2} $ \n", 228 | "\n", 229 | "\n", 230 | "\n", 231 | "
\n", 232 | "Find $u_1$ such that the sum of the squared distances from each point to $u_1$ is minimized." 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "0d911785", 238 | "metadata": { 239 | "slideshow": { 240 | "slide_type": "slide" 241 | } 242 | }, 243 | "source": [ 244 | "# Eigen Values and Eigen Vectors" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "1aba7e1e", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "#### Equation : \n", 257 | "$\\Large \\lambda_1.v_1 = S.v_1$\n", 258 | "\n", 259 | "where:\n", 260 | "\n", 261 | "$\\lambda_1 : eigen value$\n", 262 | ", $v_1 : eigen vector$\n", 263 | ", $S : Covariance Matrix$" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "0f2e497f", 269 | "metadata": { 270 | "slideshow": { 271 | "slide_type": "slide" 272 | } 273 | }, 274 | "source": [ 275 | "# PCA Summary\n", 276 | "\n", 277 | "1. Column Standardise\n", 278 | "2. $ \\large S = \\frac{X^T.X}{m}$\n", 279 | "3. compute eigen values & eigenvectors of S.
\n", 280 | " $eigen(S) = (\\lambda_1 \\geq \\lambda_2 \\geq \\lambda_3 ... \\geq \\lambda_n$)\n", 281 | "and \n", 282 | "( $v_1, v_2, v_3 ... v_n$ )\n", 283 | "4. $u_1 = v_1$ and $u_2 = v_2 ...$" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "2d0c9050", 289 | "metadata": { 290 | "slideshow": { 291 | "slide_type": "slide" 292 | } 293 | }, 294 | "source": [ 295 | "# Understanding Eigen-values" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "celltoolbar": "Slideshow", 301 | "kernelspec": { 302 | "display_name": "Python 3", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.8.8" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 5 321 | } 322 | -------------------------------------------------------------------------------- /04 PCA/PCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 104, 6 | "id": "78ddcaf5", 7 | "metadata": { 8 | "slideshow": { 9 | "slide_type": "skip" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "# jupyter nbconvert ./PCA.ipynb --to slides --post serve" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "9d341bed", 20 | "metadata": { 21 | "slideshow": { 22 | "slide_type": "slide" 23 | } 24 | }, 25 | "source": [ 26 | "# What is PCA?\n", 27 | "- PCA is a dimensionality reduction technique.\n", 28 | "- It is a Feature Extraction algorithm, not a feature selection.\n", 29 | "\n", 30 | " convert n features to k, where k" 117 | ] 118 | }, 119 | "metadata": { 120 | "needs_background": "light" 121 | }, 122 | "output_type": "display_data" 123 | } 124 | ], 125 | "source": [ 126 | "\n", 127 | 
"x, u = [0,0.4],[0,0.4]\n", 128 | "plt.scatter(X, y)\n", 129 | "plt.xlabel(\"Age\")\n", 130 | "plt.ylabel(\"Salaries\")\n", 131 | "plt.arrow(0, 0, 0.5, 0.5, width = 0.05, color='red')\n", 132 | "# plt.show()\n", 133 | "plt.savefig(\"./imgs/frame_with_dir.jpg\")" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "5181b00d", 139 | "metadata": { 140 | "slideshow": { 141 | "slide_type": "subslide" 142 | } 143 | }, 144 | "source": [ 145 | "**Now 2D data looks like this. Which feature would you choose?**\n", 146 | "\n", 147 | "\n", 148 | "\n", 149 | "\n", 150 | "**Idea:**\n", 151 | "1. Find the directions $f_1^{'}$ and $f_2^{'}$. such that spread on $f_1^{'}$ >> spread on $f_2^{'}$\n", 152 | "2. Drop $f_2^{'}$\n", 153 | "3. Project all data points onto $f_1^{'}$\n", 154 | "\n", 155 | " - Thus 2D -> 1D\n", 156 | " - It is like the rotation of axis\n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | "
\n", 161 | "\n", 162 | "**Note:** \n", 163 | "Find a direction such that the variance of $x^{i}$ projected onto the direction is maximized." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "2a55c5cd", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "slide" 172 | } 173 | }, 174 | "source": [ 175 | "# PCA Objective - Maximizing Variance" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "4ea98365", 181 | "metadata": { 182 | "slideshow": { 183 | "slide_type": "subslide" 184 | } 185 | }, 186 | "source": [ 187 | "\n", 188 | "\n", 189 | "$\\Large u_1 : unit vector $, $\\large ||u_1|| = 1$\n", 190 | "\n", 191 | "\n", 192 | "\n", 193 | "\n", 194 | "$\\huge x_i^{'} = proj_{u1} (x_i) $\n", 195 | "\n", 196 | "$\\huge x_i^{'} = \\frac{u_1.x_i}{||u_1||} $\n", 197 | "\n", 198 | "$\\huge x_i^{'} = u_1^T.x_i $\n", 199 | "\n", 200 | "
\n", 201 | "Find $u_1$ such that var{$proj_{u1} (x_i)$} is maximum." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "c8c2b508", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "slide" 210 | } 211 | }, 212 | "source": [ 213 | "# PCA Objective - Minimizing Distances" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "5305afc2", 219 | "metadata": { 220 | "slideshow": { 221 | "slide_type": "subslide" 222 | } 223 | }, 224 | "source": [ 225 | "\n", 226 | "\n", 227 | "$\\Large min_{u_1} \\sum_{i=1}^{m} {d_i^2} $ \n", 228 | "\n", 229 | "\n", 230 | "\n", 231 | "
\n", 232 | "Find $u_1$ such that the sum of the squared distances from each point to $u_1$ is minimized." 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "0d911785", 238 | "metadata": { 239 | "slideshow": { 240 | "slide_type": "slide" 241 | } 242 | }, 243 | "source": [ 244 | "# Eigen Values and Eigen Vectors" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "1aba7e1e", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "subslide" 253 | } 254 | }, 255 | "source": [ 256 | "#### Equation : \n", 257 | "$\\Large \\lambda_1.v_1 = S.v_1$\n", 258 | "\n", 259 | "where:\n", 260 | "\n", 261 | "$\\lambda_1 : eigen value$\n", 262 | ", $v_1 : eigen vector$\n", 263 | ", $S : Covariance Matrix$" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "id": "0f2e497f", 269 | "metadata": { 270 | "slideshow": { 271 | "slide_type": "slide" 272 | } 273 | }, 274 | "source": [ 275 | "# PCA Summary\n", 276 | "\n", 277 | "1. Column Standardise\n", 278 | "2. $ \\large S = \\frac{X^T.X}{m}$\n", 279 | "3. compute eigen values & eigenvectors of S.
\n", 280 | " $eigen(S) = (\\lambda_1 \\geq \\lambda_2 \\geq \\lambda_3 ... \\geq \\lambda_n$)\n", 281 | "and \n", 282 | "( $v_1, v_2, v_3 ... v_n$ )\n", 283 | "4. $u_1 = v_1$ and $u_2 = v_2 ...$" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "2d0c9050", 289 | "metadata": { 290 | "slideshow": { 291 | "slide_type": "slide" 292 | } 293 | }, 294 | "source": [ 295 | "# Understanding Eigen-values" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "celltoolbar": "Slideshow", 301 | "kernelspec": { 302 | "display_name": "Python 3", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.8.8" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 5 321 | } 322 | -------------------------------------------------------------------------------- /06 K-Means/K_Means.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "id": "Tx7p2psca6Bm" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# K-Means is an example of Hard Clustering, where every point belongs only to one cluster." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "source": [ 31 | "import numpy as np\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "from sklearn.datasets import make_blobs" 34 | ], 35 | "metadata": { 36 | "id": "DkDRsz8cbRut" 37 | }, 38 | "execution_count": null, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "source": [ 44 | "X,y = make_blobs(n_samples=500,n_features=2,centers=5,random_state=3)" 45 | ], 46 | "metadata": { 47 | "id": "u9BrV9JDbT6f" 48 | }, 49 | "execution_count": null, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "source": [ 55 | "print(X.shape)" 56 | ], 57 | "metadata": { 58 | "colab": { 59 | "base_uri": "https://localhost:8080/" 60 | }, 61 | "id": "a_uX70OybaJ2", 62 | "outputId": "89278d43-beb3-4ef2-ca57-623c2373d034" 63 | }, 64 | "execution_count": null, 65 | "outputs": [ 66 | { 67 | "output_type": "stream", 68 | "name": "stdout", 69 | "text": [ 70 | "(500, 2)\n" 71 | ] 72 | } 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "source": [ 78 | "# not really needed\n", 79 | "np.unique(y)" 80 | ], 81 | "metadata": { 82 | "colab": { 83 | "base_uri": "https://localhost:8080/" 84 | }, 85 | "id": "FneaG_D2bdph", 86 | "outputId": "bdf08f81-0e07-415f-eabd-41c639472f37" 87 | }, 88 | "execution_count": null, 89 | "outputs": [ 90 | { 91 | "output_type": "execute_result", 92 | "data": { 93 | "text/plain": [ 94 | "array([0, 1, 2, 3, 4])" 95 | ] 96 | }, 97 | "metadata": {}, 98 | "execution_count": 5 99 | } 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "source": [ 105 | "# Data Visualise \n", 106 | "def normalise(X):\n", 107 | " u = X.mean(axis=0)\n", 108 | " std = X.std(axis=0)\n", 109 | " return (X-u)/std\n", 110 | "\n", 111 | "X = normalise(X)\n", 112 | "\n", 113 | "plt.scatter(X[:,0], X[:,1])\n", 114 | "plt.show()" 115 | ], 116 | "metadata": { 117 | "colab": { 118 | "base_uri": "https://localhost:8080/", 119 | "height": 265 120 | }, 121 | "id": "Y0cy--vRbgmF", 122 | "outputId": 
"c0fd9799-cf06-4007-a4b5-1dc11a2a0092" 123 | }, 124 | "execution_count": null, 125 | "outputs": [ 126 | { 127 | "output_type": "display_data", 128 | "data": { 129 | "text/plain": [ 130 | "
" 131 | ], 132 | "image/png": "\n" 133 | }, 134 | "metadata": { 135 | "needs_background": "light" 136 | } 137 | } 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "source": [ 143 | "# Init the k-centers for k clusters \n", 144 | "\n", 145 | "k = 5\n", 146 | "colors = [\"green\",\"red\",\"blue\",\"yellow\",\"orange\"]\n", 147 | "n_features = 2\n", 148 | "\n", 149 | "def init(k):\n", 150 | " centroids = {}\n", 151 | " for i in range(k):\n", 152 | " center = 2*(2*np.random.random((n_features,)) - 1)\n", 153 | " \n", 154 | " centroids[i] = {\n", 155 | " 'center' : center,\n", 156 | " 'color' : colors[i],\n", 157 | " 'points' : []\n", 158 | " }\n", 159 | " return centroids\n", 160 | "\n", 161 | "centroids = init(k)" 162 | ], 163 | "metadata": { 164 | "id": "wWjVf-Bfbw0r" 165 | }, 166 | "execution_count": null, 167 | "outputs": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "source": [ 172 | "def distance(p,q):\n", 173 | " return np.sqrt(np.sum((p-q)**2))\n", 174 | "\n", 175 | "# Step 2a\n", 176 | "def assignPtsToCluster(X,centroids):\n", 177 | " m = X.shape[0]\n", 178 | "\n", 179 | " # each point will be assigned to exactly one of the clusters \n", 180 | " for i in range(m):\n", 181 | " cdist = []\n", 182 | " cx = X[i]\n", 183 | " # find out distance of pt from each centroid\n", 184 | " for kx in range(k):\n", 185 | " d = distance(centroids[kx]['center'],cx)\n", 186 | " cdist.append(d)\n", 187 | "\n", 188 | " clusterId = np.argmin(cdist)\n", 189 | " #assign the point to the list of points that current_cluster holds\n", 190 | " centroids[clusterId]['points'].append(cx) " 191 | ], 192 | "metadata": { 193 | "id": "SGz40JEdevuG" 194 | }, 195 | "execution_count": null, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "# Step 2b\n", 202 | "def updateClusters(centroids):\n", 203 | "\n", 204 | " # Update Every Centroid by taking a mean of points assigned to the cluster \n", 205 | " for kx in range(k):\n", 206 | " pts = 
np.array(centroids[kx]['points'])\n", 207 | "\n", 208 | " # if a cluster has non-zero points\n", 209 | " if pts.shape[0] > 0: \n", 210 | " newCenter = pts.mean(axis=0)\n", 211 | " centroids[kx]['center'] = newCenter \n", 212 | " centroids[kx]['points'] = [] #Clear the list for step 2a " 213 | ], 214 | "metadata": { 215 | "id": "8jpodvOxfSmY" 216 | }, 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "source": [ 223 | "def plotClusters(centroids):\n", 224 | "\n", 225 | " for kx in range(k):\n", 226 | " pts = np.array(centroids[kx]['points'])\n", 227 | "\n", 228 | " # Plot the Points\n", 229 | " if(pts.shape[0]>0):\n", 230 | " plt.scatter(pts[:,0],pts[:,1],color=centroids[kx]['color'])\n", 231 | "\n", 232 | " # Plot the Cluster Center (Centroid)\n", 233 | " uk = centroids[kx]['center']\n", 234 | " plt.scatter(uk[0],uk[1],color='black',marker='*')" 235 | ], 236 | "metadata": { 237 | "id": "mzgL2qBjuYbU" 238 | }, 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "source": [ 245 | "centroids = init(k)\n", 246 | "assignPtsToCluster(X,centroids)\n", 247 | "plotClusters(centroids)" 248 | ], 249 | "metadata": { 250 | "colab": { 251 | "base_uri": "https://localhost:8080/", 252 | "height": 265 253 | }, 254 | "id": "ObCJ5ZrHuZep", 255 | "outputId": "31334d3a-aac5-40d9-b81c-0af838c14018" 256 | }, 257 | "execution_count": null, 258 | "outputs": [ 259 | { 260 | "output_type": "display_data", 261 | "data": { 262 | "text/plain": [ 263 | "
" 264 | ], 265 | "image/png": "\n" 266 | }, 267 | "metadata": { 268 | "needs_background": "light" 269 | } 270 | } 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "source": [ 276 | "assignPtsToCluster(X,centroids)\n", 277 | "plotClusters(centroids)\n", 278 | "updateClusters(centroids)" 279 | ], 280 | "metadata": { 281 | "colab": { 282 | "base_uri": "https://localhost:8080/", 283 | "height": 265 284 | }, 285 | "id": "MSEoYh8H2O3T", 286 | "outputId": "2a5193b2-a1ba-4c69-ea39-0ef0bd88f541" 287 | }, 288 | "execution_count": null, 289 | "outputs": [ 290 | { 291 | "output_type": "display_data", 292 | "data": { 293 | "text/plain": [ 294 | "
" 295 | ], 296 | "image/png": "\n" 297 | }, 298 | "metadata": { 299 | "needs_background": "light" 300 | } 301 | } 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "source": [], 307 | "metadata": { 308 | "id": "nh1wK-ex2lKX" 309 | }, 310 | "execution_count": null, 311 | "outputs": [] 312 | } 313 | ] 314 | } -------------------------------------------------------------------------------- /07 Naive Bayes/Naive Bayes For Text Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b742be27", 6 | "metadata": {}, 7 | "source": [ 8 | "# Naive Bayes Classifier for Text Data\n", 9 | "\n", 10 | " " 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "0df6d2a8", 16 | "metadata": {}, 17 | "source": [ 18 | "#### Multinomial Naive Bayes\n", 19 | "- Important is to compute the likelihood \n", 20 | "\n", 21 | "$$P(x_i|Y_i = c) = \\frac {count(x_i, Y_i = c)} {\\sum_{w \\in V}{count(w, Y_i=c)}} $$\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "2e1977b2", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "bc032f5a", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "4bdc9be2", 43 | "metadata": {}, 44 | "source": [ 45 | "# Laplace Smoothing" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "187a8d0f", 51 | "metadata": {}, 52 | "source": [ 53 | "" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "c08e3ef6", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "c181211a", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "ce330c2d", 75 | "metadata": {}, 
76 | "source": [ 77 | "$$P(x_i|Y_i = c) = \\frac {count(x_i, Y_i = c) + \\alpha} {\\sum_{w \\in V}{count(w, Y_i=c)} + \\alpha |V|} $$" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "2065f18a", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "c755295b", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "ad0fe83b", 99 | "metadata": {}, 100 | "source": [ 101 | "# A Practical Example of Multinomial Naive Bayes" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "8c0ad33f", 107 | "metadata": {}, 108 | "source": [ 109 | "\n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | "
docIDwords in documentc = China?
training set 1 Chinese Beijing Chinese yes
2 Chinese Chinese Shanghai yes
3 Chinese Macao yes
4 Tokyo Japan Chinese no
test set 5 Chinese Chinese Chinese Tokyo Japan ?
" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "71f0b1eb", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "7c419b85", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "2226f872", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "5a68e1bd", 173 | "metadata": {}, 174 | "source": [ 175 | "# Bernoulli Naive Bayes\n", 176 | "\n", 177 | "\n", 178 | "

\n", 179 | "\n", 180 | "1. Bernoulli doesn't talk about the frequency of a feature/word.\n", 181 | "1. It is only concerned about whether a word is present or not (1 or 0)." 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "f75c721d", 187 | "metadata": {}, 188 | "source": [ 189 | "**Likelihood :**" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "id": "31cc7a35", 195 | "metadata": {}, 196 | "source": [ 197 | "$$P(x_i|Y_i = c) = \\frac {count(d_i \\; contains \\; x_i, Y_i = c) + \\alpha} {{count(Y_i=c)} + 2.\\alpha \\, } $$" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "692c078a", 203 | "metadata": {}, 204 | "source": [ 205 | "**Prediction:**" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "id": "4afee9a0", 211 | "metadata": {}, 212 | "source": [ 213 | "$$P(Y=1|X) \\propto \\prod_{i=1}^{|V|} { P(x_i|Y=1)^b . \\big(1 - P(x_i|Y=1)\\big)^{1-b}} * P(Y=1)$$" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "80f60524", 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "b1f01fce", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "45b21dcc", 235 | "metadata": {}, 236 | "source": [ 237 | "## Example of Bernoulli Naive Bayes" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "id": "afb80f67", 243 | "metadata": {}, 244 | "source": [ 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
docIDwords in documentc = China?
training set 1 Chinese Beijing Chinese yes
2 Chinese Chinese Shanghai yes
3 Chinese Macao yes
4 Tokyo Japan Chinese no
test set 5 Chinese Chinese Chinese Tokyo Japan ?
" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "fb283260", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "17186f98", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "5bd8ea93", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "13d133ef", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "id": "5c334eda", 317 | "metadata": {}, 318 | "source": [ 319 | "# Bias Variance Tradeoff " 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "030837bf", 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "ca46205a", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "id": "3b3cfcd7", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "25edd7cf", 349 | "metadata": {}, 350 | "source": [ 351 | "# Gaussian Naive Bayes\n", 352 | "" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "id": "1414e781", 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "id": "32ab6ad1", 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "26944990", 374 | "metadata": {}, 375 | "source": [ 376 | "# Scikit Learn code for Naive Bayes" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | 
"execution_count": 5, 382 | "id": "99b564b7", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "from sklearn.datasets import load_digits\n", 387 | "import matplotlib.pyplot as plt" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 2, 393 | "id": "9c9a8f98", 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "digits = load_digits()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 4, 403 | "id": "5e687e3e", 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "X = digits.data\n", 408 | "y = digits.target" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 6, 414 | "id": "67018a91", 415 | "metadata": {}, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "(1797, 64)" 421 | ] 422 | }, 423 | "execution_count": 6, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "X.shape" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 7, 435 | "id": "6cd49e35", 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "(1797,)" 442 | ] 443 | }, 444 | "execution_count": 7, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "y.shape" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 12, 456 | "id": "8591ce65", 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "" 463 | ] 464 | }, 465 | "execution_count": 12, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | }, 469 | { 470 | "data": { 471 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAPUAAAD4CAYAAAA0L6C7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAK40lEQVR4nO3d34tc9RnH8c+nidL4KwutDZINyQoSkEI2EgISMGlsS6xictGLBBQaCrlS1BZEe5d/QNKLIizRNGCqtFGDiNUKGqzQWpO4NsaNJQ1bso02Som/Cl2iTy92UqJdu2dmzvme2SfvFyzuj2G/z5i8c87OzpyvI0IA8vha2wMAqBdRA8kQNZAMUQPJEDWQzMImvqntYg+pL168uNRSWrJkSbG1JGnhwkb+eGa1aNGiYmuVdPTo0aLrTU9PF1srIjzb58v9rWnIhg0biq113333FVtLkoaGhoqttWrVqmJrlTQyMlJ0vcnJyaLrzYbTbyAZogaSIWogGaIGkiFqIBmiBpIhaiAZogaSIWogmUpR295k+x3bJ2w/0PRQAHo3Z9S2F0j6haRbJF0vaZvt65seDEBvqhyp10o6EREnI2Ja0hOSNjc7FoBeVYl6qaRTF3w81fncF9jeYfuQ7UN1DQege1VepTXby7v+56WVETEmaUwq+9JLAF9U5Ug9JWnZBR8PSzrdzDgA+lUl6tclXWd7xPalkrZKeqbZsQD0as7T74g4Z/suSS9IWiDp0Yg41vhkAHpS6conEfGcpOcangVADXhGGZAMUQPJEDWQDFEDyRA1kAxRA8kQNZDMvN+hY/v27cXWWr9+fbG1JOnDDz8sttbOnTuLrXXw4MFiaw3CjhmlcaQGkiFqIBmiBpIhaiAZogaSIWogGaIGkiFqIBmiBpIhaiCZKjt0PGr7jO23SgwEoD9VjtS/lLSp4TkA1GTOqCPiFUn/LDALgBrU9iot2zsk7ajr+wHoTW1Rs+0OMBh49BtIhqiBZKr8SutxSX+QtNL2lO0fNz8WgF5V2UtrW4lBANSD028gGaIGkiFqIBmiBpIhaiAZogaSIWogmXm/7c74+HixtUZHR4utJZW9b7t27Sq21tmzZ4utdTHiSA0kQ9RAMkQNJEPUQDJEDSRD1EAyRA0kQ9RAMkQNJEPUQDJVrlG2zPbLtidsH7N9T4nBAPSmynO/z0n6aUQcsX2lpMO2X4yItxueDUAPqmy7825EHOm8/7GkCUlLmx4MQG+6epWW7RWSVkt6bZavse0OMAAqR237CklPSro3Ij768tfZdgcYDJUe/bZ9iWaC3hcRTzU7EoB+VHn025IekTQREQ81PxKAflQ5Uq+TdKekjbbHO28/aHguAD2qsu3Oq5JcYBYANeAZZUAyRA0kQ9RAMkQNJEPUQDJEDSRD1EAyRA0kM+/30ipp+fLladcruW/XihUriq11MeJIDSRD1EAyRA0kQ9RAMkQNJEPUQDJEDSRD1EAyRA0kU+XCg1+3/Sfbb3a23dlZYjAAvanyNNF/S9oYEZ90LhX8qu3fRsQfG54NQA+qXHgwJH3S+fCSzhsX6wcGVNWL+S+wPS7pjKQXI2LWbXdsH7J9qOYZAXShUtQR8VlEjEoalrTW9rdnuc1YRKyJiDU1zwigC109+h0RZyUdlLSpiWEA9K/Ko99X2x7qvL9I0nclHW94LgA9qvLo9zWS9tpeoJl/BH4dEc82OxaAXlV59PvPmtmTGsA8wDPKgGSIGkiGqIFkiBpIhqiBZIgaSIaogWSIGkjGM6+srPmb2sVemjk0NFRqKW3ZsqXYWqXt2bOn2Fq2i62VWUTM+j+SIzWQDFEDyRA1kAxRA8kQNZAMUQPJEDWQDFEDyRA1kAxRA8lUjrpzQf83bHPRQWCAdXOkvkfSRFODAKhH1W13hiXdKml3s+MA6FfVI/UuSfdL+vyrbsBeWsBgqLJDx22SzkTE4f93O/bSAgZDlSP1Okm3256U9ISkjbYfa3QqAD2bM+q
IeDAihiNihaStkl6KiDsanwxAT/g9NZBMlQ3y/isiDmpmK1sAA4ojNZAMUQPJEDWQDFEDyRA1kAxRA8kQNZDMvN92J7PNmzcXW+vAgQPF1lq9enWxtcbHx4utVRrb7gAXCaIGkiFqIBmiBpIhaiAZogaSIWogGaIGkiFqIBmiBpKpdDmjzpVEP5b0maRzXAYYGFzdXKPsOxHxQWOTAKgFp99AMlWjDkm/s33Y9o7ZbsC2O8BgqHr6vS4iTtv+lqQXbR+PiFcuvEFEjEkak3jpJdCmSkfqiDjd+e8ZSU9LWtvkUAB6V2WDvMttX3n+fUnfl/RW04MB6E2V0+8lkp62ff72v4qI5xudCkDP5ow6Ik5KWlVgFgA14FdaQDJEDSRD1EAyRA0kQ9RAMkQNJEPUQDLzftudoaGhUktp/fr1xdaSpL179xZba3Jysthao6OjxdbKjG13gIsEUQPJEDWQDFEDyRA1kAxRA8kQNZAMUQPJEDWQDFEDyVSK2vaQ7f22j9uesH1j04MB6E3V637/XNLzEfFD25dKuqzBmQD0Yc6obV8l6SZJP5KkiJiWNN3sWAB6VeX0+1pJ70vaY/sN27s71//+ArbdAQZDlagXSrpB0sMRsVrSp5Ie+PKNImIsItawzS3QripRT0maiojXOh/v10zkAAbQnFFHxHuSTtle2fnUzZLebnQqAD2r+uj33ZL2dR75Pilpe3MjAehHpagjYlwSPysD8wDPKAOSIWogGaIGkiFqIBmiBpIhaiAZogaSIWogmXm/l9aGDRtKLaUDBw4UW0uSFi9eXGytkZGRYmuV3LcrM/bSAi4SRA0kQ9RAMkQNJEPUQDJEDSRD1EAyRA0kQ9RAMnNGbXul7fEL3j6yfW+B2QD0YM5rlEXEO5JGJcn2Akl/l/R0s2MB6FW3p983S/prRPytiWEA9K/qJYLP2yrp8dm+YHuHpB19TwSgL5WP1J1rft8u6TezfZ1td4DB0M3p9y2SjkTEP5oaBkD/uol6m77i1BvA4KgUte3LJH1P0lPNjgOgX1W33fmXpG80PAuAGvCMMiAZogaSIWogGaIGkiFqIBmiBpIhaiAZogaSaWrbnfcldfvyzG9K+qD2YQZD1vvG/WrP8oi4erYvNBJ1L2wfyvoKr6z3jfs1mDj9BpIhaiCZQYp6rO0BGpT1vnG/BtDA/EwNoB6DdKQGUAOiBpIZiKhtb7L9ju0Tth9oe5462F5m+2XbE7aP2b6n7ZnqZHuB7TdsP9v2LHWyPWR7v+3jnT+7G9ueqVut/0zd2SDgL5q5XNKUpNclbYuIt1sdrE+2r5F0TUQcsX2lpMOStsz3+3We7Z9IWiPpqoi4re156mJ7r6TfR8TuzhV0L4uIsy2P1ZVBOFKvlXQiIk5GxLSkJyRtbnmmvkXEuxFxpPP+x5ImJC1td6p62B6WdKuk3W3PUifbV0m6SdIjkhQR0/MtaGkwol4q6dQFH08pyV/+82yvkLRa0mstj1KXXZLul/R5y3PU7VpJ70va0/nRYrfty9seqluDELVn+Vya37PZvkLSk5LujYiP2p6nX7Zvk3QmIg63PUsDFkq6QdLDEbFa0qeS5t1jPIMQ9ZSkZRd8PCzpdEuz1Mr2JZoJel9EZLm88jpJt9ue1MyPShttP9buSLWZkjQVEefPqPZrJvJ5ZRCifl3SdbZHOg9MbJX0TMsz9c22NfOz2UREPNT2PHWJiAcjYjgiVmjmz+qliLij5bFqERHvSTple2XnUzdLmncPbHa7QV7tIuKc7bskvSBpgaRHI+JYy2PVYZ2kOyUdtT3e+dzPIuK59kZCBXdL2tc5wJyUtL3lebrW+q+0ANRrEE6/AdSIqIFkiBpIhqiBZIgaSIaogWSIGkjmP0xwkqC5cHyvAAAAAElFTkSuQmCC\n", 472 | "text/plain": [ 473 | "
" 474 | ] 475 | }, 476 | "metadata": { 477 | "needs_background": "light" 478 | }, 479 | "output_type": "display_data" 480 | } 481 | ], 482 | "source": [ 483 | "plt.imshow(X[15].reshape(8,8), cmap=\"gray\")" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 13, 489 | "id": "b9bf9de7", 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "from sklearn.model_selection import train_test_split" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 14, 499 | "id": "5a19f993", 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 15, 509 | "id": "7355a199", 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 16, 519 | "id": "1bca7e97", 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "mnb = MultinomialNB()\n", 524 | "bnb = BernoulliNB()\n", 525 | "gnb = GaussianNB()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 17, 531 | "id": "c6bb7ea0", 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "GaussianNB()" 538 | ] 539 | }, 540 | "execution_count": 17, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "mnb.fit(X_train, y_train)\n", 547 | "bnb.fit(X_train, y_train)\n", 548 | "gnb.fit(X_train, y_train)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 18, 554 | "id": "bd695746", 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "0.9111111111111111" 561 | ] 562 | }, 563 | "execution_count": 18, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 
| ], 568 | "source": [ 569 | "mnb.score(X_test, y_test)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 19, 575 | "id": "da6a9bfd", 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/plain": [ 581 | "0.8638888888888889" 582 | ] 583 | }, 584 | "execution_count": 19, 585 | "metadata": {}, 586 | "output_type": "execute_result" 587 | } 588 | ], 589 | "source": [ 590 | "bnb.score(X_test, y_test)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 20, 596 | "id": "182d7588", 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "0.8472222222222222" 603 | ] 604 | }, 605 | "execution_count": 20, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "gnb.score(X_test, y_test)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "id": "34bf7f7d", 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "id": "bfe95c1e", 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "12e45472", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "id": "fdcd1fd6", 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "id": "97f1706e", 650 | "metadata": {}, 651 | "outputs": [], 652 | "source": [] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "id": "a9d49ec3", 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "id": "b72a931f", 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [] 669 | }, 670 | { 
671 | "cell_type": "code", 672 | "execution_count": null, 673 | "id": "da356ad2", 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "id": "90610e4b", 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "185c9a25", 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "id": "fcd33768", 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "72355b8f", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "id": "21f98f20", 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "id": "50dc6ea5", 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "id": "29ca6faa", 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [] 733 | } 734 | ], 735 | "metadata": { 736 | "kernelspec": { 737 | "display_name": "Python 3", 738 | "language": "python", 739 | "name": "python3" 740 | }, 741 | "language_info": { 742 | "codemirror_mode": { 743 | "name": "ipython", 744 | "version": 3 745 | }, 746 | "file_extension": ".py", 747 | "mimetype": "text/x-python", 748 | "name": "python", 749 | "nbconvert_exporter": "python", 750 | "pygments_lexer": "ipython3", 751 | "version": "3.8.8" 752 | } 753 | }, 754 | "nbformat": 4, 755 | "nbformat_minor": 5 756 | } 757 | -------------------------------------------------------------------------------- /08 Decision Trees/Decision Trees.ipynb: -------------------------------------------------------------------------------- 
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "46e436b7", 6 | "metadata": {}, 7 | "source": [ 8 | "# Decision Trees\n", 9 | "\n", 10 | "\n", 11 | "\n", 12 | "1. Decision trees are supervised models.\n", 13 | "1. Can be used for both classification and regression tasks.\n", 14 | "1. They have a simple tree-like structure (hierarchical in nature).\n", 15 | "1. Decision trees can be thought of as nested if-else conditions. \n", 16 | "1. Highly interpretable models, easy to explain the workings.\n", 17 | "\n", 18 | "

\n", 19 | "\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "6f7b3ccb", 25 | "metadata": {}, 26 | "source": [ 27 | "# Why are Decision Trees popular?\n", 28 | "1. Easy to interpret and represent.\n", 29 | "1. Mimic human-level thought: they try to make decisions the way a human does.\n", 30 | "1. Ensemble models are made up of Decision Trees and perform even better than individual Decision Trees.\n", 31 | "1. When features are categorical, Decision Trees are preferred over other models. " 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "5b0432f0", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "1830b8ea", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "dfd919e5", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "6b6c6815", 61 | "metadata": {}, 62 | "source": [ 63 | "# Decision Tree Example\n", 64 | "\n", 65 | "Q- Solve a business problem: predict whether a user will click on an advertisement for a car or not.\n", 66 | "\n", 67 | "\n", 68 | "\n", 69 | "

\n", 70 | "\n", 71 | "| Sex | Income | alreadyCar | Techy | Age |\n", 72 | "| ----------- | ----------- | ----------- | ----------- | ----------- |\n", 73 | "| M | <=5L | Yes | Yes | 18-25\n", 74 | "| F | 5L-15L | No | No | 25-50\n", 75 | "| | 15L+ | | | 50+\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "8bed2cfd", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "0137ced1", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "bb80770c", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "2b6e90e7", 105 | "metadata": {}, 106 | "source": [ 107 | "
\n", 108 | "\n", 109 | " \n", 110 | " Entropy is a measure of the randomness of a system.\n", 111 | " \n", 112 | " \n", 113 | "
" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "dbf28fb7", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "7f786299", 127 | "metadata": {}, 128 | "source": [ 129 | "# CODE : Entropy " 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 1, 135 | "id": "1dc24a72", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "import numpy as np" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 2, 145 | "id": "224d00cf", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "def entropy(var):\n", 150 | " N = var.shape[0]\n", 151 | " values, counts = np.unique(var, return_counts=True )\n", 152 | " \n", 153 | " ent = 0.0\n", 154 | " \n", 155 | " for i in counts:\n", 156 | " p = i/N\n", 157 | " ent += (p * np.log2(p))\n", 158 | " \n", 159 | " return -ent" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 3, 165 | "id": "635e0e79", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "Y = np.array([1,0,0,1,0,1,0,1])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 4, 175 | "id": "28867b1b", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "X = np.array([1,1,1,1,1,1,1])" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 5, 185 | "id": "082b5ea3", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "1.0" 192 | ] 193 | }, 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "entropy(Y)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "id": "45553195", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "-0.0" 213 | ] 214 | }, 215 | "execution_count": 6, 216 | "metadata": {}, 
217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "entropy(X)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "83dfef6c", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "fedecfd8", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "id": "cf5be31d", 243 | "metadata": {}, 244 | "source": [ 245 | "# Information Gain" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "id": "5d2b9507", 251 | "metadata": {}, 252 | "source": [ 253 | "\n", 254 | "\n", 255 | "\n", 256 | "

\n", 257 | "$$ Gain(Y, A) = H_D(Y) - \\sum_{i=1}^{k} \\frac{|D_i|} {|D|}{ H_{D_i}(Y)} $$" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "id": "c182c047", 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "5d8b3a16", 271 | "metadata": {}, 272 | "source": [ 273 | "# Golf Dataset\n", 274 | "\n", 275 | "" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "5bd3b92b", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "a20595cd", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "id": "a46840bc", 297 | "metadata": {}, 298 | "source": [ 299 | "# CODE : Split Data" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 7, 305 | "id": "d99f49fb", 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "import pandas as pd" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 8, 315 | "id": "f4c23622", 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "df = pd.read_csv('golf.csv')" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 9, 325 | "id": "54c8abba", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/html": [ 331 | "
\n", 332 | "\n", 345 | "\n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | "
OutlookTemperatureHumidityWindyPlay
0sunnyhothighFalseno
1sunnyhothighTrueno
2overcasthothighFalseyes
3rainymildhighFalseyes
4rainycoolnormalFalseyes
\n", 399 | "
" 400 | ], 401 | "text/plain": [ 402 | " Outlook Temperature Humidity Windy Play\n", 403 | "0 sunny hot high False no\n", 404 | "1 sunny hot high True no\n", 405 | "2 overcast hot high False yes\n", 406 | "3 rainy mild high False yes\n", 407 | "4 rainy cool normal False yes" 408 | ] 409 | }, 410 | "execution_count": 9, 411 | "metadata": {}, 412 | "output_type": "execute_result" 413 | } 414 | ], 415 | "source": [ 416 | "df.head()" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 10, 422 | "id": "a4478e6e", 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "def divide_data(data, feature):\n", 427 | " # here we are working data frames.\n", 428 | " \n", 429 | " DATA = {}\n", 430 | " \n", 431 | " feat_values = list(data[feature].value_counts().index)\n", 432 | " occurence = list(data[feature].value_counts())\n", 433 | " \n", 434 | " for val in feat_values:\n", 435 | " DATA[val] = {'data' : pd.DataFrame([], columns = data.columns), 'len': 0}\n", 436 | " \n", 437 | " \n", 438 | " for ix in range(data.shape[0]):\n", 439 | " val = data[feature].iloc[ix]\n", 440 | " \n", 441 | " DATA[val]['data'] = DATA[val]['data'].append(data.iloc[ix])\n", 442 | " \n", 443 | " idx = feat_values.index(val)\n", 444 | " DATA[val]['len'] = occurence[idx]\n", 445 | " \n", 446 | " return DATA" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 11, 452 | "id": "68913b46", 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "{'sunny': {'data': Outlook Temperature Humidity Windy Play\n", 459 | " 0 sunny hot high False no\n", 460 | " 1 sunny hot high True no\n", 461 | " 7 sunny mild high False no\n", 462 | " 8 sunny cool normal False yes\n", 463 | " 10 sunny mild normal True yes,\n", 464 | " 'len': 5},\n", 465 | " 'rainy': {'data': Outlook Temperature Humidity Windy Play\n", 466 | " 3 rainy mild high False yes\n", 467 | " 4 rainy cool normal False yes\n", 468 | " 5 rainy cool normal True 
no\n", 469 | " 9 rainy mild normal False yes\n", 470 | " 13 rainy mild high True no,\n", 471 | " 'len': 5},\n", 472 | " 'overcast': {'data': Outlook Temperature Humidity Windy Play\n", 473 | " 2 overcast hot high False yes\n", 474 | " 6 overcast cool normal True yes\n", 475 | " 11 overcast mild high True yes\n", 476 | " 12 overcast hot normal False yes,\n", 477 | " 'len': 4}}" 478 | ] 479 | }, 480 | "execution_count": 11, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "divide_data(df, 'Outlook')" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "id": "5fcc80fd", 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "id": "b63a4ba2", 500 | "metadata": {}, 501 | "source": [ 502 | "# CODE : Information Gain" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "id": "80762a03", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 12, 516 | "id": "6d099d2d", 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "def information_gain(data, feature):\n", 521 | " examples = data.shape[0]\n", 522 | " \n", 523 | " DATA = divide_data(data, feature)\n", 524 | " \n", 525 | " keys = DATA.keys()\n", 526 | " \n", 527 | " \n", 528 | " ent_of_children = 0.0\n", 529 | " \n", 530 | " for key in keys:\n", 531 | " ent_of_children += ( (DATA[key]['len']/examples) * entropy(DATA[key]['data']['Play']) )\n", 532 | " \n", 533 | " info_gain = entropy(data['Play']) - ent_of_children\n", 534 | " return info_gain" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 13, 540 | "id": "a4b2e0fd", 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "0.24674981977443933" 547 | ] 548 | }, 549 | "execution_count": 13, 550 | "metadata": {}, 551 | "output_type": 
"execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "information_gain(df, 'Outlook')" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 14, 561 | "id": "fe76c1f8", 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "text/plain": [ 567 | "0.04812703040826949" 568 | ] 569 | }, 570 | "execution_count": 14, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | } 574 | ], 575 | "source": [ 576 | "information_gain(df, 'Windy')" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 15, 582 | "id": "cc7ec047", 583 | "metadata": {}, 584 | "outputs": [ 585 | { 586 | "data": { 587 | "text/plain": [ 588 | "0.02922256565895487" 589 | ] 590 | }, 591 | "execution_count": 15, 592 | "metadata": {}, 593 | "output_type": "execute_result" 594 | } 595 | ], 596 | "source": [ 597 | "information_gain(df, 'Temperature')" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 16, 603 | "id": "d7f6c7f9", 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "data": { 608 | "text/plain": [ 609 | "0.15183550136234159" 610 | ] 611 | }, 612 | "execution_count": 16, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "information_gain(df, 'Humidity')" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": null, 624 | "id": "ee7bb990", 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "id": "048f2724", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "id": "01123d80", 640 | "metadata": {}, 641 | "source": [ 642 | "# Constructing a Decision Tree" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "id": "8d00e2b3", 648 | "metadata": {}, 649 | "source": [ 650 | "" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | 
"id": "f95c18f5", 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "id": "12864330", 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": null, 672 | "id": "62fe2268", 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "id": "ef39099d", 680 | "metadata": {}, 681 | "source": [ 682 | "\n", 683 | "


\n", 684 | "\n", 685 | "# Stopping Condition\n", 686 | "1. Pure Node\n", 687 | "2. Can't grow the tree anymore because of lack of points.\n", 688 | "3. If already reach a max depth." 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "id": "af16d86c", 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "id": "338e1435", 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "id": "0cf5738a", 710 | "metadata": {}, 711 | "source": [ 712 | "# CODE : Building a Decision Tree" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "id": "2e5c50d1", 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 73, 726 | "id": "27d9a136", 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | "class DecisionTree:\n", 731 | " \n", 732 | " # constructor\n", 733 | " def __init__(self, depth=0, max_depth=5):\n", 734 | " # Creating a Node\n", 735 | " self.children = {}\n", 736 | " self.fkey = None\n", 737 | " self.max_depth = max_depth\n", 738 | " self.depth = depth\n", 739 | " self.target = None\n", 740 | " \n", 741 | " def train(self, data):\n", 742 | " features = ['Outlook', 'Temperature', 'Humidity', 'Windy']\n", 743 | " \n", 744 | " info_gains = []\n", 745 | " \n", 746 | " for f in features:\n", 747 | " i_gain = information_gain(data, f)\n", 748 | " info_gains.append(i_gain)\n", 749 | " \n", 750 | " # finding the best feature\n", 751 | " self.fkey = features[np.argmax(info_gains)]\n", 752 | " \n", 753 | " #Spliting the Data\n", 754 | " DATA = divide_data(data, self.fkey)\n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " # Giving a target label to the Node\n", 759 | " labels = list(data['Play'].value_counts().index)\n", 760 | " freq = 
list(data['Play'].value_counts().values)\n", 761 | " \n", 762 | " self.target = labels[np.argmax(freq)]\n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " ###### STOPPING CONDITIONS ######\n", 768 | " \n", 769 | " have_data = 0\n", 770 | " keys = DATA.keys()\n", 771 | " \n", 772 | " for key in keys:\n", 773 | " if DATA[key]['len'] > 0:\n", 774 | " have_data +=1\n", 775 | " \n", 776 | " # 1. If it is pure node \n", 777 | " if have_data<2:\n", 778 | " return \n", 779 | " \n", 780 | " \n", 781 | " # 2. Early Stop if you have reached max depth\n", 782 | " if(self.depth >= self.max_depth):\n", 783 | " return\n", 784 | " \n", 785 | " \n", 786 | " print(\"\\t\"*self.depth + \"Making tree with - \", self.fkey)\n", 787 | " \n", 788 | " \n", 789 | " # Recursively train child Node\n", 790 | " for key in keys:\n", 791 | " new_data = DATA[key]['data']\n", 792 | " self.children[key] = DecisionTree(depth = self.depth + 1)\n", 793 | " self.children[key].train(new_data)\n", 794 | " \n", 795 | " return \n", 796 | " \n", 797 | " \n", 798 | " def predict(self, test):\n", 799 | " if self.children == {}:\n", 800 | " return self.target\n", 801 | " return self.children[test[self.fkey][0]].predict(test)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "id": "ddf41668", 807 | "metadata": {}, 808 | "source": [ 809 | "# Explore the Model" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 74, 815 | "id": "b97cbf66", 816 | "metadata": {}, 817 | "outputs": [], 818 | "source": [ 819 | "model = DecisionTree()" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": 75, 825 | "id": "2e864622", 826 | "metadata": {}, 827 | "outputs": [ 828 | { 829 | "name": "stdout", 830 | "output_type": "stream", 831 | "text": [ 832 | "Making tree with - Outlook\n", 833 | "\tMaking tree with - Humidity\n", 834 | "\tMaking tree with - Windy\n" 835 | ] 836 | } 837 | ], 838 | "source": [ 839 | "model.train(df)" 840 | ] 841 | }, 842 | { 843 | 
"cell_type": "code", 844 | "execution_count": 76, 845 | "id": "816a2a32", 846 | "metadata": {}, 847 | "outputs": [ 848 | { 849 | "data": { 850 | "text/plain": [ 851 | "<__main__.DecisionTree at 0x7ff5d0abf100>" 852 | ] 853 | }, 854 | "execution_count": 76, 855 | "metadata": {}, 856 | "output_type": "execute_result" 857 | } 858 | ], 859 | "source": [ 860 | "model" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 77, 866 | "id": "aac46af2", 867 | "metadata": {}, 868 | "outputs": [ 869 | { 870 | "data": { 871 | "text/plain": [ 872 | "'yes'" 873 | ] 874 | }, 875 | "execution_count": 77, 876 | "metadata": {}, 877 | "output_type": "execute_result" 878 | } 879 | ], 880 | "source": [ 881 | "model.target" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 78, 887 | "id": "b24dfda6", 888 | "metadata": {}, 889 | "outputs": [ 890 | { 891 | "data": { 892 | "text/plain": [ 893 | "'Outlook'" 894 | ] 895 | }, 896 | "execution_count": 78, 897 | "metadata": {}, 898 | "output_type": "execute_result" 899 | } 900 | ], 901 | "source": [ 902 | "model.fkey" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": 79, 908 | "id": "0d77bf3e", 909 | "metadata": {}, 910 | "outputs": [ 911 | { 912 | "data": { 913 | "text/plain": [ 914 | "{'sunny': <__main__.DecisionTree at 0x7ff5d0f565e0>,\n", 915 | " 'rainy': <__main__.DecisionTree at 0x7ff5d0eebf70>,\n", 916 | " 'overcast': <__main__.DecisionTree at 0x7ff5d0abfb50>}" 917 | ] 918 | }, 919 | "execution_count": 79, 920 | "metadata": {}, 921 | "output_type": "execute_result" 922 | } 923 | ], 924 | "source": [ 925 | "model.children" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": 80, 931 | "id": "f322409e", 932 | "metadata": {}, 933 | "outputs": [ 934 | { 935 | "data": { 936 | "text/plain": [ 937 | "<__main__.DecisionTree at 0x7ff5d0f565e0>" 938 | ] 939 | }, 940 | "execution_count": 80, 941 | "metadata": {}, 942 | "output_type": "execute_result" 
943 | } 944 | ], 945 | "source": [ 946 | "model.children['sunny']" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 81, 952 | "id": "967c084b", 953 | "metadata": {}, 954 | "outputs": [ 955 | { 956 | "data": { 957 | "text/plain": [ 958 | "'Humidity'" 959 | ] 960 | }, 961 | "execution_count": 81, 962 | "metadata": {}, 963 | "output_type": "execute_result" 964 | } 965 | ], 966 | "source": [ 967 | "model.children['sunny'].fkey" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": 82, 973 | "id": "1b1b6a37", 974 | "metadata": {}, 975 | "outputs": [ 976 | { 977 | "data": { 978 | "text/plain": [ 979 | "{'high': <__main__.DecisionTree at 0x7ff5d0ad6280>,\n", 980 | " 'normal': <__main__.DecisionTree at 0x7ff5d0ed82e0>}" 981 | ] 982 | }, 983 | "execution_count": 82, 984 | "metadata": {}, 985 | "output_type": "execute_result" 986 | } 987 | ], 988 | "source": [ 989 | "model.children['sunny'].children" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": 83, 995 | "id": "6cae8ba8", 996 | "metadata": {}, 997 | "outputs": [ 998 | { 999 | "data": { 1000 | "text/plain": [ 1001 | "{}" 1002 | ] 1003 | }, 1004 | "execution_count": 83, 1005 | "metadata": {}, 1006 | "output_type": "execute_result" 1007 | } 1008 | ], 1009 | "source": [ 1010 | "model.children['sunny'].children['high'].children" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": 84, 1016 | "id": "4a7937bd", 1017 | "metadata": {}, 1018 | "outputs": [ 1019 | { 1020 | "data": { 1021 | "text/plain": [ 1022 | "'yes'" 1023 | ] 1024 | }, 1025 | "execution_count": 84, 1026 | "metadata": {}, 1027 | "output_type": "execute_result" 1028 | } 1029 | ], 1030 | "source": [ 1031 | "model.children['overcast'].target" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": 85, 1037 | "id": "f89676f6", 1038 | "metadata": {}, 1039 | "outputs": [ 1040 | { 1041 | "data": { 1042 | "text/plain": [ 1043 | "{}" 1044 | 
] 1045 | }, 1046 | "execution_count": 85, 1047 | "metadata": {}, 1048 | "output_type": "execute_result" 1049 | } 1050 | ], 1051 | "source": [ 1052 | "model.children['overcast'].children" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "markdown", 1057 | "id": "39f6c4b7", 1058 | "metadata": {}, 1059 | "source": [ 1060 | "# Prediction Time" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "markdown", 1065 | "id": "6d61eae0", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "id": "2a3ffda9", 1075 | "metadata": {}, 1076 | "outputs": [], 1077 | "source": [] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 94, 1082 | "id": "39933506", 1083 | "metadata": {}, 1084 | "outputs": [], 1085 | "source": [ 1086 | "x_test = pd.DataFrame([['sunny', 'hot', 'normal', False]], columns=list(df.columns.values[:-1]))" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": 95, 1092 | "id": "afbc0099", 1093 | "metadata": {}, 1094 | "outputs": [ 1095 | { 1096 | "data": { 1097 | "text/html": [ 1098 | "
\n", 1099 | "\n", 1112 | "\n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | "
OutlookTemperatureHumidityWindy
0sunnyhotnormalFalse
\n", 1132 | "
" 1133 | ], 1134 | "text/plain": [ 1135 | " Outlook Temperature Humidity Windy\n", 1136 | "0 sunny hot normal False" 1137 | ] 1138 | }, 1139 | "execution_count": 95, 1140 | "metadata": {}, 1141 | "output_type": "execute_result" 1142 | } 1143 | ], 1144 | "source": [ 1145 | "x_test" 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "code", 1150 | "execution_count": 96, 1151 | "id": "eb24cda9", 1152 | "metadata": { 1153 | "scrolled": true 1154 | }, 1155 | "outputs": [ 1156 | { 1157 | "data": { 1158 | "text/plain": [ 1159 | "'yes'" 1160 | ] 1161 | }, 1162 | "execution_count": 96, 1163 | "metadata": {}, 1164 | "output_type": "execute_result" 1165 | } 1166 | ], 1167 | "source": [ 1168 | "model.predict(x_test)" 1169 | ] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "execution_count": null, 1174 | "id": "ea650ab0", 1175 | "metadata": {}, 1176 | "outputs": [], 1177 | "source": [] 1178 | } 1179 | ], 1180 | "metadata": { 1181 | "kernelspec": { 1182 | "display_name": "Python 3", 1183 | "language": "python", 1184 | "name": "python3" 1185 | }, 1186 | "language_info": { 1187 | "codemirror_mode": { 1188 | "name": "ipython", 1189 | "version": 3 1190 | }, 1191 | "file_extension": ".py", 1192 | "mimetype": "text/x-python", 1193 | "name": "python", 1194 | "nbconvert_exporter": "python", 1195 | "pygments_lexer": "ipython3", 1196 | "version": "3.8.8" 1197 | } 1198 | }, 1199 | "nbformat": 4, 1200 | "nbformat_minor": 5 1201 | } 1202 | -------------------------------------------------------------------------------- /09 Ensemble Learning/Ensemble - Bagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "89848a92", 6 | "metadata": {}, 7 | "source": [ 8 | "# Ensemble Models 🎳\n", 9 | "\n", 10 | "
\n", 11 | "\n", 12 | " - Group/collection of things.\n", 13 | " - Multiple models combined together to create a powerful model.\n", 14 | " - Individual Models are known as Base Models.\n", 15 | " - The more diverse the base models are, the better the results we can achieve.\n", 16 | " \n", 17 | "
\n", 18 | "\n", 19 | "### Types of Ensemble Models\n", 20 | "1. Bagging\n", 21 | "1. Boosting" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "f03463a7", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "ff0a12e4", 35 | "metadata": {}, 36 | "source": [ 37 | "# Bootstrap Aggregation [Bagging]\n", 38 | "\n", 39 | "1. Bootstrap Sampling, i.e., sampling with replacement\n", 40 | "1. Aggregation" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "a545a4c9", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "46f9e498", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "df684730", 62 | "metadata": {}, 63 | "source": [ 64 | "# Why does Bagging help?\n", 65 | " - Bagging reduces the variance of a model, while keeping the bias low" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "d338d146", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "62c734af", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "f410822d", 87 | "metadata": {}, 88 | "source": [ 89 | "# 🌳 Random Forest \n", 90 | "\n", 91 | "\n", 92 | "\n", 93 | " - Random : Bootstrap Sampling\n", 94 | " - Forest : Group of trees\n", 95 | " - RF = Decision Tree + Row Sampling + Feature Sampling" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "14192837", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "a0f4ae8e", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [] 113 | }, 114 | { 115
| "cell_type": "markdown", 116 | "id": "34e53e05", 117 | "metadata": {}, 118 | "source": [ 119 | "# Bias Variance Tradeoff\n", 120 | "" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "20d62397", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "2ad1de75", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "id": "f8f816dd", 142 | "metadata": {}, 143 | "source": [ 144 | "# CODE - Sklearn" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 100, 150 | "id": "0312eb51", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "from sklearn.datasets import load_breast_cancer" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 101, 160 | "id": "282e817c", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "cancer = load_breast_cancer()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 104, 170 | "id": "71f9914d", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "X = cancer.data" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 105, 180 | "id": "1428ee5d", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "y = cancer.target" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 109, 190 | "id": "54cfcecd", 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "(569, 30)" 197 | ] 198 | }, 199 | "execution_count": 109, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "X.shape" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 110, 211 | "id": "3af4a40d", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | 
"text": [ 218 | ".. _breast_cancer_dataset:\n", 219 | "\n", 220 | "Breast cancer wisconsin (diagnostic) dataset\n", 221 | "--------------------------------------------\n", 222 | "\n", 223 | "**Data Set Characteristics:**\n", 224 | "\n", 225 | " :Number of Instances: 569\n", 226 | "\n", 227 | " :Number of Attributes: 30 numeric, predictive attributes and the class\n", 228 | "\n", 229 | " :Attribute Information:\n", 230 | " - radius (mean of distances from center to points on the perimeter)\n", 231 | " - texture (standard deviation of gray-scale values)\n", 232 | " - perimeter\n", 233 | " - area\n", 234 | " - smoothness (local variation in radius lengths)\n", 235 | " - compactness (perimeter^2 / area - 1.0)\n", 236 | " - concavity (severity of concave portions of the contour)\n", 237 | " - concave points (number of concave portions of the contour)\n", 238 | " - symmetry\n", 239 | " - fractal dimension (\"coastline approximation\" - 1)\n", 240 | "\n", 241 | " The mean, standard error, and \"worst\" or largest (mean of the three\n", 242 | " worst/largest values) of these features were computed for each image,\n", 243 | " resulting in 30 features. 
For instance, field 0 is Mean Radius, field\n", 244 | " 10 is Radius SE, field 20 is Worst Radius.\n", 245 | "\n", 246 | " - class:\n", 247 | " - WDBC-Malignant\n", 248 | " - WDBC-Benign\n", 249 | "\n", 250 | " :Summary Statistics:\n", 251 | "\n", 252 | " ===================================== ====== ======\n", 253 | " Min Max\n", 254 | " ===================================== ====== ======\n", 255 | " radius (mean): 6.981 28.11\n", 256 | " texture (mean): 9.71 39.28\n", 257 | " perimeter (mean): 43.79 188.5\n", 258 | " area (mean): 143.5 2501.0\n", 259 | " smoothness (mean): 0.053 0.163\n", 260 | " compactness (mean): 0.019 0.345\n", 261 | " concavity (mean): 0.0 0.427\n", 262 | " concave points (mean): 0.0 0.201\n", 263 | " symmetry (mean): 0.106 0.304\n", 264 | " fractal dimension (mean): 0.05 0.097\n", 265 | " radius (standard error): 0.112 2.873\n", 266 | " texture (standard error): 0.36 4.885\n", 267 | " perimeter (standard error): 0.757 21.98\n", 268 | " area (standard error): 6.802 542.2\n", 269 | " smoothness (standard error): 0.002 0.031\n", 270 | " compactness (standard error): 0.002 0.135\n", 271 | " concavity (standard error): 0.0 0.396\n", 272 | " concave points (standard error): 0.0 0.053\n", 273 | " symmetry (standard error): 0.008 0.079\n", 274 | " fractal dimension (standard error): 0.001 0.03\n", 275 | " radius (worst): 7.93 36.04\n", 276 | " texture (worst): 12.02 49.54\n", 277 | " perimeter (worst): 50.41 251.2\n", 278 | " area (worst): 185.2 4254.0\n", 279 | " smoothness (worst): 0.071 0.223\n", 280 | " compactness (worst): 0.027 1.058\n", 281 | " concavity (worst): 0.0 1.252\n", 282 | " concave points (worst): 0.0 0.291\n", 283 | " symmetry (worst): 0.156 0.664\n", 284 | " fractal dimension (worst): 0.055 0.208\n", 285 | " ===================================== ====== ======\n", 286 | "\n", 287 | " :Missing Attribute Values: None\n", 288 | "\n", 289 | " :Class Distribution: 212 - Malignant, 357 - Benign\n", 290 | "\n", 291 | " :Creator: Dr. 
William H. Wolberg, W. Nick Street, Olvi L. Mangasarian\n", 292 | "\n", 293 | " :Donor: Nick Street\n", 294 | "\n", 295 | " :Date: November, 1995\n", 296 | "\n", 297 | "This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\n", 298 | "https://goo.gl/U2Uwz2\n", 299 | "\n", 300 | "Features are computed from a digitized image of a fine needle\n", 301 | "aspirate (FNA) of a breast mass. They describe\n", 302 | "characteristics of the cell nuclei present in the image.\n", 303 | "\n", 304 | "Separating plane described above was obtained using\n", 305 | "Multisurface Method-Tree (MSM-T) [K. P. Bennett, \"Decision Tree\n", 306 | "Construction Via Linear Programming.\" Proceedings of the 4th\n", 307 | "Midwest Artificial Intelligence and Cognitive Science Society,\n", 308 | "pp. 97-101, 1992], a classification method which uses linear\n", 309 | "programming to construct a decision tree. Relevant features\n", 310 | "were selected using an exhaustive search in the space of 1-4\n", 311 | "features and 1-3 separating planes.\n", 312 | "\n", 313 | "The actual linear program used to obtain the separating plane\n", 314 | "in the 3-dimensional space is that described in:\n", 315 | "[K. P. Bennett and O. L. Mangasarian: \"Robust Linear\n", 316 | "Programming Discrimination of Two Linearly Inseparable Sets\",\n", 317 | "Optimization Methods and Software 1, 1992, 23-34].\n", 318 | "\n", 319 | "This database is also available through the UW CS ftp server:\n", 320 | "\n", 321 | "ftp ftp.cs.wisc.edu\n", 322 | "cd math-prog/cpo-dataset/machine-learn/WDBC/\n", 323 | "\n", 324 | ".. topic:: References\n", 325 | "\n", 326 | " - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n", 327 | " for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on \n", 328 | " Electronic Imaging: Science and Technology, volume 1905, pages 861-870,\n", 329 | " San Jose, CA, 1993.\n", 330 | " - O.L. Mangasarian, W.N. Street and W.H. Wolberg. 
Breast cancer diagnosis and \n", 331 | " prognosis via linear programming. Operations Research, 43(4), pages 570-577, \n", 332 | " July-August 1995.\n", 333 | " - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques\n", 334 | " to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n", 335 | " 163-171.\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "print(cancer.DESCR)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 111, 346 | "id": "ca75e19f", 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "from sklearn.model_selection import train_test_split" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 113, 356 | "id": "f51903eb", 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 114, 366 | "id": "d3256177", 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "from sklearn.ensemble import RandomForestClassifier" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 147, 376 | "id": "cd47985a", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "model = RandomForestClassifier(n_estimators=200)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 148, 386 | "id": "953a3633", 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "RandomForestClassifier(n_estimators=200)" 393 | ] 394 | }, 395 | "execution_count": 148, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "model.fit(X_train, y_train)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 149, 407 | "id": "762c9ea1", 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "1.0" 
414 | ] 415 | }, 416 | "execution_count": 149, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "model.score(X_train, y_train)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 150, 428 | "id": "d0b5d3bb", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "0.9627659574468085" 435 | ] 436 | }, 437 | "execution_count": 150, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "model.score(X_test, y_test)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "id": "eaddfc3a", 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "id": "8f956910", 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "Python 3", 466 | "language": "python", 467 | "name": "python3" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.8.8" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 5 484 | } 485 | -------------------------------------------------------------------------------- /09 Ensemble Learning/Ensemble Learning - Boosting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "806fca1b", 6 | "metadata": {}, 7 | "source": [ 8 | "# Ensemble : Boosting Introduction\n", 9 | "\n", 10 | "
\n", 11 | "\n", 12 | "**Idea** : Combine multiple weak learners to form a strong learner to increase the model performance.\n", 13 | "\n", 14 | "\n", 15 | " \n", 16 | "\n", 17 | "**Bagging** : Models _(high var, low bias)_ + randomization + aggregation\n", 18 | "\n", 19 | "**Boosting** : Models _(low var, high bias)_ + additively combine\n", 20 | "\n", 21 | "
\n", 22 | "\n", 23 | "**Note : Bagging is Parallel. Boosting is Sequential.**\n", 24 | "\n", 25 | "
\n", 26 | "\n", 27 | "Most Popular Boosting algorithms are: \n", 28 | "- Gradient Boosting\n", 29 | "- Adaptive Boosting" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "5237e826", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "47aa8d07", 43 | "metadata": {}, 44 | "source": [ 45 | "# Boosting Intuition\n", 46 | "\n", 47 | "**Idea :** Boosting reduces high bias, while keeping the variance the same." 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "cf06836d", 53 | "metadata": {}, 54 | "source": [ 55 | "| Hours Studied | Bunked Lectures | Assignment Submitted | Marks |\n", 56 | "|:---------------:|:-----------------:|:----------------------:|:-------:|\n", 57 | "| 7 | 2 | 9 | 93 |\n", 58 | "| 2 | 5 | 4 | 65 |\n", 59 | "| 5 | 3 | 7 | 77 |\n", 60 | "| 6 | 1 | 8 | 85 |" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "66b846cb", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "43548fae", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "951702d3", 82 | "metadata": {}, 83 | "source": [ 84 | "# Boosting Example Walkthrough" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "b98f0b4d", 90 | "metadata": {}, 91 | "source": [ 92 | "| Hours Studied | Bunked Lectures | Assignment Submitted | Marks |\n", 93 | "|:---------------:|:-----------------:|:----------------------:|:-------:|\n", 94 | "| 7 | 2 | 9 | 93 |\n", 95 | "| 2 | 5 | 4 | 65 |\n", 96 | "| 5 | 3 | 7 | 77 |\n", 97 | "| 6 | 1 | 8 | 85 |" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "17bd14be", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": 
"2e7064e8", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "bfa550da", 119 | "metadata": {}, 120 | "source": [ 121 | "# Concept of Pseudo-residuals" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "4a875446", 127 | "metadata": {}, 128 | "source": [ 129 | "" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "23cce5ca", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "a9a0e998", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "f6ab1435", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "55a0df68", 159 | "metadata": {}, 160 | "source": [ 161 | "# Gradient Boosting Algorithm" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "id": "e48e0ef9", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "id": "e23439ec", 175 | "metadata": {}, 176 | "source": [ 177 | "# Bias Variance Tradeoff\n", 178 | "- regularization" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "9b69eb01", 184 | "metadata": {}, 185 | "source": [ 186 | "" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "e194f86f", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "064d72b4", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "f3a45ec2", 208 | "metadata": {}, 209 | "source": [ 210 | "# Gradient Boosting Code" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | 
"execution_count": 121, 216 | "id": "338bc7e8", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "from sklearn.datasets import make_regression" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 124, 226 | "id": "3f7dfa31", 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "X, y = make_regression(n_samples=1000, n_features=10, n_informative=6, noise=2.0)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 125, 236 | "id": "cf825f6b", 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "(1000, 10)" 243 | ] 244 | }, 245 | "execution_count": 125, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "X.shape" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 126, 257 | "id": "b332eb48", 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "(1000,)" 264 | ] 265 | }, 266 | "execution_count": 126, 267 | "metadata": {}, 268 | "output_type": "execute_result" 269 | } 270 | ], 271 | "source": [ 272 | "y.shape" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 127, 278 | "id": "41b3bca2", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from sklearn.model_selection import train_test_split" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 128, 288 | "id": "ccd80e1e", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 129, 298 | "id": "44c37c8d", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "from sklearn.ensemble import GradientBoostingRegressor" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 136, 308 | "id": "dfa0854e", 309 | 
"metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "M = [10, 50, 100, 200, 500, 1000]" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 137, 318 | "id": "5692726d", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "train_scores = []\n", 323 | "test_scores = []" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 138, 329 | "id": "6977fb0d", 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "for m in M:\n", 334 | " model = GradientBoostingRegressor(n_estimators=m)\n", 335 | " model.fit(X_train, y_train)\n", 336 | " tr_sc = model.score(X_train, y_train)\n", 337 | " te_sc = model.score(X_test, y_test)\n", 338 | " \n", 339 | " train_scores.append(tr_sc)\n", 340 | " test_scores.append(te_sc)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 139, 346 | "id": "dbe6724f", 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "[0.5693298400680644,\n", 353 | " 0.9517023668196679,\n", 354 | " 0.9896131212623428,\n", 355 | " 0.9957854115756537,\n", 356 | " 0.9991338471481596,\n", 357 | " 0.999898854280583]" 358 | ] 359 | }, 360 | "execution_count": 139, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "train_scores" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 140, 372 | "id": "c50a77c6", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "[0.5165900903969471,\n", 379 | " 0.8747191986794813,\n", 380 | " 0.9369690789752081,\n", 381 | " 0.953364577019059,\n", 382 | " 0.9584623589008775,\n", 383 | " 0.960608551723134]" 384 | ] 385 | }, 386 | "execution_count": 140, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "test_scores" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "id": "7547f6f8", 399 | 
"metadata": {}, 400 | "outputs": [], 401 | "source": [] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "id": "6e5d8b22", 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "id": "921753f9", 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "id": "60c16771", 422 | "metadata": {}, 423 | "source": [ 424 | "# XGBoost" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "id": "20754f16", 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "a0696e8b", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "9205e929", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "id": "26293441", 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "id": "0014f3b1", 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "id": "728a52b1", 470 | "metadata": {}, 471 | "source": [ 472 | "# Adaptive Boosting" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "id": "01e037c2", 478 | "metadata": {}, 479 | "source": [ 480 | "upenn" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "id": "ac6f12eb", 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "id": "9d950a73", 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "id": "84a319fb", 
502 | "metadata": {}, 503 | "source": [ 504 | "# AdaBoost Code" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "id": "b0accbca", 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [] 514 | } 515 | ], 516 | "metadata": { 517 | "kernelspec": { 518 | "display_name": "Python 3", 519 | "language": "python", 520 | "name": "python3" 521 | }, 522 | "language_info": { 523 | "codemirror_mode": { 524 | "name": "ipython", 525 | "version": 3 526 | }, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "nbconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": "3.8.8" 533 | } 534 | }, 535 | "nbformat": 4, 536 | "nbformat_minor": 5 537 | } 538 | --------------------------------------------------------------------------------