└── MACHINE_LEARNING ├── DECISION_TREES_AND_RANDOM_FOREST_ALGORITHM ├── .ipynb_checkpoints │ ├── 01-Decision Trees and Random Forests in Python-checkpoint.ipynb │ ├── Decision_Trees_and_Random_Forests_Project-checkpoint.ipynb │ ├── Decision_Trees_and_Random_Forests_Project_Solutions-checkpoint.ipynb │ └── Decision_Trees_and_Random_Forests_With_Python-checkpoint.ipynb ├── Decision_Trees_and_Random_Forests_Project.ipynb ├── Decision_Trees_and_Random_Forests_Project_Solutions.ipynb ├── Decision_Trees_and_Random_Forests_With_Python.ipynb ├── data_loan.csv └── kyphosis.csv ├── HIERARCHICAL_CLUSTERING_ALGORITHM ├── .ipynb_checkpoints │ └── hieararchical_clustering_algorıthm_with_python-checkpoint.ipynb └── hieararchical_clustering_algorıthm_with_python.ipynb ├── K_MEANS_CLUSTERING_ALGORITHM ├── .ipynb_checkpoints │ ├── KMEANS_CLUSTERING _PROJECT-checkpoint.ipynb │ ├── KMEANS_CLUSTERING _PROJECT_SOLUTIONS-checkpoint.ipynb │ ├── KMEANS_CLUSTERING _PROJECT__-checkpoint.ipynb │ └── K_MEANS_CLUSTERING_WITH_PYTHON-checkpoint.ipynb ├── KMEANS_CLUSTERING _PROJECT.ipynb ├── KMEANS_CLUSTERING _PROJECT_SOLUTIONS.ipynb ├── K_MEANS_CLUSTERING_WITH_PYTHON.ipynb └── data_college.csv ├── K_NEAREST_NEIGHBORS_ALGORITHM ├── .ipynb_checkpoints │ ├── KNN DÜZELTME-checkpoint.ipynb │ ├── K_Nearest_Neighbors_Project--checkpoint.ipynb │ ├── K_Nearest_Neighbors_Project-checkpoint.ipynb │ ├── K_Nearest_Neighbors_Project_Solutions-checkpoint.ipynb │ └── K_Nearest_Neighbors_with_Python-checkpoint.ipynb ├── Classified_Data.csv ├── KNN_Data.csv ├── K_Nearest_Neighbors_Project.ipynb ├── K_Nearest_Neighbors_Project_Solutions.ipynb └── K_Nearest_Neighbors_with_Python.ipynb ├── LINEAR_REGRESSION └── Linear_Regression_Notebook.ipynb ├── LOGISTIC_REGRESSION └── Logistic_Regression_Notebook.ipynb ├── PRINCIPAL_COMPONENT_ANALYSIS(PCA) ├── .ipynb_checkpoints │ └── Principal_Component_Analysis-checkpoint.ipynb ├── PCA.png └── Principal_Component_Analysis.ipynb ├── RECOMMENDER_SYSTEMS_ALGORITHM ├── 
.ipynb_checkpoints │ ├── advanced_recommender_systems_with_python-checkpoint.ipynb │ └── recommender_systems_with_python-checkpoint.ipynb ├── Movie ├── advanced_recommender_systems_with_python.ipynb ├── recommender_systems_with_python.ipynb ├── u.data └── u.item └── SUPPORT_VECTOR_MACHINES_ALGORITHM ├── .ipynb_checkpoints ├── SUPPORT_VECTOR_MACHINES_PROJECT_SOLUTIONS-checkpoint.ipynb └── SUPPORT_VECTOR_MACHINES_WITH_PYTHON-checkpoint.ipynb ├── SUPPORT_VECTOR_MACHINES_PROJECT.ipynb ├── SUPPORT_VECTOR_MACHINES_PROJECT_SOLUTIONS.ipynb └── SUPPORT_VECTOR_MACHINES_WITH_PYTHON.ipynb /MACHINE_LEARNING/DECISION_TREES_AND_RANDOM_FOREST_ALGORITHM/kyphosis.csv: -------------------------------------------------------------------------------- 1 | "Kyphosis","Age","Number","Start" 2 | "absent",71,3,5 3 | "absent",158,3,14 4 | "present",128,4,5 5 | "absent",2,5,1 6 | "absent",1,4,15 7 | "absent",1,2,16 8 | "absent",61,2,17 9 | "absent",37,3,16 10 | "absent",113,2,16 11 | "present",59,6,12 12 | "present",82,5,14 13 | "absent",148,3,16 14 | "absent",18,5,2 15 | "absent",1,4,12 16 | "absent",168,3,18 17 | "absent",1,3,16 18 | "absent",78,6,15 19 | "absent",175,5,13 20 | "absent",80,5,16 21 | "absent",27,4,9 22 | "absent",22,2,16 23 | "present",105,6,5 24 | "present",96,3,12 25 | "absent",131,2,3 26 | "present",15,7,2 27 | "absent",9,5,13 28 | "absent",8,3,6 29 | "absent",100,3,14 30 | "absent",4,3,16 31 | "absent",151,2,16 32 | "absent",31,3,16 33 | "absent",125,2,11 34 | "absent",130,5,13 35 | "absent",112,3,16 36 | "absent",140,5,11 37 | "absent",93,3,16 38 | "absent",1,3,9 39 | "present",52,5,6 40 | "absent",20,6,9 41 | "present",91,5,12 42 | "present",73,5,1 43 | "absent",35,3,13 44 | "absent",143,9,3 45 | "absent",61,4,1 46 | "absent",97,3,16 47 | "present",139,3,10 48 | "absent",136,4,15 49 | "absent",131,5,13 50 | "present",121,3,3 51 | "absent",177,2,14 52 | "absent",68,5,10 53 | "absent",9,2,17 54 | "present",139,10,6 55 | "absent",2,2,17 56 | "absent",140,4,15 57 | 
"absent",72,5,15 58 | "absent",2,3,13 59 | "present",120,5,8 60 | "absent",51,7,9 61 | "absent",102,3,13 62 | "present",130,4,1 63 | "present",114,7,8 64 | "absent",81,4,1 65 | "absent",118,3,16 66 | "absent",118,4,16 67 | "absent",17,4,10 68 | "absent",195,2,17 69 | "absent",159,4,13 70 | "absent",18,4,11 71 | "absent",15,5,16 72 | "absent",158,5,14 73 | "absent",127,4,12 74 | "absent",87,4,16 75 | "absent",206,4,10 76 | "absent",11,3,15 77 | "absent",178,4,15 78 | "present",157,3,13 79 | "absent",26,7,13 80 | "absent",120,2,13 81 | "present",42,7,6 82 | "absent",36,4,13 83 | -------------------------------------------------------------------------------- /MACHINE_LEARNING/HIERARCHICAL_CLUSTERING_ALGORITHM/.ipynb_checkpoints/hieararchical_clustering_algorıthm_with_python-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# HIEARARCHICAL CLUSTERING ALGORITHM WITH PYTHON\n", 8 | "\n", 9 | "Hierarchical clustering is an unsupervised machine learning method that you can use to predict subgroups based on the difference between data points and their nearest neighbors.\n", 10 | "Each data point is linked to its nearest neighbor based on the distance matrix you choose.\n", 11 | "\n", 12 | "## Episode 1: Do a neccesary imports " 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from sklearn.datasets import make_blobs" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Episode 2 : Create a blob of 200 data points" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "dataset = make_blobs(n_samples = 200,\n", 40 | " n_features = 2,\n", 41 | " 
centers = 4,\n", 42 | " cluster_std = 1.6,\n", 43 | " random_state = 50)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "points = dataset[0]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Episode 3 : Create a dendrogram" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import scipy.cluster.hierarchy as sch\n", 69 | "from sklearn.cluster import AgglomerativeClustering" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "image/png": "\n", 80 | "text/plain": [ 81 | "
" 82 | ] 83 | }, 84 | "metadata": { 85 | "needs_background": "light" 86 | }, 87 | "output_type": "display_data" 88 | } 89 | ], 90 | "source": [ 91 | "dendrogram = sch.dendrogram(sch.linkage(points, method = \"ward\"))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "" 105 | ] 106 | }, 107 | "execution_count": 8, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | }, 111 | { 112 | "data": { 113 | "image/png": "\n", 114 | "text/plain": [ 115 | "
" 116 | ] 117 | }, 118 | "metadata": { 119 | "needs_background": "light" 120 | }, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "plt.scatter(dataset[0][:,0], dataset[0][:,1])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Episode 4 : Perform the actual clustering" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "hc = AgglomerativeClustering(n_clusters=4, affinity = \"euclidean\", linkage = \"ward\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "y_hc = hc.fit_predict(points)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 15, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "" 162 | ] 163 | }, 164 | "execution_count": 15, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | }, 168 | { 169 | "data": { 170 | "image/png": "\n", 171 | "text/plain": [ 172 | "
" 173 | ] 174 | }, 175 | "metadata": { 176 | "needs_background": "light" 177 | }, 178 | "output_type": "display_data" 179 | } 180 | ], 181 | "source": [ 182 | "plt.scatter(points[y_hc == 0,0], points[y_hc == 0,1], s = 100, c = \"cyan\")\n", 183 | "plt.scatter(points[y_hc == 1,0], points[y_hc == 1,1], s = 100, c = \"yellow\")\n", 184 | "plt.scatter(points[y_hc == 2,0], points[y_hc == 2,1], s = 100, c = \"red\")\n", 185 | "plt.scatter(points[y_hc == 3,0], points[y_hc == 3,1], s = 100, c = \"green\")" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.8.3" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 4 210 | } 211 | -------------------------------------------------------------------------------- /MACHINE_LEARNING/HIERARCHICAL_CLUSTERING_ALGORITHM/hieararchical_clustering_algorıthm_with_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# HIERARCHICAL CLUSTERING ALGORITHM WITH PYTHON\n", 8 | "\n", 9 | "Hierarchical clustering is an unsupervised machine learning method that you can use to predict subgroups based on the difference between data points and their nearest neighbors.\n", 10 | "Each data point is linked to its nearest neighbor based on the distance metric you choose.\n", 11 | "\n", 12 | "## Episode 1: Do the necessary imports" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import 
matplotlib.pyplot as plt\n", 23 | "from sklearn.datasets import make_blobs" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Episode 2 : Create a blob of 200 data points" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "dataset = make_blobs(n_samples = 200,\n", 40 | " n_features = 2,\n", 41 | " centers = 4,\n", 42 | " cluster_std = 1.6,\n", 43 | " random_state = 50)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "points = dataset[0]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Episode 3 : Create a dendrogram" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import scipy.cluster.hierarchy as sch\n", 69 | "from sklearn.cluster import AgglomerativeClustering" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXgAAAD6CAYAAACrklzBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de5zcdX3v8deHBGK4CUuWEAMkgCHhXjyrGFDQYisCEtpCxVtjSoutQr31WC89tedxtLU3qq1VodY0qEUI6oGWHCSNXAQB2QASLglggCSQy5IFQmIIuXzOH5/PbzOZzOzOzsxmNz/ez8cjj9mZ+V0+39vn+/39Zmdj7o6IiJTPHsMdgIiIDA0leBGRklKCFxEpKSV4EZGSUoIXESkpJXgRkZIaMMGb2bfNbI2ZPVTx2t+Z2WIze9DMfmRmB1S891kze8LMlpjZO4cqcBER6V8jK/h/B86qem0+cLy7nwg8BnwWwMyOBS4Cjst9vm5mo9oWrYiINGz0QBu4++1mNrnqtZsrnt4NXJA/zwC+7+6bgCfN7AngTcBd/Z1j3LhxPnny5P42ERGRKgsXLnzO3TvrvT9ggm/A7wPX5M8TiYRfWJGv9Wvy5Ml0d3e3IRQRkVcPM3u6v/db+pDVzD4PbAG+V7xUY7OafwvBzC4xs24z6+7p6WklDBERqaHpBG9mM4Fzgff79j9oswI4rGKzQ4Fna+3v7le6e5e7d3V21r3CEBGRJjWV4M3sLODPgPPc/VcVb90AXGRmY8zsCGAK8PPWwxQRkcEa8B68mV0NvA0YZ2YrgC8QvzUzBphvZgB3u/sfufvDZnYt8Ahx6+aj7r51qIIXEZH6bCT8ueCuri7Xh6wiIoNjZgvdvave+/omq4hISSnBi4iUVDt+D16GyH/cs4zrH3hmuMOQV7EZvzaR951y+HCHIU3SCn4Eu/6BZ3hk5brhDkNepR5ZuU4LjN2cVvAj3LET9ueaD08f7jDkVeg9V/T7F0ZkN6AVvIhISSnBi4iUlBK8iEhJKcGLiJSUEryISEkpwYuIlJQSvIhISSnBi4iUlBK8iEhJKcGLiJSUEryISEkpwYuIlJQSvIhISSnBi4iUlBK8iEhJKcGLiJSUEryISEkpwYuIlJQSvIhISSnBi4iUlBK8iEhJKcGLiJTUgAnezL5tZmvM7KGK1zrMbL6ZPZ6PB1a891kze8LMlpjZO4cqcBER6V8jK/h/B86qeu0zwAJ3nwIsyOeY2bHARcBxuc/XzWxU26IVEZGGDZjg3f12oLfq5RnAnPx5DnB+xevfd/dN7v4k8ATwpjbFKiIig9DsPfjx7r4SIB8PztcnAssrtluRr4mIyC7W7g9ZrcZrXnNDs0vMrNvMunt6etochoiINJvgV5vZBIB8XJOvrwAOq9juUODZWgdw9yvdvcvduzo7O5sMQ0RE6mk2wd8AzMyfZwLXV7x+kZmNMbMjgCnAz1sLUUREmjF6oA3M7GrgbcA4M1sBfAH4MnCtmV0MLAMuBHD3h83sWuARYAvwUXffOkSxi4hIPwZM8O7+3jpvnVln+y8BX2olKBERaZ2+ySoiUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUi0leDP7hJk9bGYPmdnVZvYaM+sws/lm9ng+HtiuYEVEpHFNJ3gzmwj8CdDl7scDo4CLgM8AC9x9CrAgn4uIyC7W6i2a0cBYMxsN7A08C8wA5uT7c4DzWzyHiIg0oekE7+7PAH8PLANWAi+6+83AeHdfmdusBA5uR6AiIjI4rdyiOZBYrR8BvA7Yx8w+MIj9LzGzbjPr7unpaTYMERG
po5VbNO8AnnT3HnffDPwQOBVYbWYTAPJxTa2d3f1Kd+9y967Ozs4WwhARkVpaSfDLgDeb2d5mZsCZwKPADcDM3GYmcH1rIYqISDNGN7uju99jZtcB9wFbgPuBK4F9gWvN7GJiEriwHYGKiMjgNJ3gAdz9C8AXql7eRKzmRURkGOmbrCIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIlpQQvIlJSSvAiIiWlBC8iUlJK8CIiJaUELyJSUkrwIiIl1VKCN7MDzOw6M1tsZo+a2XQz6zCz+Wb2eD4e2K5gRUSkca2u4L8K3OTu04CTgEeBzwAL3H0KsCCfi4jILtZ0gjez/YHTgX8DcPdX3P0FYAYwJzebA5zfapAiIjJ4razgjwR6gNlmdr+ZfcvM9gHGu/tKgHw8uA1xiojIILWS4EcDbwC+4e4nAxsYxO0YM7vEzLrNrLunp6eFMEREpJZWEvwKYIW735PPryMS/mozmwCQj2tq7ezuV7p7l7t3dXZ2thCGiIjU0nSCd/dVwHIzm5ovnQk8AtwAzMzXZgLXtxShiIg0ZXSL+18GfM/M9gKWArOISeNaM7sYWAZc2OI5RESkCS0leHd/AOiq8daZrRxXRERap2+yioiUlBK8iEhJKcGLiJSUEryISEkpwYuIlJQSvIhISSnBi4iUlBK8iEhJKcGLiJSUEryISEkpwYuIlJQSvIhISSnBi4iUlBK8iEhJKcGLiJSUEryISEkpwYuIlJQSvIhISSnBi4iUlBK8iEhJKcGLiJSUEryISEmNHu4ARKS+uY/NZd7SecNy7iW9ZwAw66Yrd/m5zz7ybC48+sJdft6yUYIXGcHmLZ3Hkt4lTO2YusvPffLJt+3ycwIs6V0CoATfBkrwIiPc1I6pzD5r9nCHscvMumnWcIdQGroHLyJSUkrwIiIl1XKCN7NRZna/mf1XPu8ws/lm9ng+Hth6mCIiMljtWMF/DHi04vlngAXuPgVYkM9FRGQXaynBm9mhwDnAtypengHMyZ/nAOe3cg4REWlOqyv4rwCfBrZVvDbe3VcC5OPBLZ5DRESa0HSCN7NzgTXuvrDJ/S8xs24z6+7p6Wk2DBERqaOVFfxpwHlm9hTwfeDXzey7wGozmwCQj2tq7ezuV7p7l7t3dXZ2thCGiIjU0nSCd/fPuvuh7j4ZuAj4ibt/ALgBmJmbzQSubzlKEREZtKH4PfgvA79hZo8Dv5HPRURkF2vLnypw91uBW/PntcCZ7TiuiIg0T99kFREpKSV4EZGSUoIXESkp/bngQvdsWHTdcEexo1Uz4nH2F4c3jkonXABd+nOuIrsDJfjCoutg1SI45IThjqTPNYePsN8wXbUoHpXgRXYLSvCVDjkBZt043FGMXLPPGe4IRGQQdA9eRKSklOBFREpKCV5EpKSU4EVESkoJXkSkpJTgRURKSgleRKSklOBFREpKCV5EpKSU4EVESkoJXkSkpJTgRURKSgleRKSklOBFREpKCV5EpKSU4EVESkoJXkSkpJTgRURKSgleRKSklOBFREpKCV5EpKSaTvBmdpiZ3WJmj5rZw2b2sXy9w8zmm9nj+Xhg+8IVEZFGtbKC3wJ8yt2PAd4MfNTMjgU+Ayxw9ynAgnwuIiK7WNMJ3t1Xuvt9+fNLwKPARGAGMCc3mwOc32qQIiIyeG25B29mk4GTgXuA8e6+EmISAA6us88lZtZtZt09PT3tCENERCq0nODNbF/gB8DH3X1do/u5+5Xu3uXuXZ2dna2GISIiVVpK8Ga2J5Hcv+fuP8yXV5vZhHx/ArCmtRBFRKQZrfwWjQH/Bjzq7pdXvHUDMDN/nglc33x4IiLSrNEt7Hsa8EFgkZk9kK99DvgycK2ZXQwsAy5sLUQREWl
G0wne3e8ArM7bZzZ7XBERaY9WVvAiIjXNfWwu85bOa2rfxb2LAZh106ymz3/2kWdz4dG6eaA/VSAibTdv6TyW9C5pat9pHdOY1jGt6XMv6V3S9ORSNlrBi8iQmNoxldlnzd7l521l5V82WsGLiJSUEryISEnpFk1Zdc+GRde195irHozH2ee075gnXABduqQWGQpawZfVoutg1aL2HvOQE+Nfu6xa1P5JSET6lGMF347VajtXpyNlVXrICTDrxqE9R0t171Hvzdb5SKlnkRGqHCv4dqxW27U6fbWtSlup+1bq/NVWzyJNKMcKHnbNarUR7bw/vbsYjrofwfXcypd8qrXjSz+V9AWgV5dyrOBFRpBWvuRTrdUv/VTSF4BefcqzghcZQYbrSz792Z2/ADSYq6LBXPWU/YpGK3gRGfEGc1XU6FXPq+GKRit4EdktDOaqqJEVv+Ms7l084Ep/d17lawUvIqXTyIq/kZX+7r7K1wpeREqpHZ+D7M6fW4ASfOMa/ULPYL4wpS/qiMgQUoJvVPGFnkNO6H+7Rr+4U3w5SAk+DPYbsc1887ikE2qjv2Ey1L9dUhlH5blGyj3sZr6f0Mr3EEZCuXePBD/Q4G9ksLdjcLfzCz3t+KJOf/Wyq+qkXRqdQAuD/QZsiSfU4n7z1I6p/W7X6O/TF/euB5ucKuMoztXssYZCo/VUqdnvIIyUcu8eCX6gwT/QYC/r4O6vXtpdJ/Umk/4mksFOIEP5jdg2fvN1oJVgI6u+dq/u2vl7963cd66OY6Tdw95V308YKeXePRI8tDb4R/DX2lvWbL0Mtk7qTSb1JpKyTqoMvBJs5DczYPhXd9K/4fp/Zds5+e8+CX53V70CrrfyHUm3TaoNZjIp86RKayvBkbK6q5fA+ktOI+G+8q7SzC2dwki5taMEv6tUr4BrrXxLvOqVkadeAquXnF6NVx67+k9OtHvyV4Kvpdb95lor7nbfY94Vq96ibNXlGclXDoPRzGcFMGTlr7VKrrdCHo7V8UAJrDL+ym9+DhRrsV91WRstY3W9jaQ6a8Wu/vxm5CX4RpMr9D8oK49TuX8jA7nW/ebqFfeuXm23q176ylZRnnaUpdlbUO2ecAb7WQEMaVvWWiVXrpB7NvawduNa1m9e3ze4G0mclYYy+TX7mzHFfpVlHcwVQHW91bqqaPR4g5ls6v2qZ6HVOt3Vn9+MvATfSHKFgQdl5XGK/QczkHf1art7dv8TUbvqBXYuWzvK0uwtqGK/fTphQ0/UwaZ18bjouuYS/WA/eB7iK6f+VsmzbppF78ZeusZ39X0tvpHEWW/CKNRLBM38rnqzvxnTMbZjh+fVf/tloGRZq96auaIYzGRTWb8HjT2ItRvX9tVTMQkX52822e/Kz29GXoKHxgZoI4OyOE7f6rLqv4gbTPJoZIXaymX+ousAi5jrJel69bJDbBVlbOYKp5VyDBhfxtY9e8fjF5PChufi55dWRrJ/+o7tib6VuOrGk9rdlgxu1Ti1YypnH3k2i3sXN5Sw+ksQxXnrJdNaCax7dXdDiau/MsH2CWPuY3MBWLtxLb0be/smo8qEWZksB5MoG7mimPvY3J1iKeqsVv1Un7/Ytph8a622d/XnEdV1P/exuQ2de8gSvJmdBXwVGAV8y92/3O8OtS7Vaw2yRrarXA33bd/iKrHeCrVIRq2uOiGOfcIFcZxaibCR2Bq9Wqne56WVO9ZL5b7V9Vn5eiO3wSpvCxX/1V6tuIoJYvY5kewnvaX/uKpjqI4DardFo21Zca7KleOi5xaxedtmpv/H9B1WhZWJYrC3KOYtnYdhTO2YWnO7RieMyvP2bOzZIZkWqhNY1/iuhmOsVyYAw/q2qz7X3Mfmcnn35azfvJ5999yXSftPonNsZ9/kUpyzVjnrJeCiXqonxqIuq2OpVT/V569WbzJtZCU90O204vYcULcf1Yq7kSu9wpAkeDMbBfwL8BvACuBeM7vB3R+pu1NlEnhpZazelt2
18+qtke2K1XBxXIB9xm0fvB1HbU/2lavEEy6IbauTWWWCq4wFdkxGsHNibXTiqqyHIvaffW17/M90w9bN8NeHRdmLY1TGVv3aQCv5yhV3ZTmqk3AR06YXYf5fbC9frUmiuj6LY9SavIq63qcT9ptQO7Yiro6joPeXO56/e3Y837QOxuwfx6k1WVW2bWVbVNdLrXPlvvOeu50lvUvoGNvB5m2b2ebb2LxtMxD30Z9e9zSLexezcPVCVm9Y3TeQi9XtvKXzeGXbK9y35r66K/TKxFutkcRdmRiLRNe7sZdJ+0/i6XVPc3n35Rw09iA6x3budM56K9tq9ZLrQWMP6vfXCYukW9yK6hzb2VfW7tXdfQmrKGf11UWhemVeTBqT9p/Ut+1AsVRPcEV8xfEPGnvQDlcBle1V1FP1JF/9fnGs9ZvXs4ftwd6j9wbYof7XblzLxi0bGTt6LLBjP4KdJ5z++kc9Q/Xngt8EPOHuS939FeD7wIx+91j1IOAxqIvB7lvj8ek7tg/sRrc75IQd71cXyR1iv/0mxPNJb4mkUOy76Lp4fdO67cm1SCKV20EkhFUPwivr4/msG+m7DVGY/xexT7FKfPoOuPFTkagrbwlU7lPEvmFNbA+R3GuVszLe6teKeG/81M63tIpEWxlHX8KvupVVvDfpLTvWS9EWRdmL9ijOW7ldkWgr67Z4Xp3cK+uiOEexTa12KeLa0BNxFG1d2bbF+Yq2KOprpzbwncuR9Tq1YyqdYzvZ5tsA2HOPPZl91uy+Abt+83puW34b3au7mdYxjfWb13N59+XMWzqP7tXd7LnHnmzzbXSv7uby7supZdZNs1jcu7hvIqh8bcOWDQB0ju1k/eb1dI3v2ukcleetjLmIr1gxAn3JGSJRFvtP65hWM8bKlTJEUly/ef1Ox61lce9iHGf2WbP7JpH+tqtXxuJ885bO63veNb6rbhkHqtupHVP7JoPieJ1jO/t+LuqiSNpFPRVtCexQV0U7QKzK97A92Obb+tpl7ca1zD5rdt8k6TjTOqbt1I8u776c6f8xvS/OyrYqnjfC3L2hDQfDzC4AznL3P8jnHwROcfdLK7a5BLgkn04F2vOfWIqIvHpMcvfOem8O1T14q/HaDjOJu18JXDlE5xcRedUbqls0K4DDKp4fCjw7ROcSEZEahirB3wtMMbMjzGwv4CLghiE6l4iI1DAkt2jcfYuZXQr8mPg1yW+7+8NDcS4REaltSD5kFRGR4TdUt2hERGSYKcGLiJSUEryISEmNzD82VsXMPgRMBn4FrAZ+4O7rzewAAHd/YYD9DwJ6gdcCW4kPfrcCb81NTgFedPfaXy/c8VgH5I+jgF6v8SGGme0PvOTuXvnzQMceahWxv5v4rsJrgY3u/q0a2+5LLADaHntRJ3n+uu2X2+0BbHX3l2q8/17iV3C/AUx39/kDnPcg6rRZP/sckHG+WB2nme3r7uvz5w8B24D/Adzp7tcW2wBj+jtvRbtsKY5X8foOZR+oDBV9fT8aaLvKMtR5f3/gd4lvpwNcD9xdL4aKtt2H7D8Zyx4QbW1m73T3H9crY7Mq2qqXHON1+s0biNx3PvCou3+nxjZ99TxQHdWIoa9Pm9kVRJ392L34KvpOfadmm5rZF4n62xPYBHyjIu81VGfD+iGrmZ0H7Av8FvFt1q3AEcAPgEnAT4CxwDHArcAHgeeAI4HXAD8jBtQm4BWiIy0nJgLP400B1gBHAauA/YGDgZXAWqAHeAD4APH7+3cD04lOsgXYAJwB3AwcBxwE3A8cncfoBCYCT2YZ1uR5ujL+Q4lG2ggcT0xQk4FlwAsZz0XAZcDewLlE57s5yzkGuAZ4X5Z5fJb7v4CZREd+iPjV1GOA84B5wNsyvi7gPuCXwIHAXlmO1Vmns4BbiAH8Qpb5qKzPX+VrB+XrC4FjgduA54k/P/HVjO1w4CrgNe7+RTN7G/G3iN6a9XJL7vvTrP83AXfka6uBG4FxxN8wujbr4KX
c7slsq59nfZ4N/CLjOiPrYjLwI+BC4Jmst8Py51/m8U8A3pVtdGQe45tE/zkrj3sdMC0fP5yx3k0kjr2y7DOBJ4hJ8k6i3/0W0WdeAv44/70949iSZRxF9O9fAq8jvhsyKsv0KHA68FS2217A48AFwFJiHLyS9T4xz781j/M0sUjpzmO9nujTxbiZD3wUeJFIFj/Nun5t/tsz6/cNRB/bkufrAeYCv0P0y/kZ56o8T2fu/508/nMZ3wlEfzdgHdHX35jnPTrL8QTwa7n9FKL/9hL9dhGwyN0/bWZfIBLxz4gx/TzR37YAdxEOIMZRUS9dxBidmOX6cbbvi0S/OiwfH8vyLib60nPZPhuzXU7JWH4t6/MoYDPRb9cSfb4zt10NnEn0xX2Bjnwf4K+IcTYq4/vNLOtyYH3GcBLwIDFWVgMvE/3wFaL99wAOyXZ5ksgD33T3b9CP4b5F806iU60hOsppROd+mkgsexOd9SiiAsYSDb2YSGp/TgwCz31uJpLZL4gGWZT73Ed01ruJRvwFkTSfz/OfRQzcOzKGZUSjduXPBxIDaQnbB9n++dqPiEa6imi8bRnfMmIQjM9yLCPqe3nGujfRqScC3yI6wDnAP+V5Ds3t7wB+mxgEp+W5fkYkn2nEgPhtYvKbTExIhxFJa1XuP4PoHIdnff2cSLib8hG2X9n8Q55jNfDtfHw0H58jVmaHAydnGS4mJoytWTd/amb/nW1zQdbPSuAT2RbTiQ5ruU0x6N4F/BHwz0QnP4RIzkuJATox2/mArJffzfg/ne/fTwz6UVknvVn2a4nFwgeIQfwaYiD9Itvrj4HP5TYb8rVTgEuJ5P1C1sMbiMmoWJW+nG0CMYmtBh4hFhwXEQP/0GyDh7Psj2dd3JbHvSfjfgxYkPX+9jzu5Nx2FJE0ns/4v0qMlWOIfjIx63BVnnN51u0nsj06iL4yAfh9os+OyrZ7V277PJHY7s041gBXZ7ucmY/fICartRnH64iFxJos81ZiUrqG7f3nx7nvs1nOT2e5/zu3P4DoP8XV9YZsw1nAzOxHp+R5HyQm77OJMbI0yz0py7OESMz/mG3zADHOjiL60biM67t5nnuB27O+1hG56D4i3xxLLG7uBWbnz+cBVxB55GZi0jkg6/9Xeawi0T+Z5b6CGGvxR4Tg1zOGLcQidg9iPHyTGGO9+Tg7y7uIGOtj8/WlxCL0Y3mesQxguFfwJxGDdAzwSSJpryOS5C1E49xGrARHEyviMcRqaS4xkH+H6KyjiUa6mRjELwLvIDrsd4mBexaxkjub6LB7EivnjWxfgZ+ZMb2c7/8M+D0iQXcTnXst0bi3Eg30irv/mZnNIDpuJ9GpFhON/ATRCc4lOt3pWb6XieR4D3BixnZVxtQD/AHw+ayX/010ot/MY+xHDND3Z30dTnTIC4jO8eWsuyXu/tdm1kEk0K3EpHN/Hvc5d19uZlcRHWxt1vFyoiO+TCT9fwJ+SAyAp7IuP04kwRXEqmUB8D+JhDaeSB6PZbkvzeMuzPL/kkjiHyaS07FEB/7DLP9/EklsJpFgzyU6/D7ECvE/gVXuPt/Misn4d7P9IJLiPtluy7PtTiUm0Wty2+XEIHkK+HvgEnf/hJm9lZjMVmd8BxCJZEy27Qa2f8fjdqJ/dGTsN+SxfkxcfZxHJMCbc99Tsp6PI5LSIiJJ3ErcOrsr625z1tUrWd6riT/a9xCRGNYR/fNoImm8nPvcSSTdNcB7iL69lUig/w78TZ73YWIcPE5czWwi+sQxWWcXEyvP5cBid3/YzCZkPb+XSDjjiUXGrxOT7xeyHK/L+ij6017EpLWCGD9jiP75HSLZvyPr4wgiyd6RPz9LLFa2uPtHzexKYjHz38CyXOGfRoyBDxFXGEuIWyJHEcl1T3dfZGZ/m/Xz8dzujcSC79vEhDKaGFdrs04+C/xfYgxMI8aVEXlnfZanmNTfA3yPGNMnEnnpNmJSXeHu/2pmJ7v7/QBmdg2RD54
mFg4vEpPX7cBbsiz35mtfz9j2IsbX5Xn+jcStwOfpx3Dfgz+d6AijiNXpvuRfoiQG9OuJxtyH6LDvJxrtjUTy+wqx0i5W/M8QnXMZcTvkK0TyO4pIlC8Tq6slwEeISjqO6MzXE415C5m08xg9RKc8H/gakUi3EZ1vNFHZ083sz4mV3JqM8e+IxnmB6PyHE420iUgoBxMJvVh1nU6sit9LdKTikvQ4YpW5J9HJilX0icQl7xbgT4hJbi3w5qy3l4mE/BEz+x0iKYwiEs+dedwDgHPNrCfrfxGRZP84Y9pMJJIvZX29iUhGSzLujUSH+27GehJxZbONSGjnEkmgWJl9jkg2BxKJtZcYPB8hksJ+RILYO9v03cTK68iM7V15ng3ZZg/lPd83E4lnQZb9a0TSOTTj+GS+P5EwIf/9gBhQjxNXUeMyiU0k+sGjxIS6hVhRv0Akh7dWxFYsIqYQg/ylbIfPEZPI14jJ4afAvxJ9YE+ifx1MTG7vzjo7HjjU3U8xsxvzfB/O+j4+Y4Zo/2ezfU4C/o0YB+OIyaC4/z2auFK5mlg9H5NlOS7b9ejcdlPu+wrRB8dlXO8iFj5nmtkLxOR0F9FvziP6ysQsx18Ryfh/5fMriDHXk+VflnV6JpEk/4Lot0fnMX6YbXt8xvwuYnHwPeCLZnY0kfTvJPrHOWZ2XMa7iJhY9wb+No8xlZgMJpjZutx3ITGxTCdywo0Z2wPE+N4n2+HNxBh8O7FYPIyYBP4s6/ftxEKsk1iYbCMWEO/I/e4jFptfBS42syOAqWZ2DzEZjCYWdWuz/E8Qi9de4opzDHEl8gKxYJqcbbiSuFL3PMcYog/XNdy3aPYD/g+RXJ4kKul6IrndDtxEdMTXu/vniAbcRqw0NhCXnMuJRv8m0Zkuzf9cZDExwN5CJLZ9iQ7WSwyGPYiBNTk/7PgTosJez/bL3N8jZvifuvtiYiBtyHM9kMd5E9FITnSak4mV0V5Ex5hOdIQVuf9CIknPZftKemuW/0fEYL6ZWN2sJxLyBmKl8BaiYywmktIcoiPvTVzlfIQYfF8jPhS8gVgJnJF1s5SYaH4z63o/YjJaQqwobiUm3PuJle5RGeNexCrjjnw8IOe026UAAAT3SURBVPfrcfd7iE52ScZ2Z5bjzRnjz7JNpxIJ5GGi064hBsrtxO2TG7KOTicSxHuyXK9k/J8E/pK48tqPSBTHZlmeJyabMcRgOYoY9A9kTCuIhPRJYGH+VdMvZhvfAPw/4nbDvllfk4gk/qE8x6VZpheyHr+ex3ws22kZsRD5CfCVrJNTs00PIxLjOcSVyeeJJPNI1s3phOJWzXIze4iY8M4gBvVdxGRzL9v74HeJifdTxGTyEnGl1UtMjnMyxlvZPuH/F5G4v1xRlrNz31uIBVJ3lulO4nbZc8BxOf62Ev3jO3kOJyauOcRKFGLlupZI0OuznZfkv3/M9ukgVqc3ZZkfzPq+n5i4/ibrcwxx1XkX0bd/kWVanjH8Q5bjO7nN4cRE/W1iMbSQ6KuXZlzj2D6+/zq37SAWcE8RY2B21vmt+e8MYlL/QNbftVnel7Pu7yL6z1TianMTMZnsQfTR12TdjSVy11XEOHRiIbWBuDK7K8v3OiKfLMy4vpsxn5YxbyHGei/R9/s13LdoznD328zsFGJ2vBR4t7tfYWbH5WXhDKKD/ZWZfdXdP2ZmZxCVdxsxAPd39y/VOP5JRKOdTawmi0Hwv4hG2UbcXrliEDF/kFiBPcP2hpiUbx+Qx7wbOMPd/8nMPgw8VfzWwADHPoboSMUHwOvYvqJ+xt3/xsw+7+5fMrM3uPt9ud+fuvvfZ129w90vq9juFKJzPAV0Zt1e5u7/nPX4cr4/lhjMPyES0S+zLORrZHu8n5hoHsu6+5eK2KdnnH1lrYrz48QkMYW4PWNE5y7avIj1QSJJ3wu8Md+bRVwdABzp7h/LY17G9lsZU4B17n5jxfmPIT6DmERMjNvc/Z9r1P2pRIJ
/LzGB3EJ80PotYlBtAW4v9q0ub43nJwGj3P2+/vpArkKfIxLeicQgv4O4LfFw7ruCSLwXkqv/ot4rjlPU3RJiIvsp0fdfJG417LBvlrdmneXxphDj5njg5exTVxBXIVOIvrmyom3/ku1XlO8kEvXbiQl9FBV92t1vzON/hJg0xhELnuerYzKz8TViL2Ibn+16FTFZbXX3X2QsryUmsquzDnZov1rtUPw5lazzzUR/GE32UeDeivK+M9u5sn//kbt/s+IYq4D17r6gIne9AZiQZbvE46/qYmbn5GsziInxIaLv/YSYPObmcf7V3f8w97msXnn6yjXMCf5qYtZ+H9GAVwGnuvuMOu9dRtwLfh/bL9cmEf+5yE7/oUge4xiis51GDJTNjezbT7zF8Sax/d56y+dooLz16sca2O4BInHVeq+/8vRXxnqxnFpZ1jpxFj8XnbNerJur6qLYvvI4p2Z8O52/v/5Vp/4basfq8tY6f626beC8A7VBzTquOE5RdzvE3E98jR6vuj/WKnetsVlZj432vQHjHIpx2M/46+ujDcTXaP3UrM8G8t5O5xgwd7n7sP0jVrkQl0En5s8n9vPerIrXip9PLN6vdfyK48zKbRvat4HjnVhxvJbP0UB5a9ZPo9vVe2+A8tQtYz+xnFirXJVx1vi5Zqy16qLWvvXO31/ZW+kr1eerdf5a9dhknzqx3jn66T+zBtp3kMer1QbVx601NivrsaG+10icDdTZoMdhE+WtOw4bqJ+ax28wjp326e+f/tiYiEhJDfeHrCIiMkSU4EVESkoJXkSkpJTgRURKSgleRKSk/j8CiiUqpFq0MgAAAABJRU5ErkJggg==\n", 80 | "text/plain": [ 81 | "
" 82 | ] 83 | }, 84 | "metadata": { 85 | "needs_background": "light" 86 | }, 87 | "output_type": "display_data" 88 | } 89 | ], 90 | "source": [ 91 | "dendrogram = sch.dendrogram(sch.linkage(points, method = \"ward\"))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 8, 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "" 105 | ] 106 | }, 107 | "execution_count": 8, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | }, 111 | { 112 | "data": { 113 | "image/png": "\n", 114 | "text/plain": [ 115 | "
" 116 | ] 117 | }, 118 | "metadata": { 119 | "needs_background": "light" 120 | }, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "plt.scatter(dataset[0][:,0], dataset[0][:,1])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Episode 4 : Perform the actual clustering" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 9, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "hc = AgglomerativeClustering(n_clusters=4, affinity = \"euclidean\", linkage = \"ward\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "y_hc = hc.fit_predict(points)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 15, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "" 162 | ] 163 | }, 164 | "execution_count": 15, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | }, 168 | { 169 | "data": { 170 | "image/png": "\n", 171 | "text/plain": [ 172 | "
" 173 | ] 174 | }, 175 | "metadata": { 176 | "needs_background": "light" 177 | }, 178 | "output_type": "display_data" 179 | } 180 | ], 181 | "source": [ 182 | "plt.scatter(points[y_hc == 0,0], points[y_hc == 0,1], s = 100, c = \"cyan\")\n", 183 | "plt.scatter(points[y_hc == 1,0], points[y_hc == 1,1], s = 100, c = \"yellow\")\n", 184 | "plt.scatter(points[y_hc == 2,0], points[y_hc == 2,1], s = 100, c = \"red\")\n", 185 | "plt.scatter(points[y_hc == 3,0], points[y_hc == 3,1], s = 100, c = \"green\")" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.8.3" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 4 210 | } 211 | -------------------------------------------------------------------------------- /MACHINE_LEARNING/K_NEAREST_NEIGHBORS_ALGORITHM/.ipynb_checkpoints/KNN DÜZELTME-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 4 6 | } 7 | -------------------------------------------------------------------------------- /MACHINE_LEARNING/PRINCIPAL_COMPONENT_ANALYSIS(PCA)/PCA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OakAcademy/Machine-Learning-with-Python/41fb3da8eeb9058649703d6f83a0a583a2cd4388/MACHINE_LEARNING/PRINCIPAL_COMPONENT_ANALYSIS(PCA)/PCA.png -------------------------------------------------------------------------------- /MACHINE_LEARNING/RECOMMENDER_SYSTEMS_ALGORITHM/.ipynb_checkpoints/advanced_recommender_systems_with_python-checkpoint.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "# Advanced Recommender Systems with Python\n", 9 | "\n", 10 | "Welcome to the notebook for creating \"Advanced Recommender Systems\" with Python. This is an optional lesson notebook for you to check out. Due to the math level currently used and the heavy use of SciPy here, we have not posted videos for this lesson.\n", 11 | "\n", 12 | "Recommendation Systems are often based on larger data sets and need to be organized in a particular way. Therefore, we will not have a project compatible with this topic, but rather a more intense review of building a recommendation system with Python with the same MovieLens dataset.\n", 13 | "\n", 14 | "*Note: The real math behind Recommender systems is quite heavy in Linear Algebra.*\n", 15 | "___" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Methods Used\n", 23 | "\n", 24 | "The two most common types of recommender systems are **Content-Based** and **Collaborative Filtering (CF)**. \n", 25 | "\n", 26 | "* Collaborative filtering generates recommendations based on the knowledge of users' attitudes towards items, that is, it uses the \"wisdom of the crowd\" to suggest items.\n", 27 | "* Content-based recommendation systems focus on the characteristics of the items and give you suggestions based on the similarity between them.\n", 28 | "\n", 29 | "## Collaborative Filtering (CF)\n", 30 | "\n", 31 | "In general, Collaborative filtering (CF) is more commonly used than content-based systems because it usually gives better results and is relatively easy to understand (from a general application perspective). 
The algorithm has the ability to do feature learning on its own, which means it can start learning by itself what features to use.\n", 32 | "\n", 33 | "CF can be divided into **Memory-Based Collaborative Filtering** and **Model-Based Collaborative Filtering**. \n", 34 | "\n", 35 | "In this tutorial, we will implement Model-Based CF by using singular value decomposition (SVD) and Memory-Based CF by computing cosine similarity. \n", 36 | "\n", 37 | "## The Data\n", 38 | "\n", 39 | "We will use the famous MovieLens dataset, which is one of the most common datasets used when implementing and testing recommender engines. It contains 100k movie ratings from 943 users and a selection of 1682 movies.\n", 40 | "\n", 41 | "You can download the dataset [here](http://files.grouplens.org/datasets/movielens/ml-100k.zip) or just use the u.data file that is already included in this folder." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Episode 1: Getting Started\n", 49 | "\n", 50 | "Let's import some libraries we will need:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np\n", 60 | "import pandas as pd" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "We can then read in the **u.data** file, which contains the full dataset. You can read a brief description of the dataset [here](http://files.grouplens.org/datasets/movielens/ml-100k-README.txt).\n", 68 | "\n", 69 | "Note how we specify the separator argument for a tab-separated file." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "column_names = ['id_user', 'item_id', 'rate', 'timestamp']\n", 79 | "df = pd.read_csv('u.data', sep='\\t', names=column_names)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Let's take a quick look at the data." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/html": [ 97 | "
\n", 98 | "\n", 111 | "\n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
id_useritem_idratetimestamp
00505881250949
101725881250949
201331881250949
31962423881250949
41863023891717742
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " id_user item_id rate timestamp\n", 163 | "0 0 50 5 881250949\n", 164 | "1 0 172 5 881250949\n", 165 | "2 0 133 1 881250949\n", 166 | "3 196 242 3 881250949\n", 167 | "4 186 302 3 891717742" 168 | ] 169 | }, 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "df.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Note how we only have the id_item, not the movie name. We can use the Movie csv file to grab the movie names and merge it with this dataframe:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 4, 189 | "metadata": { 190 | "scrolled": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/html": [ 196 | "
\n", 197 | "\n", 210 | "\n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | "
item_idtitle
01Toy Story (1995)
12GoldenEye (1995)
23Four Rooms (1995)
34Get Shorty (1995)
45Copycat (1995)
\n", 246 | "
" 247 | ], 248 | "text/plain": [ 249 | " item_id title\n", 250 | "0 1 Toy Story (1995)\n", 251 | "1 2 GoldenEye (1995)\n", 252 | "2 3 Four Rooms (1995)\n", 253 | "3 4 Get Shorty (1995)\n", 254 | "4 5 Copycat (1995)" 255 | ] 256 | }, 257 | "execution_count": 4, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "movie_titles = pd.read_csv(\"Movie\")\n", 264 | "movie_titles.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Episode 2: Merge" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Then merge the dataframes:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 5, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/html": [ 291 | "
\n", 292 | "\n", 305 | "\n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | "
id_useritem_idratetimestamptitle
00505881250949Star Wars (1977)
1290505880473582Star Wars (1977)
279504891271545Star Wars (1977)
32505888552084Star Wars (1977)
48505879362124Star Wars (1977)
\n", 359 | "
" 360 | ], 361 | "text/plain": [ 362 | " id_user item_id rate timestamp title\n", 363 | "0 0 50 5 881250949 Star Wars (1977)\n", 364 | "1 290 50 5 880473582 Star Wars (1977)\n", 365 | "2 79 50 4 891271545 Star Wars (1977)\n", 366 | "3 2 50 5 888552084 Star Wars (1977)\n", 367 | "4 8 50 5 879362124 Star Wars (1977)" 368 | ] 369 | }, 370 | "execution_count": 5, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "df = pd.merge(df,movie_titles, on = \"item_id\")\n", 377 | "df.head()" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Now let's take a quick look at the number of unique users and movies." 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 6, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "Num. of Users: 944\n", 397 | "Num of Movies: 1682\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "n_users = df.id_user.nunique()\n", 403 | "n_items = df.item_id.nunique()\n", 404 | "\n", 405 | "print('Num. of Users: '+ str(n_users))\n", 406 | "print('Num of Movies: '+str(n_items))" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## Episode 3: Train Test Split\n", 414 | "\n", 415 | "Recommender Systems are very difficult to evaluate due to their nature, but we'll show you how to evaluate them in this tutorial. To do this, we will divide our data into two groups. However, we will not do our classic X_train, X_test, y_train, y_test split. 
Instead, we can split the data into two data sets:" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 8, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "from sklearn.model_selection import train_test_split\n", 425 | "train_data, test_data = train_test_split(df, test_size=0.25)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Episode 4 : Memory-Based Collaborative Filtering\n", 433 | "\n", 434 | "Memory-Based Collaborative Filtering approaches can be divided into two main sections: **user-item filtering** and **item-item filtering**. \n", 435 | "\n", 436 | "A *user-item filtering* will take a particular user, find users that are similar to that user based on similarity of ratings, and recommend items that those similar users liked. \n", 437 | "\n", 438 | "In contrast, *item-item filtering* will take an item, find users who liked that item, and find other items that those users or similar users also liked. It takes items and outputs other items as recommendations. \n", 439 | "\n", 440 | "* *Item-Item Collaborative Filtering*: “Users who liked this item also liked …”\n", 441 | "* *User-Item Collaborative Filtering*: “Users who are similar to you also liked …”" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "In both cases, you create a user-item matrix which is built from the entire dataset.\n", 449 | "\n", 450 | "Since we have split the data into testing and training we will need to create two ``[n_users x n_items]`` matrices (all users by all movies). \n", 451 | "\n", 452 | "The training matrix contains 75% of the ratings and the testing matrix contains 25% of the ratings. 
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Example of user-item matrix:\n", 460 | "\"blog8\"/" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "After you have built the user-item matrix you calculate the similarity and create a similarity matrix. \n", 468 | "\n", 469 | "The similarity values between items in *Item-Item Collaborative Filtering* are measured by observing all the users who have rated both items. \n", 470 | "\n", 471 | "" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "For *User-Item Collaborative Filtering* the similarity values between users are measured by observing all the items that are rated by both users.\n", 479 | "\n", 480 | "" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "A distance metric commonly used in recommender systems is *cosine similarity*, where the ratings are seen as vectors in ``n``-dimensional space and the similarity is calculated based on the angle between these vectors. \n", 488 | "Cosine similarity for users *k* and *a* can be calculated using the formula below, where you take the dot product of the user vector *$u_k$* and the user vector *$u_a$* and divide it by the product of the Euclidean lengths of the vectors ($x_{k,m}$ denotes the rating user *k* gave to item *m*).\n", 489 | "$$s_u^{cos}(u_k,u_a)=\\frac{u_k \\cdot u_a}{\\lVert u_k \\rVert \\lVert u_a \\rVert}=\\frac{\\sum_m x_{k,m}x_{a,m}}{\\sqrt{\\sum_m x_{k,m}^2 \\sum_m x_{a,m}^2}}$$\n", 490 | "\n", 491 | "To calculate similarity between items *m* and *b* you use the formula:\n", 492 | "$$s_i^{cos}(i_m,i_b)=\\frac{i_m \\cdot i_b}{\\lVert i_m \\rVert \\lVert i_b \\rVert}=\\frac{\\sum_a x_{a,m}x_{a,b}}{\\sqrt{\\sum_a x_{a,m}^2 \\sum_a x_{a,b}^2}}$$\n", 493 | "\n", 495 | "\n", 496 | "Your first step will be to create the user-item matrix. 
Since you have both testing and training data you need to create two matrices. 
" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 12, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "#Create two user-item matrices, one for training and another for testing\n", 506 | "train_data_matrix = np.zeros((n_users, n_items))\n", 507 | "for line in train_data.itertuples():\n", 508 | " train_data_matrix[line[1]-1, line[2]-1] = line[3] \n", 509 | "\n", 510 | "test_data_matrix = np.zeros((n_users, n_items))\n", 511 | "for line in test_data.itertuples():\n", 512 | " test_data_matrix[line[1]-1, line[2]-1] = line[3]" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "You can use the [pairwise_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html) function from sklearn with `metric='cosine'`. Note that this returns the cosine *distance* (1 - cosine similarity), not the similarity itself; subtract the result from 1 to obtain the cosine similarity. Either quantity ranges from 0 to 1 here since the ratings are all non-negative." 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 13, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "from sklearn.metrics.pairwise import pairwise_distances\n", 529 | "user_similarity = pairwise_distances(train_data_matrix, metric='cosine')\n", 530 | "item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "Next step is to make predictions. 
You have already created similarity matrices: `user_similarity` and `item_similarity` and therefore you can make a prediction by applying the following formula for user-based CF:\n", 538 | "\n", 539 | "$$\\hat{x}_{k,m} = \\bar{x}_k + \\frac{\\sum_{u_a} sim_u(u_k,u_a)\\,(x_{a,m}-\\bar{x}_a)}{\\sum_{u_a} \\lvert sim_u(u_k,u_a) \\rvert}$$\n", 540 | "\n", 541 | "You can look at the similarity between users *k* and *a* as weights that are multiplied by the ratings of a similar user *a* (corrected for the average rating of that user). 
You will need to normalize it so that the ratings stay between 1 and 5 and, as a final step, sum the average ratings for the user that you are trying to predict. \n", 542 | "\n", 543 | "The idea here is that some users may tend always to give high or low ratings to all movies. The relative difference in the ratings that these users give is more important than the absolute values. To give an example: suppose, user *k* gives 4 stars to his favourite movies and 3 stars to all other good movies. Suppose now that another user *t* rates movies that he/she likes with 5 stars, and the movies he/she fell asleep over with 3 stars. These two users could have a very similar taste but treat the rating system differently. \n", 544 | "\n", 545 | "When making a prediction for item-based CF you don't need to correct for the user's average rating, since the query user's own ratings are used to make the predictions.\n", 546 | "\n", 547 | "$$\\hat{x}_{k,m} = \\frac{\\sum_{i_b} sim_i(i_m,i_b)\\,x_{k,b}}{\\sum_{i_b} \\lvert sim_i(i_m,i_b) \\rvert}$$" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 14, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "def predict(ratings, similarity, type='user'):\n", 557 | " if type == 'user':\n", 558 | " mean_user_rating = ratings.mean(axis=1)\n", 559 | " #You use np.newaxis so that mean_user_rating has same format as ratings\n", 560 | " ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) \n", 561 | " pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T\n", 562 | " elif type == 'item':\n", 563 | " pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) \n", 564 | " return pred" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 15, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "item_prediction = predict(train_data_matrix, item_similarity, type='item')\n", 574 | 
"user_prediction = predict(train_data_matrix, user_similarity, type='user')" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | 
"metadata": {}, 580 | "source": [ 581 | "### Evaluation\n", 582 | "There are many evaluation metrics, but one of the most popular metrics used to evaluate the accuracy of predicted ratings is *Root Mean Squared Error (RMSE)*. \n", 583 | "$$RMSE = \\sqrt{\\frac{1}{N}\\sum_{i=1}^{N}(x_i - \\hat{x}_i)^2}$$\n", 584 | "\n", 585 | "You can use the [mean_squared_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html) (MSE) function from `sklearn`, where the RMSE is just the square root of MSE. To read more about different evaluation metrics you can take a look at [this article](http://research.microsoft.com/pubs/115396/EvaluationMetrics.TR.pdf). " 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "Since you only want to consider predicted ratings that are in the test dataset, you filter out all other elements in the prediction matrix with `prediction[ground_truth.nonzero()]`. 
" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 16, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "from sklearn.metrics import mean_squared_error\n", 602 | "from math import sqrt\n", 603 | "def rmse(prediction, ground_truth):\n", 604 | " prediction = prediction[ground_truth.nonzero()].flatten() \n", 605 | " ground_truth = ground_truth[ground_truth.nonzero()].flatten()\n", 606 | " return sqrt(mean_squared_error(prediction, ground_truth))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 17, 612 | "metadata": { 613 | "scrolled": true 614 | }, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "User-based CF RMSE: 3.1303583292912287\n", 621 | "Item-based CF RMSE: 3.456367296774751\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))\n", 627 | "print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))" 628 | ] 629 | }, 630 | { 631 | "cell_type": 
"markdown", 632 | "metadata": { 633 | "collapsed": true 634 | }, 635 | "source": [ 636 | "Memory-based algorithms are easy to implement and produce reasonable prediction quality. \n", 637 | "The drawback of memory-based CF is that it doesn't scale to real-world scenarios and doesn't address the well-known cold-start problem, that is when new user or new item enters the system. Model-based CF methods are scalable and can deal with higher sparsity level than memory-based models, but also suffer when new users or items that don't have any ratings enter the system. I would like to thank Ethan Rosenthal for his [post](http://blog.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/) about Memory-Based Collaborative Filtering. " 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": {}, 643 | "source": [ 644 | "# Model-based Collaborative Filtering\n", 645 | "\n", 646 | "Model-based Collaborative Filtering is based on **matrix factorization (MF)** which has received greater exposure, mainly as an unsupervised learning method for latent variable decomposition and dimensionality reduction. Matrix factorization is widely used for recommender systems where it can deal better with scalability and sparsity than Memory-based CF. The goal of MF is to learn the latent preferences of users and the latent attributes of items from known ratings (learn features that describe the characteristics of ratings) to then predict the unknown ratings through the dot product of the latent features of users and items. \n", 647 | "When you have a very sparse matrix, with a lot of dimensions, by doing matrix factorization you can restructure the user-item matrix into low-rank structure, and you can represent the matrix by the multiplication of two low-rank matrices, where the rows contain the latent vector. 
You fit this matrix to approximate your original matrix, as closely as possible, by multiplying the low-rank matrices together, which fills in the entries missing in the original matrix.\n", 648 | "\n", 649 | "Let's calculate the sparsity level of MovieLens dataset:" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 18, 655 | "metadata": {}, 656 | "outputs": [ 657 | { 658 | "name": "stdout", 659 | "output_type": "stream", 660 | "text": [ 661 | "The sparsity level of MovieLens100K is 93.7%\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "sparsity=round(1.0-len(df)/float(n_users*n_items),3)\n", 667 | "print('The sparsity level of MovieLens100K is ' + str(sparsity*100) + '%')" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "To give an example of the learned latent preferences of the users and items: let's say for the MovieLens dataset you have the following information: _(user id, age, location, gender, movie id, director, actor, language, year, rating)_. By applying matrix factorization the model learns that important user features are _age group (under 10, 10-18, 18-30, 30-90)_, _location_ and _gender_, and for movie features it learns that _decade_, _director_ and _actor_ are most important. Now if you look into the information you have stored, there is no such feature as the _decade_, but the model can learn on its own. The important aspect is that the CF model only uses data (user_id, movie_id, rating) to learn the latent features. If there is little data available model-based CF model will predict poorly, since it will be more difficult to learn the latent features. \n", 675 | "\n", 676 | "Models that use both ratings and content features are called **Hybrid Recommender Systems** where both Collaborative Filtering and Content-based Models are combined. 
Hybrid recommender systems usually show higher accuracy than Collaborative Filtering or Content-based Models on their own: they are capable of addressing the cold-start problem better since if you don't have any ratings for a user or an item you could use the metadata from the user or item to make a prediction. " 677 | ] 678 | }, 679 | { 680 | "cell_type": "markdown", 681 | "metadata": {}, 682 | "source": [ 683 | "### SVD\n", 684 | "A well-known matrix factorization method is **Singular value decomposition (SVD)**. Collaborative Filtering can be formulated by approximating a matrix `X` by using singular value decomposition. The winning team at the Netflix Prize competition used SVD matrix factorization models to produce product recommendations, for more information I recommend to read articles: [Netflix Recommendations: Beyond the 5 stars](http://techblog.netflix.com/2012/04/netflix-recommendations-beyond-5-stars.html) and [Netflix Prize and SVD](http://buzzard.ups.edu/courses/2014spring/420projects/math420-UPS-spring-2014-gower-netflix-SVD.pdf).\n", 685 | "The general equation can be expressed as follows:\n", 686 | "$$X = U S V^T$$\n", 687 | "\n", 688 | "\n", 689 | "Given `m x n` matrix `X`:\n", 690 | "* *`U`* is an *`(m x r)`* orthogonal matrix\n", 691 | "* *`S`* is an *`(r x r)`* diagonal matrix with non-negative real numbers on the diagonal\n", 692 | "* *`V^T`* is an *`(r x n)`* orthogonal matrix\n", 693 | "\n", 694 | "Elements on the diagonal in `S` are known as *singular values of `X`*. \n", 695 | "\n", 696 | "\n", 697 | "Matrix *`X`* can be factorized to *`U`*, *`S`* and *`V`*. 
The *`U`* matrix represents the feature vectors corresponding to the users in the hidden feature space and the *`V`* matrix represents the feature vectors corresponding to the items in the hidden feature space.\n", 698 | "\n", 699 | "\n", 700 | "Now you can make a prediction by taking dot product of *`U`*, *`S`* and *`V^T`*.\n", 701 | "\n" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 19, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "name": "stdout", 711 | "output_type": "stream", 712 | "text": [ 713 | "User-based CF MSE: 2.727678702344757\n" 714 | ] 715 | } 716 | ], 717 | "source": [ 718 | "import scipy.sparse as sp\n", 719 | "from scipy.sparse.linalg import svds\n", 720 | "\n", 721 | "#get SVD components from train matrix. Choose k.\n", 722 | "u, s, vt = svds(train_data_matrix, k = 20)\n", 723 | "s_diag_matrix=np.diag(s)\n", 724 | "X_pred = np.dot(np.dot(u, s_diag_matrix), vt)\n", 725 | "print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "Carelessly addressing only the relatively few known entries is highly prone to overfitting. SVD can be very slow and computationally expensive. More recent work minimizes the squared error by applying alternating least square or stochastic gradient descent and uses regularization terms to prevent overfitting. 
Alternating least square and stochastic gradient descent methods for CF will be covered in the next tutorials.\n" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "Review:\n", 740 | "\n", 741 | "* We have covered how to implement simple **Collaborative Filtering** methods, both memory-based CF and model-based CF.\n", 742 | "* **Memory-based models** are based on similarity between items or users, where we use cosine-similarity.\n", 743 | "* **Model-based CF** is based on matrix factorization where we use SVD to factorize the matrix.\n", 744 | "* Building recommender systems that perform well in cold-start scenarios (where little data is available on new users and items) remains a challenge. The standard collaborative filtering method performs poorly in such settings. " 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "## Looking for more?\n", 752 | "\n", 753 | "If you want to tackle your own recommendation system analysis, check out these data sets. Note: The files are quite large in most cases, not all the links may stay up to host the data, but the majority of them still work. Or just Google for your own data set!\n", 754 | "\n", 755 | "**Movies Recommendation:**\n", 756 | "\n", 757 | "MovieLens - Movie Recommendation Data Sets http://www.grouplens.org/node/73\n", 758 | "\n", 759 | "Yahoo! 
- Movie, Music, and Images Ratings Data Sets http://webscope.sandbox.yahoo.com/catalog.php?datatype=r\n", 760 | "\n", 761 | "Jester - Movie Ratings Data Sets (Collaborative Filtering Dataset) http://www.ieor.berkeley.edu/~goldberg/jester-data/\n", 762 | "\n", 763 | "Cornell University - Movie-review data for use in sentiment-analysis experiments http://www.cs.cornell.edu/people/pabo/movie-review-data/\n", 764 | "\n", 765 | "**Music Recommendation:**\n", 766 | "\n", 767 | "Last.fm - Music Recommendation Data Sets http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/index.html\n", 768 | "\n", 769 | "Yahoo! - Movie, Music, and Images Ratings Data Sets http://webscope.sandbox.yahoo.com/catalog.php?datatype=r\n", 770 | "\n", 771 | "Audioscrobbler - Music Recommendation Data Sets http://www-etud.iro.umontreal.ca/~bergstrj/audioscrobbler_data.html\n", 772 | "\n", 773 | "Amazon - Audio CD recommendations http://131.193.40.52/data/\n", 774 | "\n", 775 | "**Books Recommendation:**\n", 776 | "\n", 777 | "Institut für Informatik, Universität Freiburg - Book Ratings Data Sets http://www.informatik.uni-freiburg.de/~cziegler/BX/\n", 778 | "Food Recommendation:\n", 779 | "\n", 780 | "Chicago Entree - Food Ratings Data Sets http://archive.ics.uci.edu/ml/datasets/Entree+Chicago+Recommendation+Data\n", 781 | "Merchandise Recommendation:\n", 782 | "\n", 783 | "**Healthcare Recommendation:**\n", 784 | "\n", 785 | "Nursing Home - Provider Ratings Data Set http://data.medicare.gov/dataset/Nursing-Home-Compare-Provider-Ratings/mufm-vy8d\n", 786 | "\n", 787 | "Hospital Ratings - Survey of Patients Hospital Experiences http://data.medicare.gov/dataset/Survey-of-Patients-Hospital-Experiences-HCAHPS-/rj76-22dk\n", 788 | "\n", 789 | "**Dating Recommendation:**\n", 790 | "\n", 791 | "www.libimseti.cz - Dating website recommendation (collaborative filtering) http://www.occamslab.com/petricek/data/\n", 792 | "Scholarly Paper Recommendation:\n", 793 | "\n", 794 | "National University of 
Singapore - Scholarly Paper Recommendation http://www.comp.nus.edu.sg/~sugiyama/SchPaperRecData.html\n" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [] 803 | } 804 | ], 805 | "metadata": { 806 | "kernelspec": { 807 | "display_name": "Python 3", 808 | "language": "python", 809 | "name": "python3" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 3 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython3", 821 | "version": "3.8.3" 822 | } 823 | }, 824 | "nbformat": 4, 825 | "nbformat_minor": 1 826 | } 827 | -------------------------------------------------------------------------------- /MACHINE_LEARNING/RECOMMENDER_SYSTEMS_ALGORITHM/advanced_recommender_systems_with_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "# Advanced Recommender Systems with Python\n", 9 | "\n", 10 | "Welcome to the notebook for creating \"Advanced Recommender Systems\" with Python. This is an optional lesson notebook for you to check out. Due to the math level currently used and the heavy use of SciPy here, we have not posted videos for this lesson.\n", 11 | "\n", 12 | "Recommendation Systems are often based on larger data sets and need to be organized in a particular way. 
Therefore, we will not have a project compatible with this topic, but rather a more intense review of building a recommendation system with Python with the same Movie Lens Dataset.\n", 13 | "\n", 14 | "*Note: The real math behind Recommender systems is quite heavy in Linear Algebra.*\n", 15 | "___" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Methods Used\n", 23 | "\n", 24 | "Two most common types of recommender systems are **Content-Based** and **Collaborative Filtering (CF)**. \n", 25 | "\n", 26 | "* Collaborative filtering generates recommendations based on the knowledge of users' attitudes towards items, that is, it uses the \"wisdom of the crowd\" to suggest items.\n", 27 | "* Content-based recommendation systems focus on the characteristics of the items and give you suggestions based on the similarity between them.\n", 28 | "\n", 29 | "## Collaborative Filtering (CF)\n", 30 | "\n", 31 | "In general, Collaborative filtering (CF) is more commonly used than content-based systems because it usually gives better results and is relatively easy to understand (from a general application perspective). The algorithm has the ability to do feature learning on its own, which means it can start learning by itself what features to use.\n", 32 | "\n", 33 | "CF can be divided into **Memory-Based Collaborative Filtering** and **Model-Based Collaborative filtering**. \n", 34 | "\n", 35 | "In this tutorial, we will implement Model-Based CF by using singular value decomposition (SVD) and Memory-Based CF by computing cosine similarity. \n", 36 | "\n", 37 | "## The Data\n", 38 | "\n", 39 | "We will use famous MovieLens dataset, which is one of the most common datasets used when implementing and testing recommender engines. 
It contains 100k movie ratings from 943 users and a selection of 1682 movies.\n", 40 | "\n", 41 | "You can download the dataset [here](http://files.grouplens.org/datasets/movielens/ml-100k.zip) or just use the u.data file that is already included in this folder." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Episode 1: Getting Started\n", 49 | "\n", 50 | "Let's import some libraries we will need:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np\n", 60 | "import pandas as pd" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "We can then read in the **u.data** file, which contains the full dataset. You can read a brief description of the dataset [here](http://files.grouplens.org/datasets/movielens/ml-100k-README.txt).\n", 68 | "\n", 69 | "Note how we specify the separator argument for a Tab separated file." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "column_names = ['id_user', 'item_id', 'rate', 'timestamp']\n", 79 | "df = pd.read_csv('u.data', sep='\\t', names=column_names)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Let's take a quick look at the data." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/html": [ 97 | "
\n", 98 | "\n", 111 | "\n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
id_useritem_idratetimestamp
00505881250949
101725881250949
201331881250949
31962423881250949
41863023891717742
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " id_user item_id rate timestamp\n", 163 | "0 0 50 5 881250949\n", 164 | "1 0 172 5 881250949\n", 165 | "2 0 133 1 881250949\n", 166 | "3 196 242 3 881250949\n", 167 | "4 186 302 3 891717742" 168 | ] 169 | }, 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "df.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "Note how we only have the id_item, not the movie name. We can use the Movie csv file to grab the movie names and merge it with this dataframe:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 4, 189 | "metadata": { 190 | "scrolled": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/html": [ 196 | "
\n", 197 | "\n", 210 | "\n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | "
item_idtitle
01Toy Story (1995)
12GoldenEye (1995)
23Four Rooms (1995)
34Get Shorty (1995)
45Copycat (1995)
\n", 246 | "
" 247 | ], 248 | "text/plain": [ 249 | " item_id title\n", 250 | "0 1 Toy Story (1995)\n", 251 | "1 2 GoldenEye (1995)\n", 252 | "2 3 Four Rooms (1995)\n", 253 | "3 4 Get Shorty (1995)\n", 254 | "4 5 Copycat (1995)" 255 | ] 256 | }, 257 | "execution_count": 4, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "movie_titles = pd.read_csv(\"Movie\")\n", 264 | "movie_titles.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Episode 2: Merge" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Then merge the dataframes:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 5, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/html": [ 291 | "
\n", 292 | "\n", 305 | "\n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | "
id_useritem_idratetimestamptitle
00505881250949Star Wars (1977)
1290505880473582Star Wars (1977)
279504891271545Star Wars (1977)
32505888552084Star Wars (1977)
48505879362124Star Wars (1977)
\n", 359 | "
" 360 | ], 361 | "text/plain": [ 362 | " id_user item_id rate timestamp title\n", 363 | "0 0 50 5 881250949 Star Wars (1977)\n", 364 | "1 290 50 5 880473582 Star Wars (1977)\n", 365 | "2 79 50 4 891271545 Star Wars (1977)\n", 366 | "3 2 50 5 888552084 Star Wars (1977)\n", 367 | "4 8 50 5 879362124 Star Wars (1977)" 368 | ] 369 | }, 370 | "execution_count": 5, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "df = pd.merge(df,movie_titles, on = \"item_id\")\n", 377 | "df.head()" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Now let's take a quick look at the number of unique users and movies." 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 6, 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "Num. of Users: 944\n", 397 | "Num of Movies: 1682\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "n_users = df.id_user.nunique()\n", 403 | "n_items = df.item_id.nunique()\n", 404 | "\n", 405 | "print('Num. of Users: '+ str(n_users))\n", 406 | "print('Num of Movies: '+str(n_items))" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## Episode 3: Train Test Split\n", 414 | "\n", 415 | "Recommender Systems are very difficult to evaluate due to their nature, but we'll show you how to evaluate them in this tutorial. To do this, we will divide our data into two groups. However, we will not do our classic X_train, X_test, y_train, y_test split. 
Instead, we can split the data into two data sets:" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 8, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "from sklearn.model_selection import train_test_split\n", 425 | "train_data, test_data = train_test_split(df, test_size=0.25)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Episode 4 : Memory-Based Collaborative Filtering\n", 433 | "\n", 434 | "Memory-Based Collaborative Filtering approaches can be divided into two main sections: **user-item filtering** and **item-item filtering**. \n", 435 | "\n", 436 | "A *user-item filtering* will take a particular user, find users that are similar to that user based on similarity of ratings, and recommend items that those similar users liked. \n", 437 | "\n", 438 | "In contrast, *item-item filtering* will take an item, find users who liked that item, and find other items that those users or similar users also liked. It takes items and outputs other items as recommendations. \n", 439 | "\n", 440 | "* *Item-Item Collaborative Filtering*: “Users who liked this item also liked …”\n", 441 | "* *User-Item Collaborative Filtering*: “Users who are similar to you also liked …”" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "In both cases, you create a user-item matrix which is built from the entire dataset.\n", 449 | "\n", 450 | "Since we have split the data into testing and training we will need to create two ``[944 x 1682]`` matrices (all users by all movies). \n", 451 | "\n", 452 | "The training matrix contains 75% of the ratings and the testing matrix contains 25% of the ratings. 
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Example of user-item matrix:\n", 460 | "\"blog8\"/" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "After you have built the user-item matrix you calculate the similarity and create a similarity matrix. \n", 468 | "\n", 469 | "The similarity values between items in *Item-Item Collaborative Filtering* are measured by observing all the users who have rated both items. \n", 470 | "\n", 471 | "" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "For *User-Item Collaborative Filtering* the similarity values between users are measured by observing all the items that are rated by both users.\n", 479 | "\n", 480 | "" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "A similarity measure commonly used in recommender systems is *cosine similarity*, where the ratings are seen as vectors in ``n``-dimensional space and the similarity is calculated based on the angle between these vectors. \n", 488 | "Cosine similarity for users *k* and *a* can be calculated using the formula below, where you take the dot product of the user vector *$u_k$* and the user vector *$u_a$* and divide it by the product of the Euclidean lengths of the vectors.\n", 489 | "\n", 490 | "\n", 491 | "To calculate similarity between items *m* and *b* you use the formula:\n", 492 | "\n", 493 | "\n", 495 | "\n", 496 | "Your first step will be to create the user-item matrix. Since you have both testing and training data you need to create two matrices. 
" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 12, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "#Create two user-item matrices, one for training and another for testing\n", 506 | "train_data_matrix = np.zeros((n_users, n_items))\n", 507 | "for line in train_data.itertuples():\n", 508 | " train_data_matrix[line[1]-1, line[2]-1] = line[3] \n", 509 | "\n", 510 | "test_data_matrix = np.zeros((n_users, n_items))\n", 511 | "for line in test_data.itertuples():\n", 512 | " test_data_matrix[line[1]-1, line[2]-1] = line[3]" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "You can use the [pairwise_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html) function from sklearn with `metric='cosine'` for this. Note that it returns the cosine *distance*, i.e. 1 - cosine similarity, rather than the similarity itself; the output will range from 0 to 1 since the ratings are all non-negative." 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 13, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "from sklearn.metrics.pairwise import pairwise_distances\n", 529 | "user_similarity = pairwise_distances(train_data_matrix, metric='cosine')\n", 530 | "item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "The next step is to make predictions. You have already created similarity matrices: `user_similarity` and `item_similarity` and therefore you can make a prediction by applying the following formula for user-based CF:\n", 538 | "\n", 539 | "\n", 540 | "\n", 541 | "You can look at the similarity between users *k* and *a* as weights that are multiplied by the ratings of a similar user *a* (corrected for the average rating of that user). 
You will need to normalize it so that the ratings stay between 1 and 5 and, as a final step, add back the average rating of the user that you are trying to predict for. \n", 542 | "\n", 543 | "The idea here is that some users may tend always to give high or low ratings to all movies. The relative difference in the ratings that these users give is more important than the absolute values. To give an example: suppose user *k* gives 4 stars to his favourite movies and 3 stars to all other good movies. Suppose now that another user *t* rates movies that he/she likes with 5 stars, and the movies he/she fell asleep over with 3 stars. These two users could have a very similar taste but treat the rating system differently. \n", 544 | "\n", 545 | "When making a prediction for item-based CF you don't need to correct for the user's average rating, since the query user's own ratings are used to make the predictions.\n", 546 | "\n", 547 | "" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 14, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "def predict(ratings, similarity, type='user'):\n", 557 | " if type == 'user':\n", 558 | " mean_user_rating = ratings.mean(axis=1)\n", 559 | " #You use np.newaxis so that mean_user_rating has same format as ratings\n", 560 | " ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) \n", 561 | " pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T\n", 562 | " elif type == 'item':\n", 563 | " pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)]) \n", 564 | " return pred" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 15, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "item_prediction = predict(train_data_matrix, item_similarity, type='item')\n", 574 | "user_prediction = predict(train_data_matrix, user_similarity, type='user')" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | 
"metadata": {}, 580 | "source": [ 581 | "### Evaluation\n", 582 | "There are many evaluation metrics, but one of the most popular metrics used to evaluate the accuracy of predicted ratings is *Root Mean Squared Error (RMSE)*. \n", 583 | "\n", 584 | "\n", 585 | "You can use the [mean_squared_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html) (MSE) function from `sklearn`, where the RMSE is just the square root of MSE. To read more about different evaluation metrics you can take a look at [this article](http://research.microsoft.com/pubs/115396/EvaluationMetrics.TR.pdf). " 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "Since you only want to consider predicted ratings that are in the test dataset, you filter out all other elements in the prediction matrix with `prediction[ground_truth.nonzero()]`. " 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 16, 598 | "metadata": {}, 599 | "outputs": [], 600 | "source": [ 601 | "from sklearn.metrics import mean_squared_error\n", 602 | "from math import sqrt\n", 603 | "def rmse(prediction, ground_truth):\n", 604 | " prediction = prediction[ground_truth.nonzero()].flatten() \n", 605 | " ground_truth = ground_truth[ground_truth.nonzero()].flatten()\n", 606 | " return sqrt(mean_squared_error(prediction, ground_truth))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 17, 612 | "metadata": { 613 | "scrolled": true 614 | }, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "User-based CF RMSE: 3.1303583292912287\n", 621 | "Item-based CF RMSE: 3.456367296774751\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))\n", 627 | "print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))" 628 | ] 629 | }, 630 | { 631 | "cell_type": 
"markdown", 632 | "metadata": { 633 | "collapsed": true 634 | }, 635 | "source": [ 636 | "Memory-based algorithms are easy to implement and produce reasonable prediction quality. \n", 637 | "The drawback of memory-based CF is that it doesn't scale to real-world scenarios and doesn't address the well-known cold-start problem, that is when new user or new item enters the system. Model-based CF methods are scalable and can deal with higher sparsity level than memory-based models, but also suffer when new users or items that don't have any ratings enter the system. I would like to thank Ethan Rosenthal for his [post](http://blog.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/) about Memory-Based Collaborative Filtering. " 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": {}, 643 | "source": [ 644 | "# Model-based Collaborative Filtering\n", 645 | "\n", 646 | "Model-based Collaborative Filtering is based on **matrix factorization (MF)** which has received greater exposure, mainly as an unsupervised learning method for latent variable decomposition and dimensionality reduction. Matrix factorization is widely used for recommender systems where it can deal better with scalability and sparsity than Memory-based CF. The goal of MF is to learn the latent preferences of users and the latent attributes of items from known ratings (learn features that describe the characteristics of ratings) to then predict the unknown ratings through the dot product of the latent features of users and items. \n", 647 | "When you have a very sparse matrix, with a lot of dimensions, by doing matrix factorization you can restructure the user-item matrix into low-rank structure, and you can represent the matrix by the multiplication of two low-rank matrices, where the rows contain the latent vector. 
You fit this matrix to approximate your original matrix, as closely as possible, by multiplying the low-rank matrices together, which fills in the entries missing in the original matrix.\n", 648 | "\n", 649 | "Let's calculate the sparsity level of MovieLens dataset:" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 18, 655 | "metadata": {}, 656 | "outputs": [ 657 | { 658 | "name": "stdout", 659 | "output_type": "stream", 660 | "text": [ 661 | "The sparsity level of MovieLens100K is 93.7%\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "sparsity=round(1.0-len(df)/float(n_users*n_items),3)\n", 667 | "print('The sparsity level of MovieLens100K is ' + str(sparsity*100) + '%')" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "To give an example of the learned latent preferences of the users and items: let's say for the MovieLens dataset you have the following information: _(user id, age, location, gender, movie id, director, actor, language, year, rating)_. By applying matrix factorization the model learns that important user features are _age group (under 10, 10-18, 18-30, 30-90)_, _location_ and _gender_, and for movie features it learns that _decade_, _director_ and _actor_ are most important. Now if you look into the information you have stored, there is no such feature as the _decade_, but the model can learn on its own. The important aspect is that the CF model only uses data (user_id, movie_id, rating) to learn the latent features. If there is little data available model-based CF model will predict poorly, since it will be more difficult to learn the latent features. \n", 675 | "\n", 676 | "Models that use both ratings and content features are called **Hybrid Recommender Systems** where both Collaborative Filtering and Content-based Models are combined. 
Hybrid recommender systems usually show higher accuracy than Collaborative Filtering or Content-based Models on their own: they are able to address the cold-start problem better, since if you don't have any ratings for a user or an item you could use the metadata from the user or item to make a prediction. " 677 | ] 678 | }, 679 | { 680 | "cell_type": "markdown", 681 | "metadata": {}, 682 | "source": [ 683 | "### SVD\n", 684 | "A well-known matrix factorization method is **Singular value decomposition (SVD)**. Collaborative Filtering can be formulated by approximating a matrix `X` by using singular value decomposition. The winning team at the Netflix Prize competition used SVD matrix factorization models to produce product recommendations; for more information I recommend reading the articles [Netflix Recommendations: Beyond the 5 stars](http://techblog.netflix.com/2012/04/netflix-recommendations-beyond-5-stars.html) and [Netflix Prize and SVD](http://buzzard.ups.edu/courses/2014spring/420projects/math420-UPS-spring-2014-gower-netflix-SVD.pdf).\n", 685 | "The general equation can be expressed as follows:\n", 686 | "\n", 687 | "\n", 688 | "\n", 689 | "Given an `m x n` matrix `X`:\n", 690 | "* *`U`* is an *`(m x r)`* orthogonal matrix\n", 691 | "* *`S`* is an *`(r x r)`* diagonal matrix with non-negative real numbers on the diagonal\n", 692 | "* *`V^T`* is an *`(r x n)`* orthogonal matrix\n", 693 | "\n", 694 | "Elements on the diagonal in `S` are known as *singular values of `X`*. \n", 695 | "\n", 696 | "\n", 697 | "Matrix *`X`* can be factorized to *`U`*, *`S`* and *`V`*. 
The *`U`* matrix represents the feature vectors corresponding to the users in the hidden feature space and the *`V`* matrix represents the feature vectors corresponding to the items in the hidden feature space.\n", 698 | "\n", 699 | "\n", 700 | "Now you can make a prediction by taking dot product of *`U`*, *`S`* and *`V^T`*.\n", 701 | "\n" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 19, 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "name": "stdout", 711 | "output_type": "stream", 712 | "text": [ 713 | "User-based CF MSE: 2.727678702344757\n" 714 | ] 715 | } 716 | ], 717 | "source": [ 718 | "import scipy.sparse as sp\n", 719 | "from scipy.sparse.linalg import svds\n", 720 | "\n", 721 | "#get SVD components from train matrix. Choose k.\n", 722 | "u, s, vt = svds(train_data_matrix, k = 20)\n", 723 | "s_diag_matrix=np.diag(s)\n", 724 | "X_pred = np.dot(np.dot(u, s_diag_matrix), vt)\n", 725 | "print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "Carelessly addressing only the relatively few known entries is highly prone to overfitting. SVD can be very slow and computationally expensive. More recent work minimizes the squared error by applying alternating least square or stochastic gradient descent and uses regularization terms to prevent overfitting. 
Alternating least squares and stochastic gradient descent methods for CF will be covered in the next tutorials.\n" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "Review:\n", 740 | "\n", 741 | "* We have covered how to implement simple **Collaborative Filtering** methods, both memory-based CF and model-based CF.\n", 742 | "* **Memory-based models** are based on similarity between items or users, where we use cosine-similarity.\n", 743 | "* **Model-based CF** is based on matrix factorization where we use SVD to factorize the matrix.\n", 744 | "* Building recommender systems that perform well in cold-start scenarios (where little data is available on new users and items) remains a challenge. The standard collaborative filtering method performs poorly in such settings. " 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "## Looking for more?\n", 752 | "\n", 753 | "If you want to tackle your own recommendation system analysis, check out these data sets. Note: the files are quite large in most cases, and not all of the links may still be up, but the majority of them still work. Or just Google for your own data set!\n", 754 | "\n", 755 | "**Movies Recommendation:**\n", 756 | "\n", 757 | "MovieLens - Movie Recommendation Data Sets http://www.grouplens.org/node/73\n", 758 | "\n", 759 | "Yahoo! - Movie, Music, and Images Ratings Data Sets http://webscope.sandbox.yahoo.com/catalog.php?datatype=r\n", 760 | "\n", 761 | "Jester - Movie Ratings Data Sets (Collaborative Filtering Dataset) http://www.ieor.berkeley.edu/~goldberg/jester-data/\n", 762 | "\n", 763 | "Cornell University - Movie-review data for use in sentiment-analysis experiments http://www.cs.cornell.edu/people/pabo/movie-review-data/\n", 764 | "\n", 765 | "**Music Recommendation:**\n", 766 | "\n", 767 | "\n", 768 | "Yahoo! 
- Movie, Music, and Images Ratings Data Sets http://webscope.sandbox.yahoo.com/catalog.php?datatype=r\n", 769 | "\n", 770 | "\n", 771 | "**Books Recommendation:**\n", 772 | "\n", 773 | "Institut für Informatik, Universität Freiburg - Book Ratings Data Sets http://www.informatik.uni-freiburg.de/~cziegler/BX/\n", 774 | "Food Recommendation:\n", 775 | "\n", 776 | "Chicago Entree - Food Ratings Data Sets http://archive.ics.uci.edu/ml/datasets/Entree+Chicago+Recommendation+Data\n", 777 | "Merchandise Recommendation:\n", 778 | "\n", 779 | "**Healthcare Recommendation:**\n", 780 | "\n", 781 | "Nursing Home - Provider Ratings Data Set http://data.medicare.gov/dataset/Nursing-Home-Compare-Provider-Ratings/mufm-vy8d\n", 782 | "\n", 783 | "Hospital Ratings - Survey of Patients Hospital Experiences http://data.medicare.gov/dataset/Survey-of-Patients-Hospital-Experiences-HCAHPS-/rj76-22dk\n" 784 | ] 785 | } 786 | ], 787 | "metadata": { 788 | "kernelspec": { 789 | "display_name": "Python 3", 790 | "language": "python", 791 | "name": "python3" 792 | }, 793 | "language_info": { 794 | "codemirror_mode": { 795 | "name": "ipython", 796 | "version": 3 797 | }, 798 | "file_extension": ".py", 799 | "mimetype": "text/x-python", 800 | "name": "python", 801 | "nbconvert_exporter": "python", 802 | "pygments_lexer": "ipython3", 803 | "version": "3.8.3" 804 | } 805 | }, 806 | "nbformat": 4, 807 | "nbformat_minor": 1 808 | } 809 | -------------------------------------------------------------------------------- /MACHINE_LEARNING/RECOMMENDER_SYSTEMS_ALGORITHM/u.item: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OakAcademy/Machine-Learning-with-Python/41fb3da8eeb9058649703d6f83a0a583a2cd4388/MACHINE_LEARNING/RECOMMENDER_SYSTEMS_ALGORITHM/u.item -------------------------------------------------------------------------------- 
/MACHINE_LEARNING/SUPPORT_VECTOR_MACHINES_ALGORITHM/.ipynb_checkpoints/SUPPORT_VECTOR_MACHINES_WITH_PYTHON-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "# Support Vector Machines with Python\n", 9 | "\n", 10 | "\n", 11 | "## Episode 1: Import Libraries" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import seaborn as sns\n", 24 | "%matplotlib inline" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Episode 2: Get the Data\n", 32 | "\n", 33 | "We'll use the built in breast cancer dataset from Scikit Learn. We can get with the load function:" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from sklearn.datasets import load_breast_cancer" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "cancer = load_breast_cancer()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "The data set is presented in a dictionary form:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "cancer.keys()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "We can get information and arrays from 
this glossary to build our data frame and understand the features:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | ".. _breast_cancer_dataset:\n", 98 | "\n", 99 | "Breast cancer wisconsin (diagnostic) dataset\n", 100 | "--------------------------------------------\n", 101 | "\n", 102 | "**Data Set Characteristics:**\n", 103 | "\n", 104 | " :Number of Instances: 569\n", 105 | "\n", 106 | " :Number of Attributes: 30 numeric, predictive attributes and the class\n", 107 | "\n", 108 | " :Attribute Information:\n", 109 | " - radius (mean of distances from center to points on the perimeter)\n", 110 | " - texture (standard deviation of gray-scale values)\n", 111 | " - perimeter\n", 112 | " - area\n", 113 | " - smoothness (local variation in radius lengths)\n", 114 | " - compactness (perimeter^2 / area - 1.0)\n", 115 | " - concavity (severity of concave portions of the contour)\n", 116 | " - concave points (number of concave portions of the contour)\n", 117 | " - symmetry\n", 118 | " - fractal dimension (\"coastline approximation\" - 1)\n", 119 | "\n", 120 | " The mean, standard error, and \"worst\" or largest (mean of the three\n", 121 | " worst/largest values) of these features were computed for each image,\n", 122 | " resulting in 30 features. 
For instance, field 0 is Mean Radius, field\n", 123 | " 10 is Radius SE, field 20 is Worst Radius.\n", 124 | "\n", 125 | " - class:\n", 126 | " - WDBC-Malignant\n", 127 | " - WDBC-Benign\n", 128 | "\n", 129 | " :Summary Statistics:\n", 130 | "\n", 131 | " ===================================== ====== ======\n", 132 | " Min Max\n", 133 | " ===================================== ====== ======\n", 134 | " radius (mean): 6.981 28.11\n", 135 | " texture (mean): 9.71 39.28\n", 136 | " perimeter (mean): 43.79 188.5\n", 137 | " area (mean): 143.5 2501.0\n", 138 | " smoothness (mean): 0.053 0.163\n", 139 | " compactness (mean): 0.019 0.345\n", 140 | " concavity (mean): 0.0 0.427\n", 141 | " concave points (mean): 0.0 0.201\n", 142 | " symmetry (mean): 0.106 0.304\n", 143 | " fractal dimension (mean): 0.05 0.097\n", 144 | " radius (standard error): 0.112 2.873\n", 145 | " texture (standard error): 0.36 4.885\n", 146 | " perimeter (standard error): 0.757 21.98\n", 147 | " area (standard error): 6.802 542.2\n", 148 | " smoothness (standard error): 0.002 0.031\n", 149 | " compactness (standard error): 0.002 0.135\n", 150 | " concavity (standard error): 0.0 0.396\n", 151 | " concave points (standard error): 0.0 0.053\n", 152 | " symmetry (standard error): 0.008 0.079\n", 153 | " fractal dimension (standard error): 0.001 0.03\n", 154 | " radius (worst): 7.93 36.04\n", 155 | " texture (worst): 12.02 49.54\n", 156 | " perimeter (worst): 50.41 251.2\n", 157 | " area (worst): 185.2 4254.0\n", 158 | " smoothness (worst): 0.071 0.223\n", 159 | " compactness (worst): 0.027 1.058\n", 160 | " concavity (worst): 0.0 1.252\n", 161 | " concave points (worst): 0.0 0.291\n", 162 | " symmetry (worst): 0.156 0.664\n", 163 | " fractal dimension (worst): 0.055 0.208\n", 164 | " ===================================== ====== ======\n", 165 | "\n", 166 | " :Missing Attribute Values: None\n", 167 | "\n", 168 | " :Class Distribution: 212 - Malignant, 357 - Benign\n", 169 | "\n", 170 | " :Creator: Dr. 
William H. Wolberg, W. Nick Street, Olvi L. Mangasarian\n", 171 | "\n", 172 | " :Donor: Nick Street\n", 173 | "\n", 174 | " :Date: November, 1995\n", 175 | "\n", 176 | "This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\n", 177 | "https://goo.gl/U2Uwz2\n", 178 | "\n", 179 | "Features are computed from a digitized image of a fine needle\n", 180 | "aspirate (FNA) of a breast mass. They describe\n", 181 | "characteristics of the cell nuclei present in the image.\n", 182 | "\n", 183 | "Separating plane described above was obtained using\n", 184 | "Multisurface Method-Tree (MSM-T) [K. P. Bennett, \"Decision Tree\n", 185 | "Construction Via Linear Programming.\" Proceedings of the 4th\n", 186 | "Midwest Artificial Intelligence and Cognitive Science Society,\n", 187 | "pp. 97-101, 1992], a classification method which uses linear\n", 188 | "programming to construct a decision tree. Relevant features\n", 189 | "were selected using an exhaustive search in the space of 1-4\n", 190 | "features and 1-3 separating planes.\n", 191 | "\n", 192 | "The actual linear program used to obtain the separating plane\n", 193 | "in the 3-dimensional space is that described in:\n", 194 | "[K. P. Bennett and O. L. Mangasarian: \"Robust Linear\n", 195 | "Programming Discrimination of Two Linearly Inseparable Sets\",\n", 196 | "Optimization Methods and Software 1, 1992, 23-34].\n", 197 | "\n", 198 | "This database is also available through the UW CS ftp server:\n", 199 | "\n", 200 | "ftp ftp.cs.wisc.edu\n", 201 | "cd math-prog/cpo-dataset/machine-learn/WDBC/\n", 202 | "\n", 203 | ".. topic:: References\n", 204 | "\n", 205 | " - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n", 206 | " for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on \n", 207 | " Electronic Imaging: Science and Technology, volume 1905, pages 861-870,\n", 208 | " San Jose, CA, 1993.\n", 209 | " - O.L. Mangasarian, W.N. Street and W.H. Wolberg. 
Breast cancer diagnosis and \n", 210 | " prognosis via linear programming. Operations Research, 43(4), pages 570-577, \n", 211 | " July-August 1995.\n", 212 | " - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques\n", 213 | " to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n", 214 | " 163-171.\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "print(cancer['DESCR'])" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 6, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',\n", 231 | " 'mean smoothness', 'mean compactness', 'mean concavity',\n", 232 | " 'mean concave points', 'mean symmetry', 'mean fractal dimension',\n", 233 | " 'radius error', 'texture error', 'perimeter error', 'area error',\n", 234 | " 'smoothness error', 'compactness error', 'concavity error',\n", 235 | " 'concave points error', 'symmetry error',\n", 236 | " 'fractal dimension error', 'worst radius', 'worst texture',\n", 237 | " 'worst perimeter', 'worst area', 'worst smoothness',\n", 238 | " 'worst compactness', 'worst concavity', 'worst concave points',\n", 239 | " 'worst symmetry', 'worst fractal dimension'], dtype='\n", 270 | "RangeIndex: 569 entries, 0 to 568\n", 271 | "Data columns (total 30 columns):\n", 272 | " # Column Non-Null Count Dtype \n", 273 | "--- ------ -------------- ----- \n", 274 | " 0 mean radius 569 non-null float64\n", 275 | " 1 mean texture 569 non-null float64\n", 276 | " 2 mean perimeter 569 non-null float64\n", 277 | " 3 mean area 569 non-null float64\n", 278 | " 4 mean smoothness 569 non-null float64\n", 279 | " 5 mean compactness 569 non-null float64\n", 280 | " 6 mean concavity 569 non-null float64\n", 281 | " 7 mean concave points 569 non-null float64\n", 282 | " 8 mean symmetry 569 non-null float64\n", 283 | " 9 mean fractal dimension 569 non-null float64\n", 284 | " 10 
radius error 569 non-null float64\n", 285 | " 11 texture error 569 non-null float64\n", 286 | " 12 perimeter error 569 non-null float64\n", 287 | " 13 area error 569 non-null float64\n", 288 | " 14 smoothness error 569 non-null float64\n", 289 | " 15 compactness error 569 non-null float64\n", 290 | " 16 concavity error 569 non-null float64\n", 291 | " 17 concave points error 569 non-null float64\n", 292 | " 18 symmetry error 569 non-null float64\n", 293 | " 19 fractal dimension error 569 non-null float64\n", 294 | " 20 worst radius 569 non-null float64\n", 295 | " 21 worst texture 569 non-null float64\n", 296 | " 22 worst perimeter 569 non-null float64\n", 297 | " 23 worst area 569 non-null float64\n", 298 | " 24 worst smoothness 569 non-null float64\n", 299 | " 25 worst compactness 569 non-null float64\n", 300 | " 26 worst concavity 569 non-null float64\n", 301 | " 27 worst concave points 569 non-null float64\n", 302 | " 28 worst symmetry 569 non-null float64\n", 303 | " 29 worst fractal dimension 569 non-null float64\n", 304 | "dtypes: float64(30)\n", 305 | "memory usage: 133.5 KB\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "df_feat = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])\n", 311 | "df_feat.info()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 9, 317 | "metadata": { 318 | "scrolled": true 319 | }, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,\n", 325 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n", 326 | " 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,\n", 327 | " 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,\n", 328 | " 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,\n", 329 | " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,\n", 330 | " 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,\n", 
331 | " 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,\n", 332 | " 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,\n", 333 | " 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,\n", 334 | " 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,\n", 335 | " 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 336 | " 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,\n", 337 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,\n", 338 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,\n", 339 | " 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,\n", 340 | " 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,\n", 341 | " 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,\n", 342 | " 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,\n", 343 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,\n", 344 | " 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,\n", 345 | " 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n", 346 | " 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,\n", 347 | " 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,\n", 348 | " 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", 349 | " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])" 350 | ] 351 | }, 352 | "execution_count": 9, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "cancer['target']" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 10, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "df_target = pd.DataFrame(cancer['target'],columns=['Cancer'])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "Now let's actually check out the dataframe!" 
375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 12, 380 | "metadata": { 381 | "scrolled": true 382 | }, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/html": [ 387 | "
\n", 388 | "\n", 401 | "\n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | "
Cancer
00
10
20
30
40
\n", 431 | "
" 432 | ], 433 | "text/plain": [ 434 | " Cancer\n", 435 | "0 0\n", 436 | "1 0\n", 437 | "2 0\n", 438 | "3 0\n", 439 | "4 0" 440 | ] 441 | }, 442 | "execution_count": 12, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "df_target.head()" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "# Episode 4: Exploratory Data Analysis\n", 456 | "\n" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "We will skip the Data Visualization part of this lesson because there are many features that are difficult to interpret if you do not have field knowledge about cancer or tumor cells. There will be more to visualize for data in your project." 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Episode 5: Train Test Split" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 13, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "from sklearn.model_selection import train_test_split" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 14, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "X_train, X_test, y_train, y_test = train_test_split(df_feat, np.ravel(df_target), test_size=0.30, random_state=101)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "# Episode 6: Train the Support Vector Classifier" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 15, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "from sklearn.svm import SVC" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 16, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "model = SVC()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 17, 519 | "metadata": {}, 520 
| "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "SVC()" 525 | ] 526 | }, 527 | "execution_count": 17, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "model.fit(X_train,y_train)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Episode 7: Predictions and Evaluations\n", 541 | "\n", 542 | "Now let's predict using the trained model." 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 18, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "predictions = model.predict(X_test)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 19, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "from sklearn.metrics import classification_report,confusion_matrix" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 20, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "[[ 56 10]\n", 573 | " [ 3 102]]\n" 574 | ] 575 | } 576 | ], 577 | "source": [ 578 | "print(confusion_matrix(y_test,predictions))" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 21, 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "name": "stdout", 588 | "output_type": "stream", 589 | "text": [ 590 | " precision recall f1-score support\n", 591 | "\n", 592 | " 0 0.95 0.85 0.90 66\n", 593 | " 1 0.91 0.97 0.94 105\n", 594 | "\n", 595 | " accuracy 0.92 171\n", 596 | " macro avg 0.93 0.91 0.92 171\n", 597 | "weighted avg 0.93 0.92 0.92 171\n", 598 | "\n" 599 | ] 600 | } 601 | ], 602 | "source": [ 603 | "print(classification_report(y_test,predictions))" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "Notice that the default model already reaches about 92% accuracy, but it still misclassifies 10 of the 66 class-0 samples! 
This means our model needs to have its parameters adjusted (it may also help to normalize the data).\n", 611 | "\n", 612 | "We can search for parameters using a grid search!" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "# Episode 8: Gridsearch\n", 620 | "\n", 621 | "Finding the right parameters (like what C or gamma values to use) is a difficult task! But luckily, we can be a little lazy and just try a few combinations and see what works best! The idea of creating a 'grid' of parameters and just trying all possible combinations is called grid search. This method is common enough that Scikit-learn has this functionality built in as GridSearchCV! The CV stands for cross-validation.\n", 622 | "\n", 623 | "GridSearchCV takes a dictionary describing the parameters to be tested and a model to train. The grid of parameters is defined as a dictionary where keys are parameters and values are settings to test." 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 22, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} " 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 23, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "from sklearn.model_selection import GridSearchCV" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "One of the great things about GridSearchCV is that it is a meta-estimator. It takes an estimator like SVC and creates a new estimator that behaves exactly the same - in this case, like a classifier. You should set refit=True and set verbose to whatever number you want: the higher the number, the more verbose the output (verbose just means the text output describing the process)." 
649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 65, 654 | "metadata": { 655 | "collapsed": true 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "What fit does is a bit more involved than usual. First, it runs the same loop with cross-validation, to find the best parameter combination. Once it has the best combination, it runs fit again on all data passed to fit (without cross-validation), to build a single new model using the best parameter setting." 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 40, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "name": "stdout", 676 | "output_type": "stream", 677 | "text": [ 678 | "Fitting 3 folds for each of 25 candidates, totalling 75 fits\n", 679 | "[CV] gamma=1, C=0.1, kernel=rbf ......................................\n", 680 | "[CV] ............. gamma=1, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 681 | "[CV] gamma=1, C=0.1, kernel=rbf ......................................\n", 682 | "[CV] ............. gamma=1, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 683 | "[CV] gamma=1, C=0.1, kernel=rbf ......................................\n", 684 | "[CV] ............. gamma=1, C=0.1, kernel=rbf, score=0.636364 - 0.0s\n", 685 | "[CV] gamma=0.1, C=0.1, kernel=rbf ....................................\n", 686 | "[CV] ........... gamma=0.1, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 687 | "[CV] gamma=0.1, C=0.1, kernel=rbf ....................................\n", 688 | "[CV] ........... gamma=0.1, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 689 | "[CV] gamma=0.1, C=0.1, kernel=rbf ....................................\n", 690 | "[CV] ........... 
gamma=0.1, C=0.1, kernel=rbf, score=0.636364 - 0.0s\n", 691 | "[CV] gamma=0.01, C=0.1, kernel=rbf ...................................\n", 692 | "[CV] .......... gamma=0.01, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 693 | "[CV] gamma=0.01, C=0.1, kernel=rbf ...................................\n", 694 | "[CV] .......... gamma=0.01, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 695 | "[CV] gamma=0.01, C=0.1, kernel=rbf ...................................\n", 696 | "[CV] .......... gamma=0.01, C=0.1, kernel=rbf, score=0.636364 - 0.0s\n", 697 | "[CV] gamma=0.001, C=0.1, kernel=rbf ..................................\n", 698 | "[CV] ......... gamma=0.001, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 699 | "[CV] gamma=0.001, C=0.1, kernel=rbf ..................................\n", 700 | "[CV] ......... gamma=0.001, C=0.1, kernel=rbf, score=0.631579 - 0.0s\n", 701 | "[CV] gamma=0.001, C=0.1, kernel=rbf ..................................\n", 702 | "[CV] ......... gamma=0.001, C=0.1, kernel=rbf, score=0.636364 - 0.0s\n", 703 | "[CV] gamma=0.0001, C=0.1, kernel=rbf .................................\n", 704 | "[CV] ........ gamma=0.0001, C=0.1, kernel=rbf, score=0.902256 - 0.0s\n", 705 | "[CV] gamma=0.0001, C=0.1, kernel=rbf .................................\n", 706 | "[CV] ........ gamma=0.0001, C=0.1, kernel=rbf, score=0.962406 - 0.0s\n", 707 | "[CV] gamma=0.0001, C=0.1, kernel=rbf .................................\n", 708 | "[CV] ........ gamma=0.0001, C=0.1, kernel=rbf, score=0.916667 - 0.0s\n", 709 | "[CV] gamma=1, C=1, kernel=rbf ........................................\n", 710 | "[CV] ............... gamma=1, C=1, kernel=rbf, score=0.631579 - 0.0s\n", 711 | "[CV] gamma=1, C=1, kernel=rbf ........................................\n", 712 | "[CV] ............... gamma=1, C=1, kernel=rbf, score=0.631579 - 0.0s\n", 713 | "[CV] gamma=1, C=1, kernel=rbf ........................................\n", 714 | "[CV] ............... 
gamma=1, C=1, kernel=rbf, score=0.636364 - 0.0s\n", 715 | "[CV] gamma=0.1, C=1, kernel=rbf ......................................\n", 716 | "[CV] ............. gamma=0.1, C=1, kernel=rbf, score=0.631579 - 0.0s\n", 717 | "[CV] gamma=0.1, C=1, kernel=rbf ......................................\n", 718 | "[CV] ............. gamma=0.1, C=1, kernel=rbf, score=0.631579 - 0.0s\n", 719 | "[CV] gamma=0.1, C=1, kernel=rbf ......................................\n", 720 | "[CV] ............. gamma=0.1, C=1, kernel=rbf, score=0.636364 - 0.0s\n", 721 | "[CV] gamma=0.01, C=1, kernel=rbf .....................................\n", 722 | "[CV] ............ gamma=0.01, C=1, kernel=rbf, score=0.631579 - 0.0s\n", 723 | "[CV] gamma=0.01, C=1, kernel=rbf .....................................\n", 724 | "[CV] ............ gamma=0.01, C=1, kernel=rbf, score=0.631579 - 0.0s\n", 725 | "[CV] gamma=0.01, C=1, kernel=rbf .....................................\n", 726 | "[CV] ............ gamma=0.01, C=1, kernel=rbf, score=0.636364 - 0.0s\n", 727 | "[CV] gamma=0.001, C=1, kernel=rbf ....................................\n", 728 | "[CV] ........... gamma=0.001, C=1, kernel=rbf, score=0.902256 - 0.0s\n", 729 | "[CV] gamma=0.001, C=1, kernel=rbf ....................................\n", 730 | "[CV] ........... gamma=0.001, C=1, kernel=rbf, score=0.939850 - 0.0s\n", 731 | "[CV] gamma=0.001, C=1, kernel=rbf ....................................\n", 732 | "[CV] ........... gamma=0.001, C=1, kernel=rbf, score=0.954545 - 0.0s\n", 733 | "[CV] gamma=0.0001, C=1, kernel=rbf ...................................\n", 734 | "[CV] .......... gamma=0.0001, C=1, kernel=rbf, score=0.939850 - 0.0s\n", 735 | "[CV] gamma=0.0001, C=1, kernel=rbf ...................................\n", 736 | "[CV] .......... gamma=0.0001, C=1, kernel=rbf, score=0.969925 - 0.0s\n", 737 | "[CV] gamma=0.0001, C=1, kernel=rbf ...................................\n", 738 | "[CV] .......... 
gamma=0.0001, C=1, kernel=rbf, score=0.946970 - 0.0s\n", 739 | "[CV] gamma=1, C=10, kernel=rbf .......................................\n", 740 | "[CV] .............. gamma=1, C=10, kernel=rbf, score=0.631579 - 0.0s\n", 741 | "[CV] gamma=1, C=10, kernel=rbf .......................................\n", 742 | "[CV] .............. gamma=1, C=10, kernel=rbf, score=0.631579 - 0.0s\n", 743 | "[CV] gamma=1, C=10, kernel=rbf .......................................\n", 744 | "[CV] .............. gamma=1, C=10, kernel=rbf, score=0.636364 - 0.0s\n", 745 | "[CV] gamma=0.1, C=10, kernel=rbf .....................................\n", 746 | "[CV] ............ gamma=0.1, C=10, kernel=rbf, score=0.631579 - 0.0s\n", 747 | "[CV] gamma=0.1, C=10, kernel=rbf .....................................\n", 748 | "[CV] ............ gamma=0.1, C=10, kernel=rbf, score=0.631579 - 0.0s\n", 749 | "[CV] gamma=0.1, C=10, kernel=rbf .....................................\n", 750 | "[CV] ............ gamma=0.1, C=10, kernel=rbf, score=0.636364 - 0.0s\n", 751 | "[CV] gamma=0.01, C=10, kernel=rbf ....................................\n", 752 | "[CV] ........... gamma=0.01, C=10, kernel=rbf, score=0.631579 - 0.0s\n", 753 | "[CV] gamma=0.01, C=10, kernel=rbf ....................................\n", 754 | "[CV] ........... gamma=0.01, C=10, kernel=rbf, score=0.631579 - 0.0s\n", 755 | "[CV] gamma=0.01, C=10, kernel=rbf ....................................\n", 756 | "[CV] ........... gamma=0.01, C=10, kernel=rbf, score=0.636364 - 0.0s\n", 757 | "[CV] gamma=0.001, C=10, kernel=rbf ...................................\n", 758 | "[CV] .......... gamma=0.001, C=10, kernel=rbf, score=0.894737 - 0.0s\n", 759 | "[CV] gamma=0.001, C=10, kernel=rbf ...................................\n", 760 | "[CV] .......... gamma=0.001, C=10, kernel=rbf, score=0.932331 - 0.0s\n", 761 | "[CV] gamma=0.001, C=10, kernel=rbf ...................................\n", 762 | "[CV] .......... 
gamma=0.001, C=10, kernel=rbf, score=0.916667 - 0.0s\n", 763 | "[CV] gamma=0.0001, C=10, kernel=rbf ..................................\n", 764 | "[CV] ......... gamma=0.0001, C=10, kernel=rbf, score=0.932331 - 0.0s\n", 765 | "[CV] gamma=0.0001, C=10, kernel=rbf ..................................\n", 766 | "[CV] ......... gamma=0.0001, C=10, kernel=rbf, score=0.969925 - 0.0s\n", 767 | "[CV] gamma=0.0001, C=10, kernel=rbf ..................................\n", 768 | "[CV] ......... gamma=0.0001, C=10, kernel=rbf, score=0.962121 - 0.0s\n", 769 | "[CV] gamma=1, C=100, kernel=rbf ......................................\n", 770 | "[CV] ............. gamma=1, C=100, kernel=rbf, score=0.631579 - 0.0s\n", 771 | "[CV] gamma=1, C=100, kernel=rbf ......................................\n", 772 | "[CV] ............. gamma=1, C=100, kernel=rbf, score=0.631579 - 0.0s\n", 773 | "[CV] gamma=1, C=100, kernel=rbf ......................................\n", 774 | "[CV] ............. gamma=1, C=100, kernel=rbf, score=0.636364 - 0.0s\n", 775 | "[CV] gamma=0.1, C=100, kernel=rbf ....................................\n", 776 | "[CV] ........... gamma=0.1, C=100, kernel=rbf, score=0.631579 - 0.0s\n", 777 | "[CV] gamma=0.1, C=100, kernel=rbf ....................................\n", 778 | "[CV] ........... gamma=0.1, C=100, kernel=rbf, score=0.631579 - 0.0s\n", 779 | "[CV] gamma=0.1, C=100, kernel=rbf ....................................\n", 780 | "[CV] ........... gamma=0.1, C=100, kernel=rbf, score=0.636364 - 0.0s\n", 781 | "[CV] gamma=0.01, C=100, kernel=rbf ...................................\n", 782 | "[CV] .......... gamma=0.01, C=100, kernel=rbf, score=0.631579 - 0.0s\n", 783 | "[CV] gamma=0.01, C=100, kernel=rbf ...................................\n", 784 | "[CV] .......... gamma=0.01, C=100, kernel=rbf, score=0.631579 - 0.0s\n", 785 | "[CV] gamma=0.01, C=100, kernel=rbf ...................................\n", 786 | "[CV] .......... 
gamma=0.01, C=100, kernel=rbf, score=0.636364 - 0.0s\n", 787 | "[CV] gamma=0.001, C=100, kernel=rbf ..................................\n", 788 | "[CV] ......... gamma=0.001, C=100, kernel=rbf, score=0.894737 - 0.0s\n", 789 | "[CV] gamma=0.001, C=100, kernel=rbf ..................................\n", 790 | "[CV] ......... gamma=0.001, C=100, kernel=rbf, score=0.932331 - 0.0s\n", 791 | "[CV] gamma=0.001, C=100, kernel=rbf ..................................\n", 792 | "[CV] ......... gamma=0.001, C=100, kernel=rbf, score=0.916667 - 0.0s\n", 793 | "[CV] gamma=0.0001, C=100, kernel=rbf .................................\n", 794 | "[CV] ........ gamma=0.0001, C=100, kernel=rbf, score=0.917293 - 0.0s\n", 795 | "[CV] gamma=0.0001, C=100, kernel=rbf .................................\n", 796 | "[CV] ........ gamma=0.0001, C=100, kernel=rbf, score=0.977444 - 0.0s\n", 797 | "[CV] gamma=0.0001, C=100, kernel=rbf .................................\n", 798 | "[CV] ........ gamma=0.0001, C=100, kernel=rbf, score=0.939394 - 0.0s\n", 799 | "[CV] gamma=1, C=1000, kernel=rbf .....................................\n", 800 | "[CV] ............ gamma=1, C=1000, kernel=rbf, score=0.631579 - 0.0s\n", 801 | "[CV] gamma=1, C=1000, kernel=rbf .....................................\n", 802 | "[CV] ............ gamma=1, C=1000, kernel=rbf, score=0.631579 - 0.0s\n", 803 | "[CV] gamma=1, C=1000, kernel=rbf .....................................\n", 804 | "[CV] ............ gamma=1, C=1000, kernel=rbf, score=0.636364 - 0.0s\n", 805 | "[CV] gamma=0.1, C=1000, kernel=rbf ...................................\n", 806 | "[CV] .......... gamma=0.1, C=1000, kernel=rbf, score=0.631579 - 0.0s\n", 807 | "[CV] gamma=0.1, C=1000, kernel=rbf ...................................\n", 808 | "[CV] .......... gamma=0.1, C=1000, kernel=rbf, score=0.631579 - 0.0s\n", 809 | "[CV] gamma=0.1, C=1000, kernel=rbf ...................................\n", 810 | "[CV] .......... 
gamma=0.1, C=1000, kernel=rbf, score=0.636364 - 0.0s\n", 811 | "[CV] gamma=0.01, C=1000, kernel=rbf ..................................\n", 812 | "[CV] ......... gamma=0.01, C=1000, kernel=rbf, score=0.631579 - 0.0s\n", 813 | "[CV] gamma=0.01, C=1000, kernel=rbf ..................................\n", 814 | "[CV] ......... gamma=0.01, C=1000, kernel=rbf, score=0.631579 - 0.0s\n", 815 | "[CV] gamma=0.01, C=1000, kernel=rbf ..................................\n", 816 | "[CV] ......... gamma=0.01, C=1000, kernel=rbf, score=0.636364 - 0.0s\n", 817 | "[CV] gamma=0.001, C=1000, kernel=rbf .................................\n", 818 | "[CV] ........ gamma=0.001, C=1000, kernel=rbf, score=0.894737 - 0.0s\n", 819 | "[CV] gamma=0.001, C=1000, kernel=rbf .................................\n", 820 | "[CV] ........ gamma=0.001, C=1000, kernel=rbf, score=0.932331 - 0.0s\n", 821 | "[CV] gamma=0.001, C=1000, kernel=rbf .................................\n", 822 | "[CV] ........ gamma=0.001, C=1000, kernel=rbf, score=0.916667 - 0.0s" 823 | ] 824 | }, 825 | { 826 | "name": "stderr", 827 | "output_type": "stream", 828 | "text": [ 829 | "[Parallel(n_jobs=1)]: Done 31 tasks | elapsed: 0.3s\n", 830 | "[Parallel(n_jobs=1)]: Done 75 out of 75 | elapsed: 0.8s finished\n" 831 | ] 832 | }, 833 | { 834 | "name": "stdout", 835 | "output_type": "stream", 836 | "text": [ 837 | "\n", 838 | "[CV] gamma=0.0001, C=1000, kernel=rbf ................................\n", 839 | "[CV] ....... gamma=0.0001, C=1000, kernel=rbf, score=0.909774 - 0.0s\n", 840 | "[CV] gamma=0.0001, C=1000, kernel=rbf ................................\n", 841 | "[CV] ....... gamma=0.0001, C=1000, kernel=rbf, score=0.969925 - 0.0s\n", 842 | "[CV] gamma=0.0001, C=1000, kernel=rbf ................................\n", 843 | "[CV] ....... 
gamma=0.0001, C=1000, kernel=rbf, score=0.931818 - 0.0s\n" 844 | ] 845 | }, 846 | { 847 | "data": { 848 | "text/plain": [ 849 | "GridSearchCV(cv=None, error_score='raise',\n", 850 | " estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 851 | " decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',\n", 852 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 853 | " tol=0.001, verbose=False),\n", 854 | " fit_params={}, iid=True, n_jobs=1,\n", 855 | " param_grid={'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'C': [0.1, 1, 10, 100, 1000], 'kernel': ['rbf']},\n", 856 | " pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)" 857 | ] 858 | }, 859 | "execution_count": 40, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "# May take awhile!\n", 866 | "grid.fit(X_train,y_train)" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": {}, 872 | "source": [ 873 | "You can inspect the best parameters found by GridSearchCV in the best_params_ attribute, and the best estimator in the best\\_estimator_ attribute:" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 41, 879 | "metadata": {}, 880 | "outputs": [ 881 | { 882 | "data": { 883 | "text/plain": [ 884 | "{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}" 885 | ] 886 | }, 887 | "execution_count": 41, 888 | "metadata": {}, 889 | "output_type": "execute_result" 890 | } 891 | ], 892 | "source": [ 893 | "grid.best_params_" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [ 902 | "grid.best_estimator_" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "Then you can re-run predictions on this grid object just like you would with a normal model." 
910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 48, 915 | "metadata": {}, 916 | "outputs": [], 917 | "source": [ 918 | "grid_predictions = grid.predict(X_test)" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 49, 924 | "metadata": {}, 925 | "outputs": [ 926 | { 927 | "name": "stdout", 928 | "output_type": "stream", 929 | "text": [ 930 | "[[ 60 6]\n", 931 | " [ 3 102]]\n" 932 | ] 933 | } 934 | ], 935 | "source": [ 936 | "print(confusion_matrix(y_test,grid_predictions))" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": 50, 942 | "metadata": { 943 | "scrolled": true 944 | }, 945 | "outputs": [ 946 | { 947 | "name": "stdout", 948 | "output_type": "stream", 949 | "text": [ 950 | " precision recall f1-score support\n", 951 | "\n", 952 | " 0 0.95 0.91 0.93 66\n", 953 | " 1 0.94 0.97 0.96 105\n", 954 | "\n", 955 | "avg / total 0.95 0.95 0.95 171\n", 956 | "\n" 957 | ] 958 | } 959 | ], 960 | "source": [ 961 | "print(classification_report(y_test,grid_predictions))" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "# Great job!" 969 | ] 970 | } 971 | ], 972 | "metadata": { 973 | "kernelspec": { 974 | "display_name": "Python 3", 975 | "language": "python", 976 | "name": "python3" 977 | }, 978 | "language_info": { 979 | "codemirror_mode": { 980 | "name": "ipython", 981 | "version": 3 982 | }, 983 | "file_extension": ".py", 984 | "mimetype": "text/x-python", 985 | "name": "python", 986 | "nbconvert_exporter": "python", 987 | "pygments_lexer": "ipython3", 988 | "version": "3.8.3" 989 | } 990 | }, 991 | "nbformat": 4, 992 | "nbformat_minor": 1 993 | } 994 | --------------------------------------------------------------------------------