├── Canlapan_Inah_HW1_Report.pdf ├── Canlapan_Inah_HW2_Report.pdf ├── Canlapan_Inah_HW3_Report.pdf ├── Canlapan_Inah_HW4_Report.pdf ├── Canlapan_Inah_HW5_Report.pdf ├── Canlapan_Inah_HW6_Report.pdf ├── Property Style Reclassification ├── README.txt ├── iCanlapan - Final Report.pdf └── Property Style Reclassification - Final.ipynb ├── Topics.txt ├── LICENSE ├── Canlapan_Inah_HW1.ipynb └── Canlapan_Inah_HW4.ipynb /Canlapan_Inah_HW1_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Canlapan_Inah_HW1_Report.pdf -------------------------------------------------------------------------------- /Canlapan_Inah_HW2_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Canlapan_Inah_HW2_Report.pdf -------------------------------------------------------------------------------- /Canlapan_Inah_HW3_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Canlapan_Inah_HW3_Report.pdf -------------------------------------------------------------------------------- /Canlapan_Inah_HW4_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Canlapan_Inah_HW4_Report.pdf -------------------------------------------------------------------------------- /Canlapan_Inah_HW5_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Canlapan_Inah_HW5_Report.pdf -------------------------------------------------------------------------------- /Canlapan_Inah_HW6_Report.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Canlapan_Inah_HW6_Report.pdf -------------------------------------------------------------------------------- /Property Style Reclassification/README.txt: -------------------------------------------------------------------------------- 1 | Final project for this course. Students were instructed to form a group, pick a topic, design the methodology and report results. 2 | -------------------------------------------------------------------------------- /Property Style Reclassification/iCanlapan - Final Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inahpatrizia/isye_6740/HEAD/Property Style Reclassification/iCanlapan - Final Report.pdf -------------------------------------------------------------------------------- /Topics.txt: -------------------------------------------------------------------------------- 1 | HW 1: Clustering 2 | HW 2: Dimensionality Reduction 3 | HW 3: Density Estimation, Gaussian Mixture Model and Expectation-Maximization Algorithm 4 | HW 4: Optimization, Classification Naive Bayes, Logistic Regression, SVM 5 | HW 5: Neural Networks, Feature Selection and Anomaly Detection 6 | HW 6: Boosting Algorithms and AdaBoost, Random Forest 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Inah Canlapan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Canlapan_Inah_HW1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Homework 1" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1. 
Concept Questions\n", 15 | "Code below is for 1.5" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "\n", 26 | "A = np.array([[0,1,1,0,0], [1,0,1,0,0], [1,1,0,0,0], [0,0,0,0,1], [0,0,0,1,0]])\n", 27 | "D = np.array([[2,0,0,0,0], [0,2,0,0,0], [0,0,2,0,0], [0,0,0,1,0], [0,0,0,0,1]])\n", 28 | "L = D-A\n", 29 | "\n", 30 | "# Eigenvalue decomposition\n", 31 | "s, v = np.linalg.eig(L)\n", 32 | "\n", 33 | "# Get index of eigenvalues that are 0\n", 34 | "zero_evalue_index = [i for i, x in enumerate(np.around(s,1)) if x == 0]\n", 35 | "print(\"Eigenvalues: \" + str(np.around(s,1)))\n", 36 | "print(\"Indexes of 0 eigenvalues: \" + str(zero_evalue_index))\n", 37 | "\n", 38 | "# Columns corresponding to the eigenvalues that are 0\n", 39 | "print(\"Eigenvectors: \")\n", 40 | "print(str(v))\n", 41 | "print(\"Cluster Assignment:\")\n", 42 | "print(str(v[:, zero_evalue_index]))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Based on the resulting matrix, I can conclude that nodes 1, 2 and 3 are connected in one cluster while nodes 4 and 5 are connected in another. " 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## 2. 
Image Compression Using Clustering" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Q1" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import imageio \n", 73 | "import matplotlib.image\n", 74 | "import random \n", 75 | "import numpy as np\n", 76 | "import time\n", 77 | "from datetime import timedelta\n", 78 | "from collections import defaultdict" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "###### Define Functions ######\n", 88 | "def assign_cluster(pxl, c, norm=2):\n", 89 | " \"\"\"\n", 90 | " Calculates distance between single pixel and list of centroids.\n", 91 | "\n", 92 | " Input:\n", 93 | " pxl = pixel [R,G,B]\n", 94 | " c = list of centroids\n", 95 | " norm = L1 or L2, default is L2\n", 96 | " Output:\n", 97 | " Cluster Index\n", 98 | " \n", 99 | " Author's Note: \n", 100 | " This function is what takes the longest when I run the code. \n", 101 | " I couldn't figure out how to optimize, but it still works! 
\n", 102 | " \n", 103 | " \"\"\" \n", 104 | " min_distance = 0\n", 105 | " cluster_index = 0\n", 106 | " \n", 107 | " # Loop through list of centroids\n", 108 | " for i in range(len(c)):\n", 109 | " x = pxl-c[i]\n", 110 | "\n", 111 | " if norm==1:\n", 112 | " distance = np.linalg.norm(x, 1)\n", 113 | " else:\n", 114 | " distance = np.linalg.norm(x, 2)\n", 115 | " \n", 116 | " if i == 0:\n", 117 | " min_distance = distance\n", 118 | " cluster_index = i\n", 119 | " else:\n", 120 | " if distance < min_distance:\n", 121 | " cluster_index = i\n", 122 | " min_distance = distance \n", 123 | " \n", 124 | " return cluster_index\n", 125 | "\n", 126 | "def adjust_centroids(pixel, cluster_assign, centroids, norm=2):\n", 127 | " \"\"\"\n", 128 | " Adjusts the centroids based on mean (L2 norm) or median (L1 norm)\n", 129 | " \n", 130 | " Input\n", 131 | " pixel = RGB representation of image\n", 132 | " cluster_assign = cluster assignment\n", 133 | " centroids = current list of centroids\n", 134 | " Output\n", 135 | " adjusted centroid list \n", 136 | " \"\"\"\n", 137 | " \n", 138 | " m = pixel.shape[0]\n", 139 | " n = pixel.shape[1]\n", 140 | " d = pixel.shape[2]\n", 141 | " \n", 142 | " new_centroids = defaultdict()\n", 143 | " centroid_list = []\n", 144 | " final_centroids = []\n", 145 | " \n", 146 | " # Loop through pixels in image. 
Add pixel to corresponding cluster list\n", 147 | " for i in range(m):\n", 148 | " for j in range(n):\n", 149 | " pxl = pixel[i,j]\n", 150 | " cluster = cluster_assign[i,j] \n", 151 | " \n", 152 | " if cluster not in new_centroids:\n", 153 | " new_centroids[cluster] = list()\n", 154 | " \n", 155 | " new_centroids[cluster].append(pxl) \n", 156 | " \n", 157 | " # Calculate the new list of centroids\n", 158 | " for cluster in new_centroids:\n", 159 | " centroid_list = np.array(new_centroids[cluster])\n", 160 | " \n", 161 | " if norm==1:\n", 162 | " updated_centroid = np.median(centroid_list, axis=0).astype(np.uint8)\n", 163 | " else:\n", 164 | " updated_centroid = np.around(centroid_list.sum(axis=0)/len(centroid_list)).astype(np.uint8)\n", 165 | " \n", 166 | " final_centroids.append(updated_centroid.tolist())\n", 167 | " \n", 168 | " return final_centroids \n", 169 | "\n", 170 | "def run_kmeans(pixel, k, norm=2):\n", 171 | " \"\"\"\n", 172 | " Runs the kmeans algorithm.\n", 173 | " \n", 174 | " Input:\n", 175 | " pixel = RGB representation of image\n", 176 | " k = number of desired clusters\n", 177 | " norm = distance measure, can be 1 or 2\n", 178 | " \n", 179 | " Output:\n", 180 | " cluster labels\n", 181 | " new centroids \n", 182 | " compressed image \n", 183 | " number of iterations \n", 184 | " run time\n", 185 | " \"\"\" \n", 186 | " start_time = time.monotonic()\n", 187 | " \n", 188 | " m = pixel.shape[0] \n", 189 | " n = pixel.shape[1] \n", 190 | " d = pixel.shape[2] \n", 191 | " cluster_assign = np.empty(shape=(m,n),dtype='object')\n", 192 | " \n", 193 | " np.random.seed(206)\n", 194 | " \n", 195 | " # Initialize random centroids\n", 196 | " c_r = np.random.randint(0, 255, size = k)\n", 197 | " c_g = np.random.randint(0, 255, size = k)\n", 198 | " c_b = np.random.randint(0, 255, size = k)\n", 199 | " centroids = np.array(list(zip(c_r, c_g, c_b))).tolist()\n", 200 | " centroids.sort()\n", 201 | " \n", 202 | " # Initialize the clusters\n", 203 | " 
cluster_assign = np.empty(shape=(m,n),dtype='object')\n", 204 | "\n", 205 | " iter_count = 1\n", 206 | "\n", 207 | " # Assign the cluster to each pixel for the first time\n", 208 | " for i in range(m):\n", 209 | " for j in range(n):\n", 210 | " pxl = pixel[i][j]\n", 211 | " cluster_assign[i,j] = assign_cluster(pxl, centroids, norm)\n", 212 | " \n", 213 | " # Get new centroids\n", 214 | " new_centroids = adjust_centroids(pixel, cluster_assign, centroids, norm) \n", 215 | " new_centroids.sort()\n", 216 | " \n", 217 | " while centroids != new_centroids: # Keep looping until the new centroids are equal to the old centroids\n", 218 | " iter_count += 1\n", 219 | " \n", 220 | " centroids = new_centroids\n", 221 | "\n", 222 | " for i in range(m):\n", 223 | " for j in range(n):\n", 224 | " pxl = pixel[i][j]\n", 225 | " cluster_assign[i,j] = assign_cluster(pxl, centroids, norm)\n", 226 | " \n", 227 | " # Get new centroids\n", 228 | " new_centroids = adjust_centroids(pixel, cluster_assign, centroids, norm) \n", 229 | " new_centroids.sort()\n", 230 | " \n", 231 | " # Time Calculation\n", 232 | " end_time = time.monotonic()\n", 233 | " time_diff = timedelta(seconds=end_time - start_time).seconds\n", 234 | " \n", 235 | " # Create compressed image\n", 236 | " final_image = np.empty(shape=(m,n,d),dtype='object')\n", 237 | " for row in range(m):\n", 238 | " for col in range(n):\n", 239 | " pxl = cluster_assign[row][col] \n", 240 | " final_image[row,col] = np.array(new_centroids[pxl]) \n", 241 | " \n", 242 | " # Final Outputs\n", 243 | " cluster_labels = cluster_assign+1\n", 244 | " final_image = np.reshape(final_image, (pixel.shape))\n", 245 | " \n", 246 | " return cluster_labels, new_centroids, final_image, iter_count, time_diff" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Loop through pictures and k values using norm = 2\n", 256 | "\n", 257 | "k_vals = [2,4,8,16]\n", 258 | 
"\n", 259 | "image = ['data/GeorgiaTech.bmp', 'data/football.bmp', 'data/beach.jpg']\n", 260 | "\n", 261 | "for k in k_vals:\n", 262 | " for pic in image: \n", 263 | " \n", 264 | " # Read image\n", 265 | " original_image = imageio.imread(pic)\n", 266 | " \n", 267 | " # Run kmeans\n", 268 | " cluster_label, cluster_center, comp_image, iterations, run_time = run_kmeans(original_image, k, norm=2)\n", 269 | " \n", 270 | " # Export file\n", 271 | " pic_name = pic.split('/')[1].split('.')[0]\n", 272 | " matplotlib.image.imsave('Q2_output/'+pic_name+'_'+str(k)+'.png', comp_image.astype(np.uint8))\n", 273 | " \n", 274 | " print(pic_name + ', k = ' + str(k) + ', Iterations = ' + str(iterations) + ', Run Time = ' + str(run_time) + ' secs.')" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Q2" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# Loop through pictures and k values using norm = 1\n", 291 | "k_vals = [2,4,8,16]\n", 292 | "\n", 293 | "image = ['data/GeorgiaTech.bmp', 'data/football.bmp', 'data/beach.jpg']\n", 294 | "\n", 295 | "for k in k_vals:\n", 296 | " for pic in image: \n", 297 | " \n", 298 | " # Read image\n", 299 | " original_image = imageio.imread(pic)\n", 300 | " \n", 301 | " # Run kmeans\n", 302 | " cluster_label, cluster_center, comp_image, iterations, run_time = run_kmeans(original_image, k, norm=1)\n", 303 | " \n", 304 | " # Export file\n", 305 | " pic_name = pic.split('/')[1].split('.')[0]\n", 306 | " matplotlib.image.imsave('Q2_output/Part_2/'+pic_name+'_'+str(k)+'.png', comp_image.astype(np.uint8))\n", 307 | " \n", 308 | " print(pic_name + ', k = ' + str(k) + ', Iterations = ' + str(iterations) + ', Run Time = ' + str(run_time) + ' secs.')" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "# 
Loop through pictures and k values using norm = 1\n", 318 | "\n", 319 | "k_vals = [16]\n", 320 | "\n", 321 | "image = ['data/beach.jpg', 'data/football.bmp' ]\n", 322 | "\n", 323 | "for k in k_vals:\n", 324 | " for pic in image: \n", 325 | " \n", 326 | " # Read image\n", 327 | " original_image = imageio.imread(pic)\n", 328 | " \n", 329 | " # Run kmeans\n", 330 | " cluster_label, cluster_center, comp_image, iterations, run_time = run_kmeans(original_image, k, norm=1)\n", 331 | " \n", 332 | " # Export file\n", 333 | " pic_name = pic.split('/')[1].split('.')[0]\n", 334 | " matplotlib.image.imsave('Q2_output/Part_2/'+pic_name+'_'+str(k)+'.png', comp_image.astype(np.uint8))\n", 335 | " \n", 336 | " print(pic_name + ', k = ' + str(k) + ', Iterations = ' + str(iterations) + ', Run Time = ' + str(run_time) + ' secs.')" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "## Political Blogs Dataset" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### Q1" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "import csv\n", 360 | "from sklearn.cluster import KMeans\n", 361 | "from collections import defaultdict\n", 362 | "from statistics import mode\n", 363 | "\n", 364 | "##### Define Functions #####\n", 365 | "def run_spec(L, k):\n", 366 | " \"\"\"\n", 367 | " Performs eigenvalue/eigenvector decomposition and performs kmeans based on value of k.\n", 368 | " \n", 369 | " Input:\n", 370 | " L = Laplacian \n", 371 | " k = number of clusters\n", 372 | " Output:\n", 373 | " kmeans object\n", 374 | " cluster labels \n", 375 | " \n", 376 | " Author's Note:\n", 377 | " Code was borrowed from test_football.py\n", 378 | " \n", 379 | " \"\"\"\n", 380 | " # Eigenvector Decomposition\n", 381 | " v, x = np.linalg.eig(L)\n", 382 | " idx = np.argsort(v)\n", 383 | " x = np.real(x[:, idx[-k:]]) 
# select the k largest eigenvectors\n", 384 | " x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1) # ensure all vectors are of unit length\n", 385 | " \n", 386 | " # Run kmeans\n", 387 | " kmeans = KMeans(n_clusters=k).fit(x)\n", 388 | " c_idx = kmeans.labels_\n", 389 | "\n", 390 | " return kmeans, c_idx\n", 391 | "\n", 392 | "def calc_metrics(nodes, labels):\n", 393 | " \"\"\"\n", 394 | " Calculates majority labels and mismatch rates for clusters.\n", 395 | " \n", 396 | " Input:\n", 397 | " Nodes = node list\n", 398 | " Labels = cluster labels (output from run_spec)\n", 399 | " Output:\n", 400 | " majority = dictionary, {cluster: majority political orientation}\n", 401 | " mismatch = dictionary, {cluster: mismatch rate}\n", 402 | " \"\"\"\n", 403 | " \n", 404 | " num_nodes = nodes.shape[0]\n", 405 | " cluster_groups = defaultdict()\n", 406 | " majority = {}\n", 407 | " mismatch = {}\n", 408 | " \n", 409 | " # Collect political orientation of blogs associated with each cluster\n", 410 | " for i in range(num_nodes):\n", 411 | " node = nodes[i]\n", 412 | " group = node[2].astype(int)\n", 413 | " cluster = labels[i]\n", 414 | " \n", 415 | " if cluster not in cluster_groups:\n", 416 | " cluster_groups[cluster] = list()\n", 417 | " \n", 418 | " cluster_groups[cluster].append(group)\n", 419 | " \n", 420 | " # Majority Label\n", 421 | " for key in cluster_groups.keys():\n", 422 | " majority[key]=mode(cluster_groups[key])\n", 423 | " \n", 424 | " # Calculate Match Rate\n", 425 | " for key in cluster_groups.keys():\n", 426 | " value = cluster_groups[key]\n", 427 | " most_common = majority[key]\n", 428 | " \n", 429 | " total = len(value)\n", 430 | " count = 0\n", 431 | " \n", 432 | " # Calculate Mismatch Rate\n", 433 | " for i in range(total):\n", 434 | " if value[i] == most_common:\n", 435 | " count += 1\n", 436 | " \n", 437 | " # Return mismatch rate to dictionary\n", 438 | " mismatch[key] = float(format(1 - count/total, '.4f'))\n", 439 | " \n", 440 | 
" return majority, mismatch " 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "# Load txt files\n", 450 | "# Source: https://stackoverflow.com/questions/16989647/importing-large-tab-delimited-txt-file-into-python/16999000\n", 451 | "\n", 452 | "with open('data/nodes.txt') as f:\n", 453 | " reader = csv.reader(f, delimiter=\"\\t\")\n", 454 | " nodes0 = list(reader) # Schema = [\"Node\", \"Site\", \"Orientation\", \"Tags\"]\n", 455 | "\n", 456 | "with open('data/edges.txt') as f:\n", 457 | " reader = csv.reader(f, delimiter=\"\\t\")\n", 458 | " edges0 = list(reader) # Schema = Start, End\n", 459 | "\n", 460 | "# Change data types\n", 461 | "edges = np.array(edges0).astype(int)\n", 462 | "\n", 463 | "nodes = nodes0\n", 464 | "for i in range(len(nodes0)): \n", 465 | " nodes[i][0] = int(nodes0[i][0])\n", 466 | " nodes[i][2] = int(nodes0[i][2])\n", 467 | "\n", 468 | "# Create Adjacency Matrix \n", 469 | "n = len(nodes)\n", 470 | "A = np.zeros(shape=(n,n),dtype='object')\n", 471 | "\n", 472 | "for edge in edges:\n", 473 | " i = (edge-1)[0]\n", 474 | " j = (edge-1)[1]\n", 475 | " A[i,j] = 1\n", 476 | " A[j,i] = 1\n", 477 | "\n", 478 | "# Create Degree matrix\n", 479 | "d_i = A.sum(axis=0)\n", 480 | "D = np.diagflat(d_i)\n", 481 | "\n", 482 | "# Get index of nodes that don't have any connections\n", 483 | "del_list = []\n", 484 | "\n", 485 | "for i in range(len(d_i)):\n", 486 | " if d_i[i] == 0:\n", 487 | " del_list.append(i)\n", 488 | " \n", 489 | "# Remove corresponding rows and columns from D, A and nodes\n", 490 | "A = np.delete(A, del_list, 0)\n", 491 | "A = np.delete(A, del_list, 1)\n", 492 | "D = np.delete(D, del_list, 0)\n", 493 | "D = np.delete(D, del_list, 1)\n", 494 | "nodes = np.delete(nodes, del_list, 0)\n", 495 | "\n", 496 | "# Calculate Laplacian: L = D-A\n", 497 | "L = (D-A).astype(float)\n" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | 
"execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# Loop through different values of k\n", 507 | "k_vals = [2,5,10,20]\n", 508 | "\n", 509 | "for k in k_vals:\n", 510 | " kmeans_obj, labels = run_spec(L, k)\n", 511 | " majority, mismatch = calc_metrics(nodes, labels)\n", 512 | " \n", 513 | " avg = mean(mismatch.values())\n", 514 | " min_rate = min(mismatch.values())\n", 515 | " max_rate = max(mismatch.values())\n", 516 | " \n", 517 | " print(\"k = \" + str(k))\n", 518 | " print(\"Majority: \" + str(dict(sorted(majority.items()))))\n", 519 | " print(\"Mismatch Rates: \" + str(dict(sorted(mismatch.items()))))\n", 520 | " print(\"Average of Mismatch Rates = \" + str(avg))\n", 521 | " print(\"Spread of Mismatch Rates = (\" + str(min_rate) + \", \" + str(max_rate) + \") = \" + str(max_rate-min_rate))\n", 522 | " print(\"\")" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": {}, 528 | "source": [ 529 | "### Q2" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "for k in range(2,10):\n", 539 | " kmeans_obj, labels = run_spec(L, k)\n", 540 | " majority, mismatch = calc_metrics(nodes, labels)\n", 541 | " \n", 542 | " avg = mean(mismatch.values())\n", 543 | " min_rate = min(mismatch.values())\n", 544 | " max_rate = max(mismatch.values())\n", 545 | " \n", 546 | " print(\"k = \" + str(k))\n", 547 | " print(\"Majority: \" + str(dict(sorted(majority.items()))))\n", 548 | " print(\"Mismatch Rates: \" + str(dict(sorted(mismatch.items()))))\n", 549 | " print(\"Average of Mismatch Rates = \" + str(avg))\n", 550 | " print(\"Spread of Mismatch Rates = (\" + str(min_rate) + \", \" + str(max_rate) + \") = \" + str(max_rate-min_rate))\n", 551 | " print(\"\")" 552 | ] 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3 (ipykernel)", 558 | "language": "python", 559 | "name": "python3" 
560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.9.7" 572 | } 573 | }, 574 | "nbformat": 4, 575 | "nbformat_minor": 4 576 | } 577 | -------------------------------------------------------------------------------- /Canlapan_Inah_HW4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4c38567e", 6 | "metadata": {}, 7 | "source": [ 8 | "# HW 4" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "7d5b7969", 14 | "metadata": {}, 15 | "source": [ 16 | "## 2. Comparing Classifiers\n", 17 | "\n", 18 | "### 2.1 Divorce Classification " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "id": "040482de", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "from sklearn.model_selection import train_test_split\n", 32 | "from sklearn.neighbors import KNeighborsClassifier \n", 33 | "from sklearn.linear_model import LogisticRegression\n", 34 | "from sklearn.naive_bayes import GaussianNB\n", 35 | "from sklearn.decomposition import PCA\n", 36 | "from mlxtend.plotting import plot_decision_regions\n", 37 | "from sklearn.model_selection import GridSearchCV" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "id": "ef093ea5", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "data = pd.read_csv('data/marriage.csv', header = None)\n", 48 | "data.rename(columns = {54: 'y'}, inplace = True)\n", 49 | "\n", 50 | "# Split test/train data\n", 51 | "X = data.iloc[:,0:54]\n", 52 | "y = data.iloc[:,54:55]\n", 53 | "X_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.2, random_state=4264)\n", 54 | "\n", 55 | "X_train = np.array(X_train)\n", 56 | "X_test = np.array(X_test)\n", 57 | "y_train = np.array(y_train).reshape(len(y_train)).astype(int)\n", 58 | "y_test = np.array(y_test).reshape(len(y_test)).astype(int)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "15a69278", 64 | "metadata": {}, 65 | "source": [ 66 | "### Part A\n", 67 | "Report testing accuracy for each of the three classifiers. Comment on their performance: which performs the best and make a guess why they perform the best in this setting. " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "id": "e0a95a95", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "Training Accuracy: 1.0\n", 81 | "Test Accuracy: 0.9411764705882353\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# Logistic Regression - from heart.py\n", 87 | "lr_model = LogisticRegression(max_iter=200, solver='liblinear')\n", 88 | "lr_model.fit(X_train, y_train)\n", 89 | "lr_score_train = lr_model.score(X_train, y_train)\n", 90 | "lr_score_test = lr_model.score(X_test, y_test)\n", 91 | "\n", 92 | "print('Training Accuracy: ' + str(lr_score_train))\n", 93 | "print('Test Accuracy: ' + str(lr_score_test))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "id": "1e60f1f9", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Training Accuracy: 0.9779411764705882\n", 107 | "Test Accuracy: 0.9705882352941176\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# KNN - from plot_classifier_comparison.py\n", 113 | "knn = KNeighborsClassifier(n_neighbors=5)\n", 114 | "knn.fit(X_train, y_train)\n", 115 | "knn_score_train = knn.score(X_train, y_train)\n", 116 | "knn_score_test = knn.score(X_test, y_test)\n", 117 | "\n", 118 | "print('Training Accuracy: ' + 
str(knn_score_train))\n", 119 | "print('Test Accuracy: ' + str(knn_score_test))" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "id": "5050ab49", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Training Accuracy: 0.9779411764705882\n", 133 | "Test Accuracy: 0.9705882352941176\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# Naive Bayes - from plot_classifier_comparison.py\n", 139 | "naive_bayes = GaussianNB(var_smoothing=0.001)\n", 140 | "naive_bayes.fit(X_train, y_train)\n", 141 | "nb_score_train = naive_bayes.score(X_train, y_train)\n", 142 | "nb_score_test = naive_bayes.score(X_test, y_test)\n", 143 | "print('Training Accuracy: ' + str(nb_score_train))\n", 144 | "print('Test Accuracy: ' + str(nb_score_test))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "e35033da", 150 | "metadata": {}, 151 | "source": [ 152 | "### Part B\n", 153 | "Now perform PCA to project the data into two-dimensional space. Build the classifiers (Naive Bayes, Logistic Regression, and KNN) using the two-dimensional PCA results. Plot the data points and decision boundary of each classifier in the two-dimensional space. Comment on the difference between the decision boundary for the three classifiers. Please clearly represent the data points with different labels using different colors." 
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "id": "4274eeaa", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# Use PCA package to perform PCA\n", 164 | "pca = PCA(n_components=2)\n", 165 | "data_redux = pca.fit_transform(X)\n", 166 | "\n", 167 | "# Split into test/train sets \n", 168 | "X_train, X_test, y_train, y_test = train_test_split(data_redux, y, test_size=0.2, random_state=4264)\n", 169 | "\n", 170 | "X_train = np.array(X_train)\n", 171 | "X_test = np.array(X_test)\n", 172 | "y_train = np.array(y_train).reshape(len(y_train)).astype(int)\n", 173 | "y_test = np.array(y_test).reshape(len(y_test)).astype(int)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "id": "2fce14f7", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Training Accuracy: 0.9779411764705882\n", 187 | "Test Accuracy: 0.9705882352941176\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "# Logistic Regression - from heart.py\n", 193 | "lr_model = LogisticRegression(max_iter=200, solver='liblinear')\n", 194 | "lr_model.fit(X_train, y_train)\n", 195 | "lr_score_train = lr_model.score(X_train, y_train)\n", 196 | "lr_score_test = lr_model.score(X_test, y_test)\n", 197 | "\n", 198 | "print('Training Accuracy: ' + str(lr_score_train))\n", 199 | "print('Test Accuracy: ' + str(lr_score_test))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "id": "54c535e3", 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "Training Accuracy: 0.9852941176470589\n", 213 | "Test Accuracy: 0.9705882352941176\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "# KNN - from plot_classifier_comparison.py\n", 219 | "knn = KNeighborsClassifier(n_neighbors=5)\n", 220 | "knn.fit(X_train, y_train)\n", 221 | "knn_score_train = 
knn.score(X_train, y_train)\n", 222 | "knn_score_test = knn.score(X_test, y_test)\n", 223 | "\n", 224 | "print('Training Accuracy: ' + str(knn_score_train))\n", 225 | "print('Test Accuracy: ' + str(knn_score_test))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 9, 231 | "id": "38ef7b60", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Training Accuracy: 0.9779411764705882\n", 239 | "Test Accuracy: 0.9705882352941176\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "# Naive Bayes - from plot_classifier_comparison.py\n", 245 | "naive_bayes = GaussianNB()\n", 246 | "naive_bayes.fit(X_train, y_train)\n", 247 | "nb_score_train = naive_bayes.score(X_train, y_train)\n", 248 | "nb_score_test = naive_bayes.score(X_test, y_test)\n", 249 | "\n", 250 | "print('Training Accuracy: ' + str(nb_score_train))\n", 251 | "print('Test Accuracy: ' + str(nb_score_test))" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 10, 257 | "id": "52bab359", 258 | "metadata": { 259 | "scrolled": true 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "Text(0.5, 1.0, 'Naive Bayes')" 266 | ] 267 | }, 268 | "execution_count": 10, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | }, 272 | { 273 | "data": { 274 | "image/png": "\n", 275 | "text/plain": [ 276 | "
" 277 | ] 278 | }, 279 | "metadata": { 280 | "needs_background": "light" 281 | }, 282 | "output_type": "display_data" 283 | } 284 | ], 285 | "source": [ 286 | "# http://rasbt.github.io/mlxtend/user_guide/plotting/plot_decision_regions/\n", 287 | "import matplotlib.gridspec as gridspec\n", 288 | "gs = gridspec.GridSpec(1, 3)\n", 289 | "fig = plt.figure(figsize=(10,3))\n", 290 | "\n", 291 | "ax = plt.subplot(gs[0, 0])\n", 292 | "fig = plot_decision_regions(X_test, y_test, clf=lr_model, legend=2)\n", 293 | "plt.title('Logistic Regression')\n", 294 | "\n", 295 | "ax = plt.subplot(gs[0, 1])\n", 296 | "fig = plot_decision_regions(X_test, y_test, clf=knn, legend=2)\n", 297 | "plt.title('KNN, k=5')\n", 298 | "\n", 299 | "ax = plt.subplot(gs[0, 2])\n", 300 | "fig = plot_decision_regions(X_test, y_test, clf=naive_bayes, legend=2)\n", 301 | "plt.title('Naive Bayes')" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "id": "baf9b80f", 307 | "metadata": {}, 308 | "source": [ 309 | "### 2.2 Handwritten Digits Classification\n", 310 | "\n", 311 | "(b) Comment on the performance of the classifiers and give your explanation why some of them perform better than the others.\n", 312 | "\n" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 11, 318 | "id": "3b3f7a75", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "import scipy.io as spio\n", 323 | "import random\n", 324 | "from sklearn.svm import SVC\n", 325 | "from sklearn.neural_network import MLPClassifier\n", 326 | "from sklearn.metrics import classification_report\n", 327 | "from sklearn.metrics import confusion_matrix\n", 328 | "from sklearn.metrics import accuracy_score" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 12, 334 | "id": "05d22b38", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "# Import Data\n", 339 | "mnist = spio.loadmat('data/mnist_10digits.mat',squeeze_me=True)\n", 340 | "\n", 341 | "X_train = 
mnist['xtrain'] # each row is an image\n", 342 | "X_test = mnist['xtest']\n", 343 | "y_train = mnist['ytrain'].ravel()\n", 344 | "y_test = mnist['ytest'].ravel()\n", 345 | "\n", 346 | "# Scale X_train and X_test by 1/255 (presumably maps 0-255 pixel values to [0, 1])\n", 347 | "X_train = X_train/255\n", 348 | "X_test = X_test/255\n", 349 | "\n", 350 | "# Downsample (NOTE: seed is assigned below but not passed to random.seed() in this cell, so the sample may differ between runs)\n", 351 | "seed = 4624\n", 352 | "index = random.sample(range(60000), 5000)\n", 353 | "\n", 354 | "X_train_down = X_train[index]\n", 355 | "y_train_down = y_train[index].ravel()" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "id": "7e6c0ec1", 361 | "metadata": {}, 362 | "source": [ 363 | "### Part A\n", 364 | "Report confusion matrix, precision, recall, and F-1 score for each of the classifiers. For precision, recall, and F-1 score of each classifier, we will need to report these for each of the digits. So you can create a table for this. For this question, each of the 5 classifiers, KNN, logistic regression, SVM, kernel SVM, and neural networks, accounts for 10 points." 
365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 14, 370 | "id": "a83d5690", 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "Best n_neighbors: 1\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "# Tune KNN using GridSearchCV \n", 383 | "# https://medium.datadriveninvestor.com/k-nearest-neighbors-in-python-hyperparameters-tuning-716734bc557f\n", 384 | "\n", 385 | "n_neighbors = list(range(1,30))\n", 386 | "\n", 387 | "#Convert to dictionary\n", 388 | "hyperparameters = dict(n_neighbors=n_neighbors)\n", 389 | "\n", 390 | "#Create new KNN object\n", 391 | "knn_2 = KNeighborsClassifier()\n", 392 | "\n", 393 | "#Use GridSearch\n", 394 | "clf = GridSearchCV(knn_2, hyperparameters, cv=10)\n", 395 | "\n", 396 | "#Fit the model\n", 397 | "best_model = clf.fit(X_train_down,y_train_down)\n", 398 | "\n", 399 | "#Print The value of best Hyperparameters\n", 400 | "print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])\n", 401 | "\n", 402 | "# Current results = 1; previous results = 3" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 18, 408 | "id": "004c28f4", 409 | "metadata": { 410 | "scrolled": false 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "KNN, k=3\n", 418 | "Training Accuracy: 0.9363833333333333\n", 419 | "Test Accuracy: 0.9366\n", 420 | " precision recall f1-score support\n", 421 | "\n", 422 | " 0 0.95 0.99 0.97 980\n", 423 | " 1 0.90 1.00 0.94 1135\n", 424 | " 2 0.97 0.91 0.94 1032\n", 425 | " 3 0.92 0.93 0.92 1010\n", 426 | " 4 0.95 0.91 0.93 982\n", 427 | " 5 0.93 0.91 0.92 892\n", 428 | " 6 0.96 0.97 0.97 958\n", 429 | " 7 0.94 0.94 0.94 1028\n", 430 | " 8 0.97 0.87 0.92 974\n", 431 | " 9 0.91 0.93 0.92 1009\n", 432 | "\n", 433 | " accuracy 0.94 10000\n", 434 | " macro avg 0.94 0.94 0.94 10000\n", 435 | "weighted avg 0.94 0.94 0.94 
10000\n", 436 | "\n", 437 | "[[ 966 1 1 0 0 3 7 1 1 0]\n", 438 | " [ 0 1130 1 1 1 1 1 0 0 0]\n", 439 | " [ 16 32 937 9 5 1 3 21 8 0]\n", 440 | " [ 3 11 10 936 1 26 0 10 8 5]\n", 441 | " [ 1 22 0 0 896 0 9 1 2 51]\n", 442 | " [ 6 8 1 33 3 815 11 3 3 9]\n", 443 | " [ 8 5 0 1 6 5 933 0 0 0]\n", 444 | " [ 0 35 5 1 4 0 0 964 0 19]\n", 445 | " [ 15 9 9 27 11 28 2 12 847 14]\n", 446 | " [ 6 8 2 10 20 0 2 19 0 942]]\n", 447 | "Logistic Regression\n", 448 | "Training Accuracy: 0.9392666666666667\n", 449 | "Test Accuracy: 0.9256\n", 450 | " precision recall f1-score support\n", 451 | "\n", 452 | " 0 0.95 0.97 0.96 980\n", 453 | " 1 0.96 0.98 0.97 1135\n", 454 | " 2 0.93 0.90 0.91 1032\n", 455 | " 3 0.90 0.92 0.91 1010\n", 456 | " 4 0.94 0.94 0.94 982\n", 457 | " 5 0.90 0.87 0.88 892\n", 458 | " 6 0.94 0.95 0.95 958\n", 459 | " 7 0.93 0.92 0.93 1028\n", 460 | " 8 0.88 0.88 0.88 974\n", 461 | " 9 0.91 0.92 0.91 1009\n", 462 | "\n", 463 | " accuracy 0.93 10000\n", 464 | " macro avg 0.92 0.92 0.92 10000\n", 465 | "weighted avg 0.93 0.93 0.93 10000\n", 466 | "\n", 467 | "[[ 955 0 2 4 1 10 4 3 1 0]\n", 468 | " [ 0 1110 5 2 0 2 3 2 11 0]\n", 469 | " [ 6 9 930 14 10 3 12 10 34 4]\n", 470 | " [ 4 1 16 925 1 23 2 10 19 9]\n", 471 | " [ 1 3 7 3 921 0 6 5 6 30]\n", 472 | " [ 9 2 3 35 10 777 15 6 31 4]\n", 473 | " [ 8 3 8 2 6 16 912 2 1 0]\n", 474 | " [ 1 7 23 7 6 1 0 947 4 32]\n", 475 | " [ 9 11 6 22 7 29 13 10 855 12]\n", 476 | " [ 9 8 1 9 21 7 0 21 9 924]]\n", 477 | "SVM\n", 478 | "Training Accuracy: 0.9142\n", 479 | "Test Accuracy: 0.9121\n", 480 | " precision recall f1-score support\n", 481 | "\n", 482 | " 0 0.94 0.98 0.96 980\n", 483 | " 1 0.95 0.98 0.97 1135\n", 484 | " 2 0.90 0.90 0.90 1032\n", 485 | " 3 0.87 0.90 0.89 1010\n", 486 | " 4 0.89 0.92 0.91 982\n", 487 | " 5 0.87 0.86 0.87 892\n", 488 | " 6 0.94 0.95 0.94 958\n", 489 | " 7 0.91 0.92 0.91 1028\n", 490 | " 8 0.91 0.84 0.88 974\n", 491 | " 9 0.91 0.86 0.88 1009\n", 492 | "\n", 493 | " accuracy 0.91 10000\n", 494 | " 
macro avg 0.91 0.91 0.91 10000\n", 495 | "weighted avg 0.91 0.91 0.91 10000\n", 496 | "\n", 497 | "[[ 959 0 2 2 0 11 5 1 0 0]\n", 498 | " [ 0 1116 4 5 0 4 1 3 2 0]\n", 499 | " [ 14 13 931 17 16 2 15 9 14 1]\n", 500 | " [ 6 3 19 908 2 31 2 11 23 5]\n", 501 | " [ 2 4 8 1 908 1 13 8 1 36]\n", 502 | " [ 12 3 5 58 10 766 11 4 18 5]\n", 503 | " [ 8 4 13 0 9 14 906 0 4 0]\n", 504 | " [ 2 13 23 7 7 1 0 942 3 30]\n", 505 | " [ 10 8 15 29 17 43 9 10 820 13]\n", 506 | " [ 5 9 9 11 46 6 0 45 13 865]]\n", 507 | "Kernel SVM\n", 508 | "Training Accuracy: 0.9526\n", 509 | "Test Accuracy: 0.9545\n", 510 | " precision recall f1-score support\n", 511 | "\n", 512 | " 0 0.96 0.98 0.97 980\n", 513 | " 1 0.98 0.99 0.98 1135\n", 514 | " 2 0.96 0.94 0.95 1032\n", 515 | " 3 0.94 0.96 0.95 1010\n", 516 | " 4 0.93 0.96 0.95 982\n", 517 | " 5 0.95 0.95 0.95 892\n", 518 | " 6 0.96 0.96 0.96 958\n", 519 | " 7 0.96 0.94 0.95 1028\n", 520 | " 8 0.94 0.94 0.94 974\n", 521 | " 9 0.94 0.92 0.93 1009\n", 522 | "\n", 523 | " accuracy 0.95 10000\n", 524 | " macro avg 0.95 0.95 0.95 10000\n", 525 | "weighted avg 0.95 0.95 0.95 10000\n", 526 | "\n", 527 | "[[ 964 0 2 0 0 8 4 1 1 0]\n", 528 | " [ 0 1123 2 3 0 1 3 0 3 0]\n", 529 | " [ 8 0 970 10 9 0 7 8 20 0]\n", 530 | " [ 0 1 7 968 0 10 1 10 11 2]\n", 531 | " [ 1 0 5 0 941 0 5 3 2 25]\n", 532 | " [ 3 1 3 20 4 843 8 2 5 3]\n", 533 | " [ 9 3 2 0 8 10 924 0 2 0]\n", 534 | " [ 1 12 18 3 7 0 0 962 3 22]\n", 535 | " [ 3 1 3 13 7 16 6 4 918 3]\n", 536 | " [ 10 6 1 9 31 2 0 9 9 932]]\n", 537 | "Neural Networks\n", 538 | "Training Accuracy: 0.95985\n", 539 | "Test Accuracy: 0.9572\n", 540 | " precision recall f1-score support\n", 541 | "\n", 542 | " 0 0.96 0.98 0.97 980\n", 543 | " 1 0.98 0.98 0.98 1135\n", 544 | " 2 0.96 0.95 0.95 1032\n", 545 | " 3 0.95 0.96 0.96 1010\n", 546 | " 4 0.95 0.96 0.96 982\n", 547 | " 5 0.97 0.93 0.95 892\n", 548 | " 6 0.95 0.96 0.96 958\n", 549 | " 7 0.97 0.93 0.95 1028\n", 550 | " 8 0.95 0.96 0.95 974\n", 551 | " 9 0.94 0.95 0.95 
1009\n", 552 | "\n", 553 | " accuracy 0.96 10000\n", 554 | " macro avg 0.96 0.96 0.96 10000\n", 555 | "weighted avg 0.96 0.96 0.96 10000\n", 556 | "\n", 557 | "[[ 963 1 0 0 0 4 6 1 3 2]\n", 558 | " [ 0 1112 3 4 0 0 3 2 11 0]\n", 559 | " [ 12 2 976 7 7 0 11 9 8 0]\n", 560 | " [ 0 0 10 973 0 4 2 8 12 1]\n", 561 | " [ 1 0 3 0 944 0 9 2 2 21]\n", 562 | " [ 8 1 1 20 2 829 12 3 9 7]\n", 563 | " [ 9 3 3 0 7 9 924 0 3 0]\n", 564 | " [ 2 9 19 7 7 0 0 960 0 24]\n", 565 | " [ 5 1 3 5 6 6 9 5 931 3]\n", 566 | " [ 7 6 1 11 16 2 1 4 1 960]]\n" 567 | ] 568 | } 569 | ], 570 | "source": [ 571 | "# from plot_classifier_comparison.py\n", 572 | "names = ['KNN, k=3', 'Logistic Regression', 'SVM', 'Kernel SVM', 'Neural Networks']\n", 573 | "\n", 574 | "classifiers = [KNeighborsClassifier(3), \n", 575 | " LogisticRegression(solver='newton-cg'),\n", 576 | " SVC(kernel=\"linear\"),\n", 577 | " SVC(kernel='rbf'),\n", 578 | " MLPClassifier(alpha=1, hidden_layer_sizes = (20, 10))\n", 579 | " ]\n", 580 | "\n", 581 | "# Iterate over classifiers\n", 582 | "for i in range(len(names)):\n", 583 | " clf_name = names[i]\n", 584 | " clf = classifiers[i]\n", 585 | " \n", 586 | " print(clf_name)\n", 587 | " \n", 588 | " # Fit the model\n", 589 | " if (clf_name == 'KNN, k=3') | (clf_name == 'SVM') | (clf_name == 'Kernel SVM'):\n", 590 | " clf.fit(X_train_down, y_train_down)\n", 591 | " else:\n", 592 | " clf.fit(X_train, y_train)\n", 593 | " \n", 594 | " # Predict Test Records \n", 595 | " y_pred_train = clf.predict(X_train)\n", 596 | " y_pred_test = clf.predict(X_test)\n", 597 | " \n", 598 | " # Accuracy \n", 599 | " acc_train = accuracy_score(y_train, y_pred_train)\n", 600 | " acc_test = accuracy_score(y_test, y_pred_test)\n", 601 | " print('Training Accuracy: ' + str(acc_train))\n", 602 | " print('Test Accuracy: ' + str(acc_test))\n", 603 | " \n", 604 | " # Scores \n", 605 | " # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html\n", 606 | " 
print(classification_report(y_test, y_pred_test))\n", 607 | " \n", 608 | " # Confusion Matrix\n", 609 | " # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html\n", 610 | " print(confusion_matrix(y_test, y_pred_test))\n" 611 | ] 612 | } 613 | ], 614 | "metadata": { 615 | "kernelspec": { 616 | "display_name": "Python 3 (ipykernel)", 617 | "language": "python", 618 | "name": "python3" 619 | }, 620 | "language_info": { 621 | "codemirror_mode": { 622 | "name": "ipython", 623 | "version": 3 624 | }, 625 | "file_extension": ".py", 626 | "mimetype": "text/x-python", 627 | "name": "python", 628 | "nbconvert_exporter": "python", 629 | "pygments_lexer": "ipython3", 630 | "version": "3.9.12" 631 | } 632 | }, 633 | "nbformat": 4, 634 | "nbformat_minor": 5 635 | } 636 | -------------------------------------------------------------------------------- /Property Style Reclassification/Property Style Reclassification - Final.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "737626aa", 6 | "metadata": {}, 7 | "source": [ 8 | "# Property Style Reclassification\n", 9 | "\n", 10 | "**1. Organize Data**\n", 11 | "\n", 12 | "Training/Validation Data = rps_property_data.csv\n", 13 | "Test Data = rps_property_data_rlp.csv\n", 14 | " \n", 15 | "**2. Train Models**\n", 16 | "\n", 17 | "**3. Test on Validation**\n", 18 | "\n", 19 | "**4. Test on RLP Data with labels**\n", 20 | "\n", 21 | "**5. 
Test on RLP Data without labels**" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "0b026b9c", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import warnings\n", 36 | "from datetime import datetime\n", 37 | "warnings.filterwarnings(\"ignore\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "b2d00e22", 43 | "metadata": {}, 44 | "source": [ 45 | "## Data Cleaning, Conversion and Partitioning" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "84fca9e5", 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "(1601086, 16)" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# Import training data\n", 67 | "property_data = pd.read_csv('rps_property_data.csv')\n", 68 | "property_data = property_data.replace({np.nan: None})\n", 69 | "property_data = property_data.rename(columns = {'Unnamed: 0':'RecordID'})\n", 70 | "property_data = property_data[(property_data['PROPERTY_STYLE'].notnull()) & \\\n", 71 | " (property_data['PROPERTY_STYLE']!='8_OTHER') & \\\n", 72 | " (property_data['PROVINCE'].notnull()) & \\\n", 73 | " (property_data['LATITUDE'].notnull()) & \\\n", 74 | " (property_data['LONGITUDE'].notnull()) \n", 75 | " ]\n", 76 | "property_data.shape" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "id": "45fe6bca", 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | "
RecordIDFULL_ADDRESSSTREET_ADDRESSSUITESTREET_NUMBERSTREET_NAMESTREET_TYPESTREET_DIRECTIONCITYPROVINCEPOSTAL_CODELATITUDELONGITUDEDISSEMINATION_AREAPROPERTY_STYLETEST_FLAG
1152 AUSTIN DR, WATERLOO ON N2L 3Y352 AUSTIN DRNone52AUSTINDRNoneWATERLOOONN2L3Y343.4817-80.5322353000621_SFD0
223 PEACHDALE AVE, ST CATHARINES ON L2M 5M23 PEACHDALE AVENone3PEACHDALEAVENoneST CATHARINESONL2M5M243.1737-79.2287352605131_SFD0
331093 PRINCIPALE RTE, AUMOND, QC J0W1W01093 PRINCIPALE RTENone1093PRINCIPALERTENoneAUMONDQCJ0W1W046.5425-75.8237248300601_SFD0
44106-10 BASSETT BLVD, WHITBY ON L1N 9C510 BASSETT BLVD10610BASSETTBLVDNoneWHITBYONL1N9C543.8946-78.9441351806024_ROW0
55122 QUAIL RUN DR, DORCHESTER, ON N0L1G3122 QUAIL RUN DRNone122QUAIL RUNDRNoneDORCHESTERONN0L1G342.978-81.0429353908711_SFD0
\n", 222 | "
" 223 | ], 224 | "text/plain": [ 225 | " RecordID FULL_ADDRESS STREET_ADDRESS \\\n", 226 | "1 1 52 AUSTIN DR, WATERLOO ON N2L 3Y3 52 AUSTIN DR \n", 227 | "2 2 3 PEACHDALE AVE, ST CATHARINES ON L2M 5M2 3 PEACHDALE AVE \n", 228 | "3 3 1093 PRINCIPALE RTE, AUMOND, QC J0W1W0 1093 PRINCIPALE RTE \n", 229 | "4 4 106-10 BASSETT BLVD, WHITBY ON L1N 9C5 10 BASSETT BLVD \n", 230 | "5 5 122 QUAIL RUN DR, DORCHESTER, ON N0L1G3 122 QUAIL RUN DR \n", 231 | "\n", 232 | " SUITE STREET_NUMBER STREET_NAME STREET_TYPE STREET_DIRECTION CITY \\\n", 233 | "1 None 52 AUSTIN DR None WATERLOO \n", 234 | "2 None 3 PEACHDALE AVE None ST CATHARINES \n", 235 | "3 None 1093 PRINCIPALE RTE None AUMOND \n", 236 | "4 106 10 BASSETT BLVD None WHITBY \n", 237 | "5 None 122 QUAIL RUN DR None DORCHESTER \n", 238 | "\n", 239 | " PROVINCE POSTAL_CODE LATITUDE LONGITUDE DISSEMINATION_AREA PROPERTY_STYLE \\\n", 240 | "1 ON N2L3Y3 43.4817 -80.5322 35300062 1_SFD \n", 241 | "2 ON L2M5M2 43.1737 -79.2287 35260513 1_SFD \n", 242 | "3 QC J0W1W0 46.5425 -75.8237 24830060 1_SFD \n", 243 | "4 ON L1N9C5 43.8946 -78.9441 35180602 4_ROW \n", 244 | "5 ON N0L1G3 42.978 -81.0429 35390871 1_SFD \n", 245 | "\n", 246 | " TEST_FLAG \n", 247 | "1 0 \n", 248 | "2 0 \n", 249 | "3 0 \n", 250 | "4 0 \n", 251 | "5 0 " 252 | ] 253 | }, 254 | "execution_count": 3, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "property_data.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 5, 266 | "id": "8e9a447b", 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# Convert Categorical/String to Numerical\n", 271 | "property_data_model = property_data[['SUITE', 'STREET_NUMBER',\n", 272 | " 'STREET_NAME', 'STREET_TYPE', 'STREET_DIRECTION', 'CITY', 'PROVINCE',\n", 273 | " 'POSTAL_CODE', 'LATITUDE', 'LONGITUDE', 'DISSEMINATION_AREA',\n", 274 | " 'PROPERTY_STYLE']]\n", 275 | "\n", 276 | "property_data_model['SUITE'] = 
pd.factorize(property_data_model['SUITE'])[0]\n", 277 | "property_data_model['STREET_NUMBER'] = pd.factorize(property_data_model['STREET_NUMBER'])[0]\n", 278 | "property_data_model['STREET_NAME'] = pd.factorize(property_data_model['STREET_NAME'])[0]\n", 279 | "property_data_model['STREET_TYPE'] = pd.factorize(property_data_model['STREET_TYPE'])[0]\n", 280 | "property_data_model['STREET_DIRECTION'] = pd.factorize(property_data_model['STREET_DIRECTION'])[0]\n", 281 | "property_data_model['CITY'] = pd.factorize(property_data_model['CITY'])[0]\n", 282 | "property_data_model['PROVINCE'] = pd.factorize(property_data_model['PROVINCE'])[0]\n", 283 | "property_data_model['POSTAL_CODE'] = pd.factorize(property_data_model['POSTAL_CODE'])[0]\n", 284 | "property_data_model[\"LATITUDE\"] = pd.to_numeric(property_data_model[\"LATITUDE\"], downcast=\"float\")\n", 285 | "property_data_model[\"LONGITUDE\"] = pd.to_numeric(property_data_model[\"LONGITUDE\"], downcast=\"float\")\n", 286 | "property_data_model['PROPERTY_STYLE'] = property_data_model['PROPERTY_STYLE'].apply(lambda x: int(x[0]))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 4, 292 | "id": "3fcdfcd0", 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "array(['1_SFD', '4_ROW', '2_SD', '3_CONDOAPT', '5_PLEX', '6_MOBILE',\n", 299 | " '7_LAND'], dtype=object)" 300 | ] 301 | }, 302 | "execution_count": 4, 303 | "metadata": {}, 304 | "output_type": "execute_result" 305 | } 306 | ], 307 | "source": [ 308 | "property_data['PROPERTY_STYLE'].unique()" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 6, 314 | "id": "989a6500", 315 | "metadata": { 316 | "scrolled": true 317 | }, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "((1230533, 16), (534173, 16), (696360, 16))" 323 | ] 324 | }, 325 | "execution_count": 6, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 
331 | "# Import datasets to test final model on\n", 332 | "rlp_data = pd.read_csv('rps_property_data_rlp.csv')\n", 333 | "rlp_data = rlp_data.replace({np.nan: None})\n", 334 | "rlp_data = rlp_data[(rlp_data['PROVINCE'].notnull()) & \\\n", 335 | " (rlp_data['PROVINCE']!='CO') & \\\n", 336 | " (rlp_data['LATITUDE'].notnull()) & \\\n", 337 | " (rlp_data['LONGITUDE'].notnull())]\n", 338 | " \n", 339 | "rlp_nolabel = rlp_data[(rlp_data['PROPERTY_STYLE'].isnull()) | \\\n", 340 | " (rlp_data['PROPERTY_STYLE'] == '8_OTHER')]\n", 341 | "rlp_label = rlp_data[(rlp_data['PROPERTY_STYLE'].notnull()) & \\\n", 342 | " (rlp_data['PROPERTY_STYLE']!='8_OTHER')]\n", 343 | "\n", 344 | "rlp_data.shape, rlp_nolabel.shape, rlp_label.shape" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 7, 350 | "id": "9f72ea13", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Convert Categorical/String to Numerical\n", 355 | "rlp_nolabel_model = rlp_nolabel[['SUITE', 'STREET_NUMBER',\n", 356 | " 'STREET_NAME', 'STREET_TYPE', 'STREET_DIRECTION', 'CITY', 'PROVINCE',\n", 357 | " 'POSTAL_CODE', 'LATITUDE', 'LONGITUDE', 'DISSEMINATION_AREA']]\n", 358 | "\n", 359 | "rlp_nolabel_model['SUITE'] = pd.factorize(rlp_nolabel_model['SUITE'])[0]\n", 360 | "rlp_nolabel_model['STREET_NUMBER'] = pd.factorize(rlp_nolabel_model['STREET_NUMBER'])[0]\n", 361 | "rlp_nolabel_model['STREET_NAME'] = pd.factorize(rlp_nolabel_model['STREET_NAME'])[0]\n", 362 | "rlp_nolabel_model['STREET_TYPE'] = pd.factorize(rlp_nolabel_model['STREET_TYPE'])[0]\n", 363 | "rlp_nolabel_model['STREET_DIRECTION'] = pd.factorize(rlp_nolabel_model['STREET_DIRECTION'])[0]\n", 364 | "rlp_nolabel_model['CITY'] = pd.factorize(rlp_nolabel_model['CITY'])[0]\n", 365 | "rlp_nolabel_model['PROVINCE'] = pd.factorize(rlp_nolabel_model['PROVINCE'])[0]\n", 366 | "rlp_nolabel_model['POSTAL_CODE'] = pd.factorize(rlp_nolabel_model['POSTAL_CODE'])[0]\n", 367 | "rlp_nolabel_model[\"LATITUDE\"] = 
pd.to_numeric(rlp_nolabel_model[\"LATITUDE\"], downcast=\"float\")\n", 368 | "rlp_nolabel_model[\"LONGITUDE\"] = pd.to_numeric(rlp_nolabel_model[\"LONGITUDE\"], downcast=\"float\")\n", 369 | "\n", 370 | "rlp_label_model = rlp_label[['SUITE', 'STREET_NUMBER',\n", 371 | " 'STREET_NAME', 'STREET_TYPE', 'STREET_DIRECTION', 'CITY', 'PROVINCE',\n", 372 | " 'POSTAL_CODE', 'LATITUDE', 'LONGITUDE', 'DISSEMINATION_AREA',\n", 373 | " 'PROPERTY_STYLE']]\n", 374 | "\n", 375 | "rlp_label_model['SUITE'] = pd.factorize(rlp_label_model['SUITE'])[0]\n", 376 | "rlp_label_model['STREET_NUMBER'] = pd.factorize(rlp_label_model['STREET_NUMBER'])[0]\n", 377 | "rlp_label_model['STREET_NAME'] = pd.factorize(rlp_label_model['STREET_NAME'])[0]\n", 378 | "rlp_label_model['STREET_TYPE'] = pd.factorize(rlp_label_model['STREET_TYPE'])[0]\n", 379 | "rlp_label_model['STREET_DIRECTION'] = pd.factorize(rlp_label_model['STREET_DIRECTION'])[0]\n", 380 | "rlp_label_model['CITY'] = pd.factorize(rlp_label_model['CITY'])[0]\n", 381 | "rlp_label_model['PROVINCE'] = pd.factorize(rlp_label_model['PROVINCE'])[0]\n", 382 | "rlp_label_model['POSTAL_CODE'] = pd.factorize(rlp_label_model['POSTAL_CODE'])[0]\n", 383 | "rlp_label_model[\"LATITUDE\"] = pd.to_numeric(rlp_label_model[\"LATITUDE\"], downcast=\"float\")\n", 384 | "rlp_label_model[\"LONGITUDE\"] = pd.to_numeric(rlp_label_model[\"LONGITUDE\"], downcast=\"float\")\n", 385 | "rlp_label_model['PROPERTY_STYLE'] = rlp_label_model['PROPERTY_STYLE'].apply(lambda x: int(x[0]))" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "1ecbdb87", 391 | "metadata": {}, 392 | "source": [ 393 | "### Data Facts" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 8, 399 | "id": "370316f4", 400 | "metadata": { 401 | "scrolled": true 402 | }, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | "
FULL_ADDRESS
PROPERTY_STYLE
1_SFD1166357
2_SD61259
3_CONDOAPT217768
4_ROW137929
5_PLEX11770
6_MOBILE5947
7_LAND56
\n", 463 | "
" 464 | ], 465 | "text/plain": [ 466 | " FULL_ADDRESS\n", 467 | "PROPERTY_STYLE \n", 468 | "1_SFD 1166357\n", 469 | "2_SD 61259\n", 470 | "3_CONDOAPT 217768\n", 471 | "4_ROW 137929\n", 472 | "5_PLEX 11770\n", 473 | "6_MOBILE 5947\n", 474 | "7_LAND 56" 475 | ] 476 | }, 477 | "execution_count": 8, 478 | "metadata": {}, 479 | "output_type": "execute_result" 480 | } 481 | ], 482 | "source": [ 483 | "property_data[['PROPERTY_STYLE','FULL_ADDRESS']].groupby('PROPERTY_STYLE').count()" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 9, 489 | "id": "e3912c17", 490 | "metadata": { 491 | "scrolled": true 492 | }, 493 | "outputs": [ 494 | { 495 | "data": { 496 | "text/html": [ 497 | "
\n", 498 | "\n", 511 | "\n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | "
FULL_ADDRESS
PROPERTY_STYLE
1_SFD511076
2_SD27778
3_CONDOAPT94345
4_ROW56801
5_PLEX3044
6_MOBILE3307
7_LAND9
\n", 553 | "
" 554 | ], 555 | "text/plain": [ 556 | " FULL_ADDRESS\n", 557 | "PROPERTY_STYLE \n", 558 | "1_SFD 511076\n", 559 | "2_SD 27778\n", 560 | "3_CONDOAPT 94345\n", 561 | "4_ROW 56801\n", 562 | "5_PLEX 3044\n", 563 | "6_MOBILE 3307\n", 564 | "7_LAND 9" 565 | ] 566 | }, 567 | "execution_count": 9, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "rlp_label[['PROPERTY_STYLE','FULL_ADDRESS']].groupby('PROPERTY_STYLE').count()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 10, 579 | "id": "b00554e6", 580 | "metadata": {}, 581 | "outputs": [ 582 | { 583 | "data": { 584 | "text/html": [ 585 | "
\n", 586 | "\n", 599 | "\n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | "
FULL_ADDRESS
PROVINCE
AB168387
BC225951
MB27572
NB17317
NL9000
NS25991
NT29
NU13
ON742827
PE4889
QC354395
SK23355
YT1360
\n", 665 | "
" 666 | ], 667 | "text/plain": [ 668 | " FULL_ADDRESS\n", 669 | "PROVINCE \n", 670 | "AB 168387\n", 671 | "BC 225951\n", 672 | "MB 27572\n", 673 | "NB 17317\n", 674 | "NL 9000\n", 675 | "NS 25991\n", 676 | "NT 29\n", 677 | "NU 13\n", 678 | "ON 742827\n", 679 | "PE 4889\n", 680 | "QC 354395\n", 681 | "SK 23355\n", 682 | "YT 1360" 683 | ] 684 | }, 685 | "execution_count": 10, 686 | "metadata": {}, 687 | "output_type": "execute_result" 688 | } 689 | ], 690 | "source": [ 691 | "# Use distribution to confirm if we should stratify the test sample by province\n", 692 | "property_data[['PROVINCE','FULL_ADDRESS']].groupby('PROVINCE').count()" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 11, 698 | "id": "07e88e60", 699 | "metadata": {}, 700 | "outputs": [ 701 | { 702 | "data": { 703 | "text/html": [ 704 | "
\n", 705 | "\n", 718 | "\n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | "
FULL_ADDRESS
PROVINCE
AB111109
BC214764
MB44566
NB23747
NL13339
NS24297
NT5
ON663521
PE3621
QC107247
SK24312
YT5
\n", 780 | "
" 781 | ], 782 | "text/plain": [ 783 | " FULL_ADDRESS\n", 784 | "PROVINCE \n", 785 | "AB 111109\n", 786 | "BC 214764\n", 787 | "MB 44566\n", 788 | "NB 23747\n", 789 | "NL 13339\n", 790 | "NS 24297\n", 791 | "NT 5\n", 792 | "ON 663521\n", 793 | "PE 3621\n", 794 | "QC 107247\n", 795 | "SK 24312\n", 796 | "YT 5" 797 | ] 798 | }, 799 | "execution_count": 11, 800 | "metadata": {}, 801 | "output_type": "execute_result" 802 | } 803 | ], 804 | "source": [ 805 | "rlp_data[['PROVINCE','FULL_ADDRESS']].groupby('PROVINCE').count()" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 12, 811 | "id": "73d9d601", 812 | "metadata": {}, 813 | "outputs": [ 814 | { 815 | "data": { 816 | "text/html": [ 817 | "
\n", 818 | "\n", 831 | "\n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | "
FULL_ADDRESS
PROVINCE
AB47684
BC117059
MB23293
NB13735
NL6457
NS12637
NT4
ON248269
PE2030
QC50341
SK12662
YT2
\n", 893 | "
" 894 | ], 895 | "text/plain": [ 896 | " FULL_ADDRESS\n", 897 | "PROVINCE \n", 898 | "AB 47684\n", 899 | "BC 117059\n", 900 | "MB 23293\n", 901 | "NB 13735\n", 902 | "NL 6457\n", 903 | "NS 12637\n", 904 | "NT 4\n", 905 | "ON 248269\n", 906 | "PE 2030\n", 907 | "QC 50341\n", 908 | "SK 12662\n", 909 | "YT 2" 910 | ] 911 | }, 912 | "execution_count": 12, 913 | "metadata": {}, 914 | "output_type": "execute_result" 915 | } 916 | ], 917 | "source": [ 918 | "rlp_nolabel[['PROVINCE','FULL_ADDRESS']].groupby('PROVINCE').count()" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 15, 924 | "id": "9786cd38", 925 | "metadata": {}, 926 | "outputs": [], 927 | "source": [ 928 | "# Split the training data into train and test sets (test set used for model selection)\n", 929 | "X = property_data_model.iloc[:, 0:11]\n", 930 | "y = property_data_model.iloc[:, 11:12]\n", 931 | "\n", 932 | "# Stratify the split by Province\n", 933 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True, \\\n", 934 | " random_state=4624, stratify=X['PROVINCE'])" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": 16, 940 | "id": "151c46e0", 941 | "metadata": {}, 942 | "outputs": [ 943 | { 944 | "data": { 945 | "text/plain": [ 946 | "((800543, 11), (800543, 11))" 947 | ] 948 | }, 949 | "execution_count": 16, 950 | "metadata": {}, 951 | "output_type": "execute_result" 952 | } 953 | ], 954 | "source": [ 955 | "X_train.shape, X_test.shape" 956 | ] 957 | }, 958 | { 959 | "cell_type": "markdown", 960 | "id": "030f3df5", 961 | "metadata": {}, 962 | "source": [ 963 | "## Model Creation\n", 964 | "1. KNN\n", 965 | "2. Logistic Regression\n", 966 | "3. Random Forest\n", 967 | "4. 
Neural Network" 968 | ] 969 | }, 970 | { 971 | "cell_type": "code", 972 | "execution_count": 17, 973 | "id": "996660d3", 974 | "metadata": {}, 975 | "outputs": [], 976 | "source": [ 977 | "from sklearn.neighbors import KNeighborsClassifier \n", 978 | "from sklearn.linear_model import LogisticRegression\n", 979 | "from sklearn.model_selection import GridSearchCV\n", 980 | "from datetime import datetime" 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "id": "55fdb60d", 986 | "metadata": {}, 987 | "source": [ 988 | "### KNN\n", 989 | "Testing Manhattan and Euclidean distance to tune k." 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": null, 995 | "id": "ec1cdb1e", 996 | "metadata": { 997 | "scrolled": true 998 | }, 999 | "outputs": [ 1000 | { 1001 | "name": "stdout", 1002 | "output_type": "stream", 1003 | "text": [ 1004 | "2022-04-28 21:16:03.815036\n", 1005 | "[1, 2, 3, 4, 5]\n", 1006 | "1 euclidean\n", 1007 | "2 euclidean\n", 1008 | "3 euclidean\n" 1009 | ] 1010 | } 1011 | ], 1012 | "source": [ 1013 | "# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier\n", 1014 | "start = datetime.now()\n", 1015 | "print(start)\n", 1016 | "\n", 1017 | "train_accuracy_euc = []\n", 1018 | "test_accuracy_euc = []\n", 1019 | "train_accuracy_man = []\n", 1020 | "test_accuracy_man = []\n", 1021 | "\n", 1022 | "# Loop through different values of k\n", 1023 | "n_neighbors = list(np.arange(1,6,1))\n", 1024 | "print(n_neighbors)\n", 1025 | "\n", 1026 | "for m in ['euclidean', 'manhattan']:\n", 1027 | " for k in n_neighbors:\n", 1028 | " print(k,m)\n", 1029 | " \n", 1030 | " knn = KNeighborsClassifier(n_neighbors=k, metric=m)\n", 1031 | " knn.fit(X_train,y_train) \n", 1032 | " \n", 1033 | " train_score = knn.score(X_train, y_train)\n", 1034 | " test_score = knn.score(X_test, y_test)\n", 1035 | " \n", 1036 | " if m=='euclidean':\n", 1037 | " 
train_accuracy_euc.append(train_score)\n", 1038 | " test_accuracy_euc.append(test_score)\n", 1039 | " else:\n", 1040 | " train_accuracy_man.append(train_score)\n", 1041 | " test_accuracy_man.append(test_score)\n", 1042 | " \n", 1043 | "end = datetime.now()\n", 1044 | "print (end-start)" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": null, 1050 | "id": "bb409c6f", 1051 | "metadata": {}, 1052 | "outputs": [], 1053 | "source": [ 1054 | "# https://matplotlib.org/3.5.0/api/_as_gen/matplotlib.axes.Axes.set_ylim.html\n", 1055 | "f = plt.figure(figsize=(10,3))\n", 1056 | "ax = f.add_subplot(121)\n", 1057 | "ax2 = f.add_subplot(122)\n", 1058 | "ax.plot(n_neighbors, train_accuracy_euc, label='Euclidean')\n", 1059 | "ax.plot(n_neighbors, train_accuracy_man, label='Manhattan')\n", 1060 | "ax2.plot(n_neighbors, test_accuracy_euc, label='Euclidean')\n", 1061 | "ax2.plot(n_neighbors, test_accuracy_man, label='Manhattan')\n", 1062 | "\n", 1063 | "ax.set_title('Training Set Accuracy')\n", 1064 | "ax2.set_title('Test Set Accuracy')\n", 1065 | "ax.set_xlabel('k value')\n", 1066 | "ax2.set_xlabel('k value')\n", 1067 | "ax.legend()\n", 1068 | "ax2.legend()\n", 1069 | "\n", 1070 | "f.savefig('KNN_Results.jpg')" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "id": "052450f3", 1076 | "metadata": {}, 1077 | "source": [ 1078 | "### Final KNN Model\n", 1079 | "knn = KNeighborsClassifier(n_neighbors=2, metric='manhattan')" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": 18, 1085 | "id": "0ee37e35", 1086 | "metadata": {}, 1087 | "outputs": [ 1088 | { 1089 | "name": "stdout", 1090 | "output_type": "stream", 1091 | "text": [ 1092 | "Training Accuracy: 0.9225987860739523\n", 1093 | "Test Accuracy: 0.8502403993289555\n" 1094 | ] 1095 | } 1096 | ], 1097 | "source": [ 1098 | "# Fitting the KNN model\n", 1099 | "knn = KNeighborsClassifier(n_neighbors=2, metric='manhattan')\n", 1100 | "knn.fit(X_train, 
y_train)\n", 1101 | "knn_score_train = knn.score(X_train, y_train)\n", 1102 | "knn_score_test = knn.score(X_test, y_test)\n", 1103 | "\n", 1104 | "print('Training Accuracy: ' + str(knn_score_train))\n", 1105 | "print('Test Accuracy: ' + str(knn_score_test))" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "markdown", 1110 | "id": "36824d7e", 1111 | "metadata": {}, 1112 | "source": [ 1113 | "### Multinomial Logistic Regression\n", 1114 | "* The `newton-cg` solver took 15 hours to run, so it is left out of the solver comparison below." 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": 16, 1120 | "id": "eaee3fa3", 1121 | "metadata": {}, 1122 | "outputs": [ 1123 | { 1124 | "name": "stdout", 1125 | "output_type": "stream", 1126 | "text": [ 1127 | "2022-04-24 15:04:48.812556\n", 1128 | "lbfgs\n", 1129 | "sag\n", 1130 | "saga\n", 1131 | "0:10:47.379437\n" 1132 | ] 1133 | } 1134 | ], 1135 | "source": [ 1136 | "# https://machinelearningmastery.com/multinomial-logistic-regression-with-python/\n", 1137 | "# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n", 1138 | "# https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-definitions\n", 1139 | "\n", 1140 | "start = datetime.now()\n", 1141 | "print(start)\n", 1142 | "\n", 1143 | "solver = ['lbfgs', 'sag', 'saga']\n", 1144 | "lr_train_accuracy = []\n", 1145 | "lr_test_accuracy = []\n", 1146 | "\n", 1147 | "for s in solver:\n", 1148 | " print(s)\n", 1149 | "\n", 1150 | " lr_model = LogisticRegression(multi_class='multinomial', max_iter=200, solver=s)\n", 1151 | " lr_model.fit(X_train, y_train)\n", 1152 | " lr_score_train = lr_model.score(X_train, y_train)\n", 1153 | " lr_score_test = lr_model.score(X_test, y_test)\n", 1154 | " lr_train_accuracy.append(lr_score_train)\n", 1155 | " lr_test_accuracy.append(lr_score_test)\n", 1156 | " \n", 1157 | "end = datetime.now()\n", 1158 | "print (end-start)" 1159 | ] 1160 | }, 1161 | { 1162 | "cell_type": "code", 1163 | "execution_count": 37, 1164 | 
"id": "90046016", 1165 | "metadata": { 1166 | "scrolled": true 1167 | }, 1168 | "outputs": [ 1169 | { 1170 | "data": { 1171 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlMAAADgCAYAAAAngOMaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAczUlEQVR4nO3de5hddX3v8feHBEQUSTGxSAgGNVUjCkdHLK1WLHgkVp9gpRVUwNvJgZbeTtsjtqce7e3Uqm2twompoihW1FptWkOh9YjWKprQpshFbEQlEdFwF+Ri4Hv+WGtkO+zJ7Jm198wkeb+eZz9Zl99e67dXZj7zXb+19t6pKiRJkjQze811ByRJknZlFlOSJEkdWExJkiR1YDElSZLUgcWUJElSBxZTkiRJHVhM7cKSXJjktGG3lSRJg7OYmmVJ7uh53J/krp75l09nW1W1qqrOG3bb6UryO0m+3r6GbUk+PODzXpnkcwO2fV+SHUkO7tZbSbNlmHnXbu+SJK+dos1rknwlyfeSfCfJJ5PsP8C2j0mybcB+vDFJJTlq0L5r92YxNcuq6uHjD+A64EU9yz443i7Jwrnr5eDa0a5TgOPa1zQGfGrI+3gY8BLgNmDaAdxx37vE/4M0Hw2ad8OS5DnAHwMnV9X+wJOAjwx5H6HJvJuBWR3tN4/mL4upeWL8rCjJ65LcALw3yY8l+Yck25Pc0k4f0vOcH56ljY/yJHlr2/brSVbNsO1hST7bntn9c5Kzk5w/SdefAVxUVV8DqKobqmpdz7YOSPKeJN9O8q0kf5hkQZInAWuBo9uz1Ft3cnheAtwK/D4TwivJgUnem+T69rV8omfd6iSbk9ye5GtJjm+XfyPJcT3t3jj++pIsb884X5PkOuD/tcs/muSGJLe1x+bJPc9/aJK3Jflmu/5z7bJPJvmVCf29PMkJO3mt0m4vyV5Jzmp/L29K8pEkB7br9k1yfrv81iQbk/x4kj8Cng28s82Md/bZ9DOAL1TVvwNU1c1VdV5Vfa/d9kPa3LuuHbVa2/6uPgy4EDi4Z+RsslHwZwMHA78GnJRkn57X1TcL2nXPSvL59jVtTfLKdvmPjLZlwoh9m0e/nOQ/gf9sl7293cbtSS5L8uye9gvSXC34WpvhlyVZ1ub42yb8P/x9kl8f4L9MU7CYml8OAg4EHgOsofn/eW87fyhwF9AvQMY9E7gGWAz8KfCeJJlB278GvgQ8EngjzVnYZC4FTk3y20nGkiyYsP48YAfweOC/AP8VeG1VXQ2cThN8D6+qRTvZx2nAh4ALgCcmeVrPug8A+wFPBh4F/DlAmuH39wO/DSwCfgb4xk72MdFzaM5qn9/OXwisaPfxb0DvWfVbgacDP0Xz//c/gfvb1/6K8UZJjgCWAhum0Q9pd/SrwAk0v2cHA7cAZ7frTgMOAJbRZNDpwF1V9bvAvwBntplxZp/tfhF4fpI3JfnpJA+ZsP7NwE8AR9Jk0lLgDVV1J7AKuL5n5Oz6Sfp+GvD3wPjtDC/sWdc3C5IcSpMh7wCWtPvfPNnB6eMEmsxe2c5vbLdxIE1efzTJvu26/wGcDLwAeATwauD7NHl0cpK9AJIsBo6lyVZ1VVU+5uhB88f9uHb6GOBeYN+dtD8SuKVn/hKawgTglcCWnnX7AQUcNJ22NEXbDmC/nvXnA+fvpF8vB/4ZuBO4CTirXf7jwD3AQ3vangx8uqcfn5viGB1KU5gc2c5fBLy9nX50u+7H+jzvXcCfT3Xc2/k3jr8+YHl7LB67kz4tatscQFPw3gUc0afdQ2guBaxo598KnDPXP3c+fMzFY0LeXQ0c27Pu0cAPgIU0f/w/Dzy1zzZ+mGM72c8qmmLnVuAO
4M+ABUDajHpcT9ujga+308cA26bY9n7A7cAJ7fy7gL9rp3eWBa8HPj7JNn/kNU3MxTZrfnaKft0yvl+ak+TVk7S7GnheO30msGGufy52l4cjU/PL9qq6e3wmyX5J3tUOGd8OfBZY1Gf0Z9wN4xNV9f128uHTbHswcHPPMoCtO+t0VX2wqo6jKTJOB34/yfNpRtT2Br7dDm3fShM+j9rZ9iY4Bbi6qja38x8EXpZkb5oz15ur6pY+z1sGfG0a+5noh6+5HTb/k3bY/HYeGOFa3D727bevqrqH5n6NV7RngyfTjKRJe7rHAB/vyYWrgftoTsA+QHPSdEGay/d/2v6+D6SqLqyqF9GM2qymKU5eSzMitB9wWc9+/7FdPqgX05xsjo8ufxBYlWQJO8kChphHAEl+M8nV7aXEW2lO7BYPsK/e0fJXYB4NjcXU/FIT5n8TeALwzKp6BM2lKmjOsEbl28CBSfbrWbZskCdW1Q+q6qPA5cDhNAFwD7C4qha1j0dU1fj9RhNfbz+nAo9t71e6geYsczHN2efWtq+L+jxvK/C4SbZ5J02ojjuo38vpmX4ZTSgfRxNay9vlAW4E7t7Jvs6jGbk7Fvh+VX1hknbSnmQrsKonFxZV1b5V9a02R95UVStpLpe9kCYHYLDMaBpW3V9Vn6K57/Fwmt/Vu4An9+zzgGpujh9026fRnHRe1+bRR2lOGE9m51kwtDxq7496HfCLNKPyi2jenDP+d2Fn+zofWN3ecvAk4BOTtNM0WUzNb/vT/PLf2t6c+b9HvcOq+iawCXhjkn2SHA28aLL27c2SP5dk//am0lU09y99saq+DVwMvC3JI9r1j0vzjhuA7wCH9N7AOWHbR9OEwlE0lziPpAnFvwZOa7d/IXBOmpv1904yXnC+B3hVkmPb/S5N8sR23WaaG0f3TjIGnDjFYdmfpii8iSb0/rjneN0PnAv8WZKD21Gso8fv1WiLp/uBt+FZoDRuLfBHSR4DkGRJktXt9HOTPKUdgb+d5vLffe3zvgM8drKNpnnTyUltHqS9d/I5wKXt7+pfAX+e5FFt+6XtKPr4th+Z5IBJtr2U5qTohTyQR0fQ3Id12hRZ8EHguCS/mGRhkkcmObLd9Gbg59srEY8HXjPFsdufZnRsO7AwyRto7o0a927gD5KsaI/BU5M8EqCqttHcb/UB4GNVddcU+9KALKbmt78AHkpzxnMpzZD0bHg5zb0ENwF/SHOj5T2TtL0d+B2atz3fSnMz+xlVNf5ulFOBfYCraK7r/w3N/RHQnDFeCdyQ5MY+2z6N5n6EL1fzLsEbquoG4O3AC9sC8xSasP0K8F3g1wGq6kvAq2huSL8N+AzNpQWA36Mp0m4B3kRTnO3M+4FvAt9qX8elE9b/FvBlmpC6mSZc95rw/KfQnBVKan6H1wMXJ/keze/UM9t1B9HkxO00l/8+wwO/O28HTkzzzt2/7LPdW4D/RvOut9vb572lHvgYhtcBW4BL20v2/0wz+k9VfYXmZuxr28uAE9/NdwqwuaounpBHfwk8NcnhTJIFVXUdzQ3hv9ku30xTiEGTUffSFHPn8aNvbunnIpqTyK/S5NLd/OhlwD+jub3g4vYYvIfm78i482jyyJO7IUrVwKOm2kOl+RDOr1TVyEfGdkdJTgXWVNWz5rovkvZs7ej9+cDydjRNQ+DIlB4kyTPay3F7pflsptV4bX1G2nvPfglYN1VbSRql9kb+XwPebSE1XBZT6ucgmrfr3kEzhH1GtR+Cp8G192Jspxm+n+pSoiSNTJoPSr6V5jaLv5jTzuyGvMwnSZLUgSNTkiRJHVhMSZIkdTBn30C9ePHiWr58+VztXtIcuOyyy26squl84vS8ZYZJe5ad5decFVPLly9n06ZNc7V7SXMgyTfnug/DYoZJe5ad5ZeX+SRJkjqwmJIkSerAYkqSJKkDiylJkqQOLKYkSZI6mLN3803XFVdcMdddmBcOP/zwTs9fftYnh9STXds3
/uTnOj3fn8dG159HaTrMr0bX/AIzbNywMmyXKaYkaVflH66Gxbd2V17mkyRJ6sBiSpIkqYOBiqkkxye5JsmWJGf1Wf/bSTa3jyuS3JfkwOF3V5Kmx/ySNGpTFlNJFgBnA6uAlcDJSVb2tqmqt1TVkVV1JPB64DNVdfMI+itJAzO/JM2GQUamjgK2VNW1VXUvcAGweiftTwY+NIzOSVJH5pekkRukmFoKbO2Z39Yue5Ak+wHHAx+bZP2aJJuSbNq+fft0+ypJ0zW0/JKkyQxSTKXPspqk7YuAf51siLyq1lXVWFWNLVmyZNA+StJMDS2/wBNCSf0NUkxtA5b1zB8CXD9J25NwiFzS/DHU/PKEUFI/gxRTG4EVSQ5Lsg9N4Kyf2CjJAcBzgL8bbhclacbML0kjN+UnoFfVjiRnAhcBC4Bzq+rKJKe369e2TV8MXFxVd46st5I0DeaXpNkw0NfJVNUGYMOEZWsnzL8PeN+wOiZJw2B+SRo1PwFdkiSpA4spSZKkDiymJEmSOrCYkiRJ6sBiSpIkqQOLKUmSpA4spiRJkjqwmJIkSerAYkqSJKkDiylJkqQOLKYkSZI6sJiSJEnqYKBiKsnxSa5JsiXJWZO0OSbJ5iRXJvnMcLspSTNjfkkatYVTNUiyADgbeB6wDdiYZH1VXdXTZhFwDnB8VV2X5FEj6q8kDcz8kjQbBhmZOgrYUlXXVtW9wAXA6gltXgb8bVVdB1BV3x1uNyVpRswvSSM3SDG1FNjaM7+tXdbrJ4AfS3JJksuSnDqsDkpSB+aXpJGb8jIfkD7Lqs92ng4cCzwU+EKSS6vqqz+yoWQNsAbg0EMPnX5vJWl6hpZfYIZJ6m+QkaltwLKe+UOA6/u0+cequrOqbgQ+CxwxcUNVta6qxqpqbMmSJTPtsyQNamj5BWaYpP4GKaY2AiuSHJZkH+AkYP2ENn8HPDvJwiT7Ac8Erh5uVyVp2swvSSM35WW+qtqR5EzgImABcG5VXZnk9Hb92qq6Osk/ApcD9wPvrqorRtlxSZqK+SVpNgxyzxRVtQHYMGHZ2gnzbwHeMryuSVJ35pekUfMT0CVJkjqwmJIkSerAYkqSJKkDiylJkqQOLKYkSZI6sJiSJEnqwGJKkiSpA4spSZKkDiymJEmSOrCYkiRJ6sBiSpIkqQOLKUmSpA4GKqaSHJ/kmiRbkpzVZ/0xSW5Lsrl9vGH4XZWk6TO/JI3awqkaJFkAnA08D9gGbEyyvqqumtD0X6rqhSPooyTNiPklaTYMMjJ1FLClqq6tqnuBC4DVo+2WJA2F+SVp5AYpppYCW3vmt7XLJjo6yX8kuTDJk4fSO0nqxvySNHJTXuYD0mdZTZj/N+AxVXVHkhcAnwBWPGhDyRpgDcChhx46vZ5K0vQNLb/ADJPU3yAjU9uAZT3zhwDX9zaoqtur6o52egOwd5LFEzdUVeuqaqyqxpYsWdKh25I0kKHlV7veDJP0IIMUUxuBFUkOS7IPcBKwvrdBkoOSpJ0+qt3uTcPurCRNk/klaeSmvMxXVTuSnAlcBCwAzq2qK5Oc3q5fC5wInJFkB3AXcFJVTRxKl6RZZX5Jmg2D3DM1PvS9YcKytT3T7wTeOdyuSVJ35pekUfMT0CVJkjqwmJIkSerAYkqSJKkDiylJkqQOLKYkSZI6sJiSJEnqwGJKkiSpA4spSZKkDiymJEmSOrCYkiRJ6sBiSpIkqQOLKUmSpA4spiRJkjoYqJhKcnySa5JsSXLWTto9I8l9SU4cXhclaebML0mjNmUxlWQBcDawClgJnJxk5STt3gxcNOxOStJMmF+SZsMgI1NHAVuq6tqquhe4AFjdp92vAB8DvjvE/klSF+aXpJEbpJhaCmztmd/WLvuhJEuBFwNrd7ahJGuSbEqyafv27dPtqyRN19Dyq21rhkl6kEGKqfRZVhPm/wJ4XVXdt7MNVdW6qhqrqrElS5YM2EVJmrGh5ReYYZL6WzhAm23Asp75Q4DrJ7QZ
Ay5IArAYeEGSHVX1iWF0UpJmyPySNHKDFFMbgRVJDgO+BZwEvKy3QVUdNj6d5H3APxhEkuYB80vSyE1ZTFXVjiRn0rzLZQFwblVdmeT0dv2U9xlI0lwwvyTNhkFGpqiqDcCGCcv6hlBVvbJ7tyRpOMwvSaPmJ6BLkiR1YDElSZLUgcWUJElSBxZTkiRJHVhMSZIkdWAxJUmS1IHFlCRJUgcWU5IkSR1YTEmSJHVgMSVJktSBxZQkSVIHFlOSJEkdDFRMJTk+yTVJtiQ5q8/61UkuT7I5yaYkzxp+VyVp+swvSaO2cKoGSRYAZwPPA7YBG5Osr6qrepp9ClhfVZXkqcBHgCeOosOSNCjzS9JsGGRk6ihgS1VdW1X3AhcAq3sbVNUdVVXt7MOAQpLmnvklaeQGKaaWAlt75re1y35Ekhcn+QrwSeDV/TaUZE07jL5p+/btM+mvJE3H0PKrbWeGSXqQQYqp9Fn2oDO3qvp4VT0ROAH4g34bqqp1VTVWVWNLliyZVkclaQaGll9tOzNM0oMMUkxtA5b1zB8CXD9Z46r6LPC4JIs79k2SujK/JI3cIMXURmBFksOS7AOcBKzvbZDk8UnSTj8N2Ae4adidlaRpMr8kjdyU7+arqh1JzgQuAhYA51bVlUlOb9evBV4CnJrkB8BdwEt7buiUpDlhfkmaDVMWUwBVtQHYMGHZ2p7pNwNvHm7XJKk780vSqPkJ6JIkSR1YTEmSJHVgMSVJktSBxZQkSVIHFlOSJEkdWExJkiR1YDElSZLUgcWUJElSBxZTkiRJHVhMSZIkdWAxJUmS1IHFlCRJUgcDFVNJjk9yTZItSc7qs/7lSS5vH59PcsTwuypJ02d+SRq1KYupJAuAs4FVwErg5CQrJzT7OvCcqnoq8AfAumF3VJKmy/ySNBsGGZk6CthSVddW1b3ABcDq3gZV9fmquqWdvRQ4ZLjdlKQZMb8kjdwgxdRSYGvP/LZ22WReA1zYb0WSNUk2Jdm0ffv2wXspSTMztPwCM0xSf4MUU+mzrPo2TJ5LE0av67e+qtZV1VhVjS1ZsmTwXkrSzAwtv8AMk9TfwgHabAOW9cwfAlw/sVGSpwLvBlZV1U3D6Z4kdWJ+SRq5QUamNgIrkhyWZB/gJGB9b4MkhwJ/C5xSVV8dfjclaUbML0kjN+XIVFXtSHImcBGwADi3qq5Mcnq7fi3wBuCRwDlJAHZU1djoui1JUzO/JM2GQS7zUVUbgA0Tlq3tmX4t8Nrhdk2SujO/JI2an4AuSZLUgcWUJElSBxZTkiRJHVhMSZIkdWAxJUmS1IHFlCRJUgcWU5IkSR1YTEmSJHVgMSVJktSBxZQkSVIHFlOSJEkdWExJkiR1MFAxleT4JNck2ZLkrD7rn5jkC0nuSfJbw++mJM2M+SVp1BZO1SDJAuBs4HnANmBjkvVVdVVPs5uBXwVOGEUnJWkmzC9Js2GQkamjgC1VdW1V3QtcAKzubVBV362qjcAPRtBHSZop80vSyA1STC0FtvbMb2uXSdJ8Z35JGrlBiqn0WVYz2VmSNUk2Jdm0ffv2mWxCkqZjaPkFZpik/gYpprYBy3rmDwGun8nOqmpdVY1V1diSJUtmsglJmo6h5ReYYZL6G6SY2gisSHJYkn2Ak4D1o+2WJA2F+SVp5KZ8N19V7UhyJnARsAA4t6quTHJ6u35tkoOATcAjgPuT/DqwsqpuH13XJWnnzC9Js2HKYgqgqjYAGyYsW9szfQPN8LkkzSvml6RR8xPQJUmSOrCYkiRJ6sBiSpIkqQOLKUmSpA4spiRJkjqwmJIkSerAYkqSJKkDiylJkqQOLKYkSZI6sJiSJEnqwGJKkiSpA4spSZKkDgYqppIcn+SaJFuSnNVnfZL8Zbv+8iRPG35XJWn6zC9JozZlMZVkAXA2sApYCZycZOWEZquAFe1jDfB/h9xPSZo280vSbBhkZOooYEtVXVtV9wIXAKsntFkNvL8alwKLkjx6yH2VpOky
vySN3CDF1FJga8/8tnbZdNtI0mwzvySN3MIB2qTPsppBG5KsoRlGB7gjyTUD7H8+WQzcONed2A3M+XHMm+dy70Mz58dxBh4zy/sbWn7BLp9hu+LPy3w058dxN8kvmAfHcpomza9BiqltwLKe+UOA62fQhqpaB6wbYJ/zUpJNVTU21/3Y1Xkch8PjOJCh5Rfs2hnmz8tweByHZ3c6loNc5tsIrEhyWJJ9gJOA9RParAdObd8V85PAbVX17SH3VZKmy/ySNHJTjkxV1Y4kZwIXAQuAc6vqyiSnt+vXAhuAFwBbgO8DrxpdlyVpMOaXpNmQqr63BqiPJGvaYX514HEcDo+jpsOfl+HwOA7P7nQsLaYkSZI68OtkJEmSOtiji6kkd7T/HpPkHyZp8wtJrk7y6dntnSTtnBkmzQ97dDE1oNcAv1RVz53rjkjSDJhh0ohZTD3gEUk+nuSqJGuT7JXkDcCzgLVJ3pJkvyQfab8M9cNJvphkLMmCJO9LckWSLyf5jbl+MXMpycOSfDLJf7TH5KVJ3pBkYzu/Lknats9oj+cX2mN8xVz3fy4N49glWZ7kX5L8W/v4qbl9VZolZtgQmF8zt0fnV1XtsQ/gjvbfY4C7gcfSvH36n4AT23WXAGPt9G8B72qnDwd2AGPA04F/6tnuorl+bXN8XF8C/FXP/AHAgT3zHwBe1E5fAfxUO/0nwBVz3f9d/dgB+wH7ttMrgE1z/bp8jOznxQwb/jE1v+bw2O2q+eXI1AO+VM2Xod4HfIjmbG6iZ9F8USpVdQVwebv8WuCxSd6R5Hjg9tno8Dz2ZeC4JG9O8uyqug14bnsW/GXgZ4EnJ1kE7F9Vn2+f99dz1N/5ZBjHbm/gr9r2HwVWzmL/NXfMsOEwv2Zuj82vQb5OZk8x8TMi+n1mRL/v8KKqbklyBPB84JeBXwRePdzu7Tqq6qtJnk7zQYj/J8nFNMdlrKq2JnkjsC+THM892ZCO3W8A3wGOoLmUf/doe615wgwbAvNr5vbk/HJk6gFHpfnKib2AlwKf69PmczQhQ5KVwFPa6cXAXlX1MeD3gKfNTpfnpyQHA9+vqvOBt/LA8bgxycOBE6EJcOB7ab7CA5qv+tijDenYHQB8u6ruB06hueyj3Z8ZNgTm18ztyfnlyNQDvkBz3fYpwGeBj/dpcw5wXpLLgX+nGSK/DVgKvLcNMYDXj76789pTgLckuR/4AXAGcALNEPA3aL4vbdxraIZ076S5t+O22ezoPDSMY3cO8LEkvwB8GrhzNjquOWeGDYf5NXN7bH75CejTkGQBsHdV3Z3kccCngJ+oqnvnuGu7rCQPr6rxz8o5C3h0Vf3aHHdrl+Cx03SZYcPl7+DM7W7HzpGp6dkP+HSSvWmu+Z5hCHX2c0leT/Oz+E3glXPbnV2Kx07TZYYNl7+DM7dbHTtHpiRJkjrwBnRJkqQOLKYkSZI6sJiSJEnqwGJKM5bkd5Nc2X6/0uYkz9xJ20uSjM1m/yRpMuaXhsl382lGkhwNvBB4WlXd037o3z6zsN8F7ddlSNKMmF8aNkemNFOPBm6sqnsAqurGqro+ybFJ/j3NN8+fm+QhvU9KckaSP+2Zf2WSd7TTr0jypfYs8V3tZ+KQ5I4kv5/ki8DRs/cSJe2mzC8NlcWUZupiYFmSryY5J8lzkuwLvA94aVU9hWbk84wJz/sb4Od75l8KfDjJk9rpn66qI4H7gJe3bR5G843iz6yqfl+RIUnTYX5pqCymNCPtJ9c+HVgDbAc+DPx34OtV9dW22XnAz0x43nbg2iQ/meSRwBOAfwWObbe3Mcnmdv6x7dPuAz420hckaY9hfmnYvGdKM9Ze+78EuCTJl4HTBnzqh2m+bPUrwMerqpIEOK+q+n0n2N3eZyBpmMwvDZMjU5qRJE9IsqJn0ZHAd4DlSR7fLjsF+Eyfp/8tzZdfnkwTTNB8R9iJSR7Vbv/AJI8ZQdcl7eHMLw2bI1OaqYcD70iyCNgBbKEZMv8Q
8NEkC2m+IXztxCdW1S1JrgJWVtWX2mVXJflfwMXtN9f/APhlmu9skqRhMr80VH43nyRJUgde5pMkSerAYkqSJKkDiylJkqQOLKYkSZI6sJiSJEnqwGJKkiSpA4spSZKkDiymJEmSOvj/OiH5iTFQHyAAAAAASUVORK5CYII=\n", 1172 | "text/plain": [ 1173 | "
" 1174 | ] 1175 | }, 1176 | "metadata": { 1177 | "needs_background": "light" 1178 | }, 1179 | "output_type": "display_data" 1180 | } 1181 | ], 1182 | "source": [ 1183 | "f = plt.figure(figsize=(10,3))\n", 1184 | "ax = f.add_subplot(121)\n", 1185 | "ax2 = f.add_subplot(122)\n", 1186 | "\n", 1187 | "clrs_train = ['lightgrey' if (x < max(lr_train_accuracy)) else 'tab:blue' for x in lr_train_accuracy]\n", 1188 | "clrs_test = ['lightgrey' if (x < max(lr_test_accuracy)) else 'tab:blue' for x in lr_test_accuracy]\n", 1189 | "\n", 1190 | "ax.bar(solver, lr_train_accuracy, color=clrs_train)\n", 1191 | "ax2.bar(solver, lr_test_accuracy, color=clrs_test)\n", 1192 | "\n", 1193 | "ax.set_title('Training Set Accuracy')\n", 1194 | "ax2.set_title('Test Set Accuracy')\n", 1195 | "ax.set_xlabel('Solver')\n", 1196 | "ax2.set_xlabel('Solver')\n", 1197 | "\n", 1198 | "f.savefig('LogReg_Results.jpg')" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "markdown", 1203 | "id": "24a6ce86", 1204 | "metadata": {}, 1205 | "source": [ 1206 | "### Final Logistic Regression Model\n", 1207 | "LogisticRegression(multi_class='multinomial', max_iter=200, solver='sag')" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": 19, 1213 | "id": "8ee6534f", 1214 | "metadata": {}, 1215 | "outputs": [ 1216 | { 1217 | "name": "stdout", 1218 | "output_type": "stream", 1219 | "text": [ 1220 | "Training Accuracy: 0.7329500101805899\n", 1221 | "Test Accuracy: 0.7333547354732975\n" 1222 | ] 1223 | } 1224 | ], 1225 | "source": [ 1226 | "# Fitting the LR model\n", 1227 | "lr_model = LogisticRegression(multi_class='multinomial', max_iter=200, solver='sag')\n", 1228 | "lr_model.fit(X_train, y_train)\n", 1229 | "lr_score_train = lr_model.score(X_train, y_train)\n", 1230 | "lr_score_test = lr_model.score(X_test, y_test)\n", 1231 | "\n", 1232 | "print('Training Accuracy: ' + str(lr_score_train))\n", 1233 | "print('Test Accuracy: ' + str(lr_score_test))" 1234 | ] 1235 | }, 1236 | { 1237 | 
"cell_type": "markdown", 1238 | "id": "de948e5b", 1239 | "metadata": {}, 1240 | "source": [ 1241 | "### Random Forest" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": 21, 1247 | "id": "f57c8b71", 1248 | "metadata": {}, 1249 | "outputs": [ 1250 | { 1251 | "name": "stdout", 1252 | "output_type": "stream", 1253 | "text": [ 1254 | "2022-04-28 21:35:40.224612\n", 1255 | "0:08:27.791727\n" 1256 | ] 1257 | } 1258 | ], 1259 | "source": [ 1260 | "from sklearn.ensemble import RandomForestClassifier\n", 1261 | "# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html\n", 1262 | "\n", 1263 | "start = datetime.now()\n", 1264 | "print(start)\n", 1265 | "\n", 1266 | "rf_clf = RandomForestClassifier(min_samples_leaf = 100, random_state=0)\n", 1267 | "rf_clf.fit(X_train, y_train)\n", 1268 | "\n", 1269 | "end = datetime.now()\n", 1270 | "print (end-start)" 1271 | ] 1272 | }, 1273 | { 1274 | "cell_type": "code", 1275 | "execution_count": 23, 1276 | "id": "55f3bbe5", 1277 | "metadata": { 1278 | "scrolled": true 1279 | }, 1280 | "outputs": [ 1281 | { 1282 | "name": "stdout", 1283 | "output_type": "stream", 1284 | "text": [ 1285 | "Training Accuracy: 0.8530909645078403\n", 1286 | "Test Accuracy: 0.8507488042491159\n" 1287 | ] 1288 | } 1289 | ], 1290 | "source": [ 1291 | "rf_train = rf_clf.score(X_train, y_train)\n", 1292 | "rf_test = rf_clf.score(X_test, y_test)\n", 1293 | "\n", 1294 | "print('Training Accuracy: ' + str(rf_train))\n", 1295 | "print('Test Accuracy: ' + str(rf_test))" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "markdown", 1300 | "id": "8ba4aaab", 1301 | "metadata": {}, 1302 | "source": [ 1303 | "**Random Forest - Feature Importance**" 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "execution_count": 51, 1309 | "id": "2cb90b7c", 1310 | "metadata": {}, 1311 | "outputs": [ 1312 | { 1313 | "data": { 1314 | "image/png": "\n", 1315 | "text/plain": [ 1316 | "
" 1317 | ] 1318 | }, 1319 | "metadata": { 1320 | "needs_background": "light" 1321 | }, 1322 | "output_type": "display_data" 1323 | } 1324 | ], 1325 | "source": [ 1326 | "sorted_idx = rf_clf.feature_importances_.argsort()\n", 1327 | "plt.barh(X_train.columns[sorted_idx], rf_clf.feature_importances_[sorted_idx])\n", 1328 | "plt.xlabel(\"Random Forest Feature Importance\")\n", 1329 | "plt.savefig('RandomForest_FeatureImportance.jpg')" 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "code", 1334 | "execution_count": null, 1335 | "id": "b3f877f6", 1336 | "metadata": {}, 1337 | "outputs": [], 1338 | "source": [ 1339 | "# import shap\n", 1340 | "# explainer = shap.TreeExplainer(rf_clf)\n", 1341 | "# shap_values = explainer.shap_values(X_test)\n", 1342 | "# shap.summary_plot(shap_values, X_test, plot_type=\"bar\")" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "markdown", 1347 | "id": "558f2575", 1348 | "metadata": {}, 1349 | "source": [ 1350 | "### Neural Network" 1351 | ] 1352 | }, 1353 | { 1354 | "cell_type": "code", 1355 | "execution_count": 26, 1356 | "id": "56d3fc68", 1357 | "metadata": { 1358 | "scrolled": false 1359 | }, 1360 | "outputs": [ 1361 | { 1362 | "name": "stdout", 1363 | "output_type": "stream", 1364 | "text": [ 1365 | "2022-04-24 21:15:36.813732\n", 1366 | "4:08:42.773813\n", 1367 | "{'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (150, 100, 50), 'max_iter': 200}\n" 1368 | ] 1369 | } 1370 | ], 1371 | "source": [ 1372 | "# https://michael-fuchs-python.netlify.app/2021/02/03/nn-multi-layer-perceptron-classifier-mlpclassifier/#mlpclassifier-for-multi-class-classification\n", 1373 | " \n", 1374 | "param_grid = { 'hidden_layer_sizes': [(150,100,50), (100,50,30)],\n", 1375 | " 'max_iter': [200],\n", 1376 | " 'activation': ['logistic', 'relu'],\n", 1377 | " 'alpha': [0.0001, 0.05] }\n", 1378 | "\n", 1379 | "start = datetime.now()\n", 1380 | "print(start)\n", 1381 | "\n", 1382 | "grid = GridSearchCV(MLPClassifier(), param_grid, n_jobs= -1, 
cv=5)\n", 1383 | "grid.fit(X_train, y_train['PROPERTY_STYLE'].ravel())\n", 1384 | "\n", 1385 | "end = datetime.now()\n", 1386 | "print (end-start)\n", 1387 | "\n", 1388 | "print(grid.best_params_)" 1389 | ] 1390 | }, 1391 | { 1392 | "cell_type": "code", 1393 | "execution_count": 22, 1394 | "id": "74d28f46", 1395 | "metadata": {}, 1396 | "outputs": [ 1397 | { 1398 | "name": "stdout", 1399 | "output_type": "stream", 1400 | "text": [ 1401 | "2022-04-28 21:44:08.152110\n", 1402 | "0:04:24.058111\n", 1403 | "Training Accuracy: 0.7282244676425876\n", 1404 | "Test Accuracy: 0.728732872562748\n" 1405 | ] 1406 | } 1407 | ], 1408 | "source": [ 1409 | "# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html\n", 1410 | "from sklearn.neural_network import MLPClassifier\n", 1411 | "\n", 1412 | "start = datetime.now()\n", 1413 | "print(start)\n", 1414 | "\n", 1415 | "nn_model = MLPClassifier(alpha=0.0001, hidden_layer_sizes =(150, 100, 50), activation='logistic')\n", 1416 | "nn_model.fit(X_train, y_train)\n", 1417 | "\n", 1418 | "end = datetime.now()\n", 1419 | "print (end-start)\n", 1420 | "\n", 1421 | "nn_train = nn_model.score(X_train, y_train)\n", 1422 | "nn_test = nn_model.score(X_test, y_test)\n", 1423 | "\n", 1424 | "print('Training Accuracy: ' + str(nn_train))\n", 1425 | "print('Test Accuracy: ' + str(nn_test))" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "id": "7d2e6890", 1431 | "metadata": {}, 1432 | "source": [ 1433 | "### Naive Bayes" 1434 | ] 1435 | }, 1436 | { 1437 | "cell_type": "code", 1438 | "execution_count": 27, 1439 | "id": "428cf536", 1440 | "metadata": {}, 1441 | "outputs": [ 1442 | { 1443 | "name": "stdout", 1444 | "output_type": "stream", 1445 | "text": [ 1446 | "2022-04-25 01:24:19.619733\n", 1447 | "Fitting 5 folds for each of 100 candidates, totalling 500 fits\n", 1448 | "0:07:32.545579\n", 1449 | "{'var_smoothing': 1e-09}\n" 1450 | ] 1451 | } 1452 | ], 1453 | "source": [ 1454 | "nb_classifier 
= GaussianNB()\n", 1455 | "\n", 1456 | "start = datetime.now()\n", 1457 | "print(start)\n", 1458 | "\n", 1459 | "params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}\n", 1460 | "gs_NB = GridSearchCV(estimator=nb_classifier, \n", 1461 | " param_grid=params_NB, \n", 1462 | " cv=5, # use any cross validation technique \n", 1463 | " verbose=1, \n", 1464 | " scoring='accuracy') \n", 1465 | "gs_NB.fit(X_train, y_train)\n", 1466 | "\n", 1467 | "end = datetime.now()\n", 1468 | "print (end-start)\n", 1469 | "\n", 1470 | "print(gs_NB.best_params_)" 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "execution_count": 20, 1476 | "id": "a9d4745e", 1477 | "metadata": {}, 1478 | "outputs": [ 1479 | { 1480 | "name": "stdout", 1481 | "output_type": "stream", 1482 | "text": [ 1483 | "2022-04-28 21:32:55.594175\n", 1484 | "0:00:00.862308\n", 1485 | "Training Accuracy: 0.7349811315569558\n", 1486 | "Test Accuracy: 0.7351660060733777\n" 1487 | ] 1488 | } 1489 | ], 1490 | "source": [ 1491 | "from sklearn.naive_bayes import GaussianNB\n", 1492 | "\n", 1493 | "start = datetime.now()\n", 1494 | "print(start)\n", 1495 | "\n", 1496 | "naive_bayes = GaussianNB(var_smoothing=1e-09)\n", 1497 | "naive_bayes.fit(X_train, y_train)\n", 1498 | "\n", 1499 | "end = datetime.now()\n", 1500 | "print (end-start)\n", 1501 | "\n", 1502 | "nb_score_train = naive_bayes.score(X_train, y_train)\n", 1503 | "nb_score_test = naive_bayes.score(X_test, y_test)\n", 1504 | "print('Training Accuracy: ' + str(nb_score_train))\n", 1505 | "print('Test Accuracy: ' + str(nb_score_test))" 1506 | ] 1507 | }, 1508 | { 1509 | "cell_type": "code", 1510 | "execution_count": 47, 1511 | "id": "c51a769a", 1512 | "metadata": {}, 1513 | "outputs": [ 1514 | { 1515 | "data": { 1516 | "text/plain": [ 1517 | "100" 1518 | ] 1519 | }, 1520 | "execution_count": 47, 1521 | "metadata": {}, 1522 | "output_type": "execute_result" 1523 | } 1524 | ], 1525 | "source": [ 1526 | "len(np.logspace(0,-9, num=100))" 1527 | ] 1528 | }, 
1529 | { 1530 | "cell_type": "markdown", 1531 | "id": "970b56f5", 1532 | "metadata": {}, 1533 | "source": [ 1534 | "# Final Model Selection\n", 1535 | "\n", 1536 | "KNN (k=2, manhattan)\n", 1537 | "\n", 1538 | "Logistic Regression\n", 1539 | "\n", 1540 | "Random Forest\n", 1541 | "\n", 1542 | "Neural Network\n", 1543 | "\n", 1544 | "Naive Bayes" 1545 | ] 1546 | }, 1547 | { 1548 | "cell_type": "markdown", 1549 | "id": "28a4c6ee", 1550 | "metadata": {}, 1551 | "source": [ 1552 | "# Use of Model on rlp_label" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": 25, 1558 | "id": "fb1d7cd0", 1559 | "metadata": { 1560 | "scrolled": true 1561 | }, 1562 | "outputs": [], 1563 | "source": [ 1564 | "X_rlp_label = rlp_label_model.iloc[:, 0:11]\n", 1565 | "y_rlp_label = rlp_label_model.iloc[:, 11:12]" 1566 | ] 1567 | }, 1568 | { 1569 | "cell_type": "code", 1570 | "execution_count": 26, 1571 | "id": "53200def", 1572 | "metadata": {}, 1573 | "outputs": [ 1574 | { 1575 | "name": "stdout", 1576 | "output_type": "stream", 1577 | "text": [ 1578 | "2022-04-28 21:55:35.452541\n", 1579 | "0:01:21.279238\n", 1580 | "KNN RLP Label Accuracy: 0.7260353839967832\n" 1581 | ] 1582 | } 1583 | ], 1584 | "source": [ 1585 | "# KNN Model\n", 1586 | "start = datetime.now()\n", 1587 | "print(start)\n", 1588 | "\n", 1589 | "knn = KNeighborsClassifier(n_neighbors=2, metric='manhattan')\n", 1590 | "knn.fit(X_train, y_train)\n", 1591 | "rlp_score_label = knn.score(X_rlp_label, y_rlp_label)\n", 1592 | "\n", 1593 | "end = datetime.now()\n", 1594 | "print (end-start)\n", 1595 | "\n", 1596 | "print('KNN RLP Label Accuracy: ' + str(rlp_score_label))" 1597 | ] 1598 | }, 1599 | { 1600 | "cell_type": "code", 1601 | "execution_count": 27, 1602 | "id": "9ba9cb82", 1603 | "metadata": { 1604 | "scrolled": true 1605 | }, 1606 | "outputs": [ 1607 | { 1608 | "name": "stdout", 1609 | "output_type": "stream", 1610 | "text": [ 1611 | "Random Forest RLP Label Accuracy: 0.8388017691998392\n" 
1612 | ] 1613 | } 1614 | ], 1615 | "source": [ 1616 | "# Random Forest Model\n", 1617 | "from sklearn.ensemble import RandomForestClassifier\n", 1618 | "\n", 1619 | "start = datetime.now()\n", 1620 | "print(start)\n", 1621 | "\n", 1622 | "rf_clf = RandomForestClassifier(min_samples_leaf = 100, random_state=0)\n", 1623 | "rf_clf.fit(X_train, y_train)\n", 1624 | "\n", 1625 | "rlp_score_label = rf_clf.score(X_rlp_label, y_rlp_label)\n", 1626 | "\n", 1627 | "end = datetime.now()\n", 1628 | "print (end-start)\n", 1629 | "\n", 1630 | "print('Random Forest RLP Label Accuracy: ' + str(rlp_score_label))" 1631 | ] 1632 | }, 1633 | { 1634 | "cell_type": "markdown", 1635 | "id": "435e99ff", 1636 | "metadata": {}, 1637 | "source": [ 1638 | "### Get Accuracy per Property Style" 1639 | ] 1640 | }, 1641 | { 1642 | "cell_type": "code", 1643 | "execution_count": 61, 1644 | "id": "fd5b5c4a", 1645 | "metadata": { 1646 | "scrolled": true 1647 | }, 1648 | "outputs": [ 1649 | { 1650 | "data": { 1651 | "text/html": [ 1652 | "
\n", 1653 | "\n", 1666 | "\n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " 
\n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | " \n", 1889 | " \n", 1890 | " \n", 1891 | " \n", 1892 | " \n", 1893 | " \n", 1894 | " \n", 1895 | " \n", 1896 | " \n", 1897 | " \n", 1898 | " \n", 1899 | "
SUITESTREET_NUMBERSTREET_NAMESTREET_TYPESTREET_DIRECTIONCITYPROVINCEPOSTAL_CODELATITUDELONGITUDEDISSEMINATION_AREAPROPERTY_STYLEPROPERTY_STYLE_PRED_RFPROPERTY_STYLE_PRED_KNNKNN_CORRECTRF_CORRECT
0-1000-100045.332199-75.7771993506129421100
3-1111-110142.758801-81.1968003534005211111
5-1220-120243.187401-79.5194023526082911111
7-1332-131352.259201-113.7969974808009211111
100442-142448.446701-123.5029985917043233311
...................................................
1230523133932920783-14842245249.210098-122.9629975915365543100
1230526-168193544-17444384749.600101-97.0302964602008111111
1230531-12623712-12441599449.795700-97.1948014611119011111
1230532116279118454-12342722749.259201-122.7370005915304044101
123053342338715788-125927667449.277699-122.8560035915396243300
\n", 1900 | "

696360 rows × 16 columns

\n", 1901 | "
" 1902 | ], 1903 | "text/plain": [ 1904 | " SUITE STREET_NUMBER STREET_NAME STREET_TYPE STREET_DIRECTION \\\n", 1905 | "0 -1 0 0 0 -1 \n", 1906 | "3 -1 1 1 1 -1 \n", 1907 | "5 -1 2 2 0 -1 \n", 1908 | "7 -1 3 3 2 -1 \n", 1909 | "10 0 4 4 2 -1 \n", 1910 | "... ... ... ... ... ... \n", 1911 | "1230523 133 9329 2078 3 -1 \n", 1912 | "1230526 -1 68 19354 4 -1 \n", 1913 | "1230531 -1 26 2371 2 -1 \n", 1914 | "1230532 116 2791 1845 4 -1 \n", 1915 | "1230533 423 3871 578 8 -1 \n", 1916 | "\n", 1917 | " CITY PROVINCE POSTAL_CODE LATITUDE LONGITUDE \\\n", 1918 | "0 0 0 0 45.332199 -75.777199 \n", 1919 | "3 1 0 1 42.758801 -81.196800 \n", 1920 | "5 2 0 2 43.187401 -79.519402 \n", 1921 | "7 3 1 3 52.259201 -113.796997 \n", 1922 | "10 4 2 4 48.446701 -123.502998 \n", 1923 | "... ... ... ... ... ... \n", 1924 | "1230523 484 2 2452 49.210098 -122.962997 \n", 1925 | "1230526 744 4 3847 49.600101 -97.030296 \n", 1926 | "1230531 24 4 15994 49.795700 -97.194801 \n", 1927 | "1230532 234 2 7227 49.259201 -122.737000 \n", 1928 | "1230533 259 2 76674 49.277699 -122.856003 \n", 1929 | "\n", 1930 | " DISSEMINATION_AREA PROPERTY_STYLE PROPERTY_STYLE_PRED_RF \\\n", 1931 | "0 35061294 2 1 \n", 1932 | "3 35340052 1 1 \n", 1933 | "5 35260829 1 1 \n", 1934 | "7 48080092 1 1 \n", 1935 | "10 59170432 3 3 \n", 1936 | "... ... ... ... \n", 1937 | "1230523 59153655 4 3 \n", 1938 | "1230526 46020081 1 1 \n", 1939 | "1230531 46111190 1 1 \n", 1940 | "1230532 59153040 4 4 \n", 1941 | "1230533 59153962 4 3 \n", 1942 | "\n", 1943 | " PROPERTY_STYLE_PRED_KNN KNN_CORRECT RF_CORRECT \n", 1944 | "0 1 0 0 \n", 1945 | "3 1 1 1 \n", 1946 | "5 1 1 1 \n", 1947 | "7 1 1 1 \n", 1948 | "10 3 1 1 \n", 1949 | "... ... ... ... 
\n", 1950 | "1230523 1 0 0 \n", 1951 | "1230526 1 1 1 \n", 1952 | "1230531 1 1 1 \n", 1953 | "1230532 1 0 1 \n", 1954 | "1230533 3 0 0 \n", 1955 | "\n", 1956 | "[696360 rows x 16 columns]" 1957 | ] 1958 | }, 1959 | "execution_count": 61, 1960 | "metadata": {}, 1961 | "output_type": "execute_result" 1962 | } 1963 | ], 1964 | "source": [ 1965 | "rlp_label_model['PROPERTY_STYLE_PRED_RF'] = rf_clf.predict(X_rlp_label)\n", 1966 | "rlp_label_model['PROPERTY_STYLE_PRED_KNN'] = knn.predict(X_rlp_label)" 1967 | ] 1968 | }, 1969 | { 1970 | "cell_type": "code", 1971 | "execution_count": 62, 1972 | "id": "c4562bd3", 1973 | "metadata": {}, 1974 | "outputs": [], 1975 | "source": [ 1976 | "rlp_label_model['KNN_CORRECT'] = np.where(rlp_label_model['PROPERTY_STYLE_PRED_KNN']==rlp_label_model['PROPERTY_STYLE'], 1,0)\n", 1977 | "rlp_label_model['RF_CORRECT'] = np.where(rlp_label_model['PROPERTY_STYLE_PRED_RF']==rlp_label_model['PROPERTY_STYLE'], 1,0)" 1978 | ] 1979 | }, 1980 | { 1981 | "cell_type": "code", 1982 | "execution_count": 65, 1983 | "id": "51389845", 1984 | "metadata": {}, 1985 | "outputs": [ 1986 | { 1987 | "data": { 1988 | "text/html": [ 1989 | "
\n", 1990 | "\n", 2007 | "\n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | "
sumcount
KNN_CORRECTKNN_CORRECT
PROPERTY_STYLE
1469245511076
2149427778
33055794345
4423956801
5213044
6263307
709
\n", 2063 | "
" 2064 | ], 2065 | "text/plain": [ 2066 | " sum count\n", 2067 | " KNN_CORRECT KNN_CORRECT\n", 2068 | "PROPERTY_STYLE \n", 2069 | "1 469245 511076\n", 2070 | "2 1494 27778\n", 2071 | "3 30557 94345\n", 2072 | "4 4239 56801\n", 2073 | "5 21 3044\n", 2074 | "6 26 3307\n", 2075 | "7 0 9" 2076 | ] 2077 | }, 2078 | "execution_count": 65, 2079 | "metadata": {}, 2080 | "output_type": "execute_result" 2081 | } 2082 | ], 2083 | "source": [ 2084 | "# KNN\n", 2085 | "rlp_label_model.pivot_table(index =['PROPERTY_STYLE'],\n", 2086 | " values =['KNN_CORRECT'],\n", 2087 | " aggfunc =['sum', 'count'])" 2088 | ] 2089 | }, 2090 | { 2091 | "cell_type": "code", 2092 | "execution_count": 66, 2093 | "id": "77293b0f", 2094 | "metadata": { 2095 | "scrolled": true 2096 | }, 2097 | "outputs": [ 2098 | { 2099 | "data": { 2100 | "text/html": [ 2101 | "
\n", 2102 | "\n", 2119 | "\n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | "
sumcount
RF_CORRECTRF_CORRECT
PROPERTY_STYLE
1498695511076
2027778
37530094345
41009656801
5173044
603307
709
\n", 2175 | "
" 2176 | ], 2177 | "text/plain": [ 2178 | " sum count\n", 2179 | " RF_CORRECT RF_CORRECT\n", 2180 | "PROPERTY_STYLE \n", 2181 | "1 498695 511076\n", 2182 | "2 0 27778\n", 2183 | "3 75300 94345\n", 2184 | "4 10096 56801\n", 2185 | "5 17 3044\n", 2186 | "6 0 3307\n", 2187 | "7 0 9" 2188 | ] 2189 | }, 2190 | "execution_count": 66, 2191 | "metadata": {}, 2192 | "output_type": "execute_result" 2193 | } 2194 | ], 2195 | "source": [ 2196 | "# RF\n", 2197 | "rlp_label_model.pivot_table(index =['PROPERTY_STYLE'],\n", 2198 | " values =['RF_CORRECT'],\n", 2199 | " aggfunc =['sum', 'count'])" 2200 | ] 2201 | }, 2202 | { 2203 | "cell_type": "markdown", 2204 | "id": "ac6b7f5a", 2205 | "metadata": {}, 2206 | "source": [ 2207 | "### Accuracy By Province/City" 2208 | ] 2209 | }, 2210 | { 2211 | "cell_type": "code", 2212 | "execution_count": 60, 2213 | "id": "5eb38c12", 2214 | "metadata": { 2215 | "scrolled": true 2216 | }, 2217 | "outputs": [], 2218 | "source": [ 2219 | "rlp_label['PROPERTY_STYLE_PRED_RF'] = rf_clf.predict(X_rlp_label)\n", 2220 | "rlp_label['PROPERTY_STYLE_PRED_KNN'] = knn.predict(X_rlp_label)" 2221 | ] 2222 | }, 2223 | { 2224 | "cell_type": "code", 2225 | "execution_count": 57, 2226 | "id": "4351010f", 2227 | "metadata": {}, 2228 | "outputs": [], 2229 | "source": [ 2230 | "rlp_label['PROPERTY_STYLE_INT'] = rlp_label['PROPERTY_STYLE'].apply(lambda x: int(x[0]))\n", 2231 | "rlp_label['KNN_CORRECT'] = np.where(rlp_label['PROPERTY_STYLE_PRED_KNN']==rlp_label['PROPERTY_STYLE_INT'], 1,0)\n", 2232 | "rlp_label['RF_CORRECT'] = np.where(rlp_label['PROPERTY_STYLE_PRED_RF']==rlp_label['PROPERTY_STYLE_INT'], 1,0)" 2233 | ] 2234 | }, 2235 | { 2236 | "cell_type": "code", 2237 | "execution_count": 59, 2238 | "id": "008b1fea", 2239 | "metadata": {}, 2240 | "outputs": [], 2241 | "source": [ 2242 | "piv = rlp_label.pivot_table(index =['PROVINCE', 'CITY'],\n", 2243 | " values =['KNN_CORRECT'],\n", 2244 | " aggfunc = ['sum', 'count'])\n", 2245 | "\n", 2246 | 
"piv.to_csv('rlp_label_accuracy_bycity.csv')" 2247 | ] 2248 | }, 2249 | { 2250 | "cell_type": "markdown", 2251 | "id": "705dc5e9", 2252 | "metadata": {}, 2253 | "source": [ 2254 | "# Use model on rlp_nolabel" 2255 | ] 2256 | }, 2257 | { 2258 | "cell_type": "code", 2259 | "execution_count": 34, 2260 | "id": "f0d69426", 2261 | "metadata": {}, 2262 | "outputs": [], 2263 | "source": [ 2264 | "rlp_nolabel['PROPERTY_STYLE_PRED_RF'] = rf_clf.predict(rlp_nolabel_model)\n", 2265 | "rlp_nolabel['PROPERTY_STYLE_PRED_KNN'] = knn.predict(rlp_nolabel_model)" 2266 | ] 2267 | }, 2268 | { 2269 | "cell_type": "code", 2270 | "execution_count": 72, 2271 | "id": "fb5cd119", 2272 | "metadata": { 2273 | "scrolled": false 2274 | }, 2275 | "outputs": [ 2276 | { 2277 | "data": { 2278 | "text/html": [ 2279 | "
\n", 2280 | "\n", 2297 | "\n", 2298 | " \n", 2299 | " \n", 2300 | " \n", 2301 | " \n", 2302 | " \n", 2303 | " \n", 2304 | " \n", 2305 | " \n", 2306 | " \n", 2307 | " \n", 2308 | " \n", 2309 | " \n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | "
count
FULL_ADDRESS
PROPERTY_STYLE_PRED_KNN
1453644
210382
347734
421572
5332
6505
74
\n", 2343 | "
" 2344 | ], 2345 | "text/plain": [ 2346 | " count\n", 2347 | " FULL_ADDRESS\n", 2348 | "PROPERTY_STYLE_PRED_KNN \n", 2349 | "1 453644\n", 2350 | "2 10382\n", 2351 | "3 47734\n", 2352 | "4 21572\n", 2353 | "5 332\n", 2354 | "6 505\n", 2355 | "7 4" 2356 | ] 2357 | }, 2358 | "execution_count": 72, 2359 | "metadata": {}, 2360 | "output_type": "execute_result" 2361 | } 2362 | ], 2363 | "source": [ 2364 | "# KNN Counts\n", 2365 | "rlp_nolabel.pivot_table(index =['PROPERTY_STYLE_PRED_KNN'],\n", 2366 | " values =['FULL_ADDRESS'],\n", 2367 | " aggfunc =['count'])" 2368 | ] 2369 | }, 2370 | { 2371 | "cell_type": "code", 2372 | "execution_count": 73, 2373 | "id": "0b7bd803", 2374 | "metadata": {}, 2375 | "outputs": [ 2376 | { 2377 | "data": { 2378 | "text/html": [ 2379 | "
\n", 2380 | "\n", 2397 | "\n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | "
count
FULL_ADDRESS
PROPERTY_STYLE_PRED_RF
1432651
382491
418887
5144
\n", 2431 | "
" 2432 | ], 2433 | "text/plain": [ 2434 | " count\n", 2435 | " FULL_ADDRESS\n", 2436 | "PROPERTY_STYLE_PRED_RF \n", 2437 | "1 432651\n", 2438 | "3 82491\n", 2439 | "4 18887\n", 2440 | "5 144" 2441 | ] 2442 | }, 2443 | "execution_count": 73, 2444 | "metadata": {}, 2445 | "output_type": "execute_result" 2446 | } 2447 | ], 2448 | "source": [ 2449 | "# RF Counts\n", 2450 | "rlp_nolabel.pivot_table(index =['PROPERTY_STYLE_PRED_RF'],\n", 2451 | " values =['FULL_ADDRESS'],\n", 2452 | " aggfunc =['count'])" 2453 | ] 2454 | }, 2455 | { 2456 | "cell_type": "code", 2457 | "execution_count": 37, 2458 | "id": "d2a1a38c", 2459 | "metadata": {}, 2460 | "outputs": [], 2461 | "source": [ 2462 | "# Predicted Property Style by City\n", 2463 | "pred_count_bycity = rlp_nolabel[['CITY', 'PROPERTY_STYLE_PRED_KNN']].groupby('CITY').nunique()\n", 2464 | "pred_count_bycity.reset_index(inplace = True)\n", 2465 | "pred_count_bycity.sort_values('PROPERTY_STYLE_PRED_KNN').to_csv('propertystyle_count_bycity.csv')" 2466 | ] 2467 | }, 2468 | { 2469 | "cell_type": "code", 2470 | "execution_count": 74, 2471 | "id": "7b198d73", 2472 | "metadata": { 2473 | "scrolled": false 2474 | }, 2475 | "outputs": [ 2476 | { 2477 | "data": { 2478 | "text/html": [ 2479 | "
\n", 2480 | "\n", 2493 | "\n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | " \n", 2514 | " \n", 2515 | " \n", 2516 | " \n", 2517 | " \n", 2518 | " \n", 2519 | " \n", 2520 | " \n", 2521 | " \n", 2522 | " \n", 2523 | " \n", 2524 | " \n", 2525 | " \n", 2526 | "
FULL_ADDRESS
PROPERTY_STYLE_PRED_KNN
112047
21531
312625
4955
71
\n", 2527 | "
" 2528 | ], 2529 | "text/plain": [ 2530 | " FULL_ADDRESS\n", 2531 | "PROPERTY_STYLE_PRED_KNN \n", 2532 | "1 12047\n", 2533 | "2 1531\n", 2534 | "3 12625\n", 2535 | "4 955\n", 2536 | "7 1" 2537 | ] 2538 | }, 2539 | "execution_count": 74, 2540 | "metadata": {}, 2541 | "output_type": "execute_result" 2542 | } 2543 | ], 2544 | "source": [ 2545 | "# Toronto counts\n", 2546 | "rlp_nolabel[(rlp_nolabel['CITY']=='TORONTO')][['PROPERTY_STYLE_PRED_KNN', 'FULL_ADDRESS']].groupby('PROPERTY_STYLE_PRED_KNN').count()" 2547 | ] 2548 | }, 2549 | { 2550 | "cell_type": "markdown", 2551 | "id": "5c793af8", 2552 | "metadata": {}, 2553 | "source": [ 2554 | "# Pull Sample of records to check" 2555 | ] 2556 | }, 2557 | { 2558 | "cell_type": "code", 2559 | "execution_count": 154, 2560 | "id": "c3be63c8", 2561 | "metadata": {}, 2562 | "outputs": [], 2563 | "source": [ 2564 | "# Take sample of these records and see which one is correct\n", 2565 | "n_samples = [[1,44], [2,6], [3,46], [4,4], [7,1]]\n", 2566 | "check_samples = pd.DataFrame()\n", 2567 | "\n", 2568 | "for i in n_samples:\n", 2569 | " prop_style = i[0]\n", 2570 | " n = i[1]\n", 2571 | " \n", 2572 | " samples = rlp_nolabel[(rlp_nolabel['PROPERTY_STYLE_PRED_KNN']==prop_style) & (rlp_nolabel['CITY']=='TORONTO')].sample(n=n)\n", 2573 | " check_samples = pd.concat([check_samples, samples])" 2574 | ] 2575 | }, 2576 | { 2577 | "cell_type": "code", 2578 | "execution_count": 53, 2579 | "id": "3a1a645b", 2580 | "metadata": {}, 2581 | "outputs": [], 2582 | "source": [ 2583 | "# Need 6 more records\n", 2584 | "n_samples = [[1,2], [2,1], [3,2], [4,1], [7,0]]\n", 2585 | "check_samples = pd.DataFrame()\n", 2586 | "\n", 2587 | "for i in n_samples:\n", 2588 | " prop_style = i[0]\n", 2589 | " n = i[1]\n", 2590 | " \n", 2591 | " samples = rlp_nolabel[(rlp_nolabel['PROPERTY_STYLE_PRED_KNN']==prop_style) & (rlp_nolabel['CITY']=='TORONTO')].sample(n=n)\n", 2592 | " check_samples = pd.concat([check_samples, samples])" 2593 | ] 2594 | }, 2595 | { 2596 | 
"cell_type": "code", 2597 | "execution_count": 155, 2598 | "id": "00e6d64a", 2599 | "metadata": {}, 2600 | "outputs": [], 2601 | "source": [ 2602 | "check_samples.to_csv('check_samples.csv')" 2603 | ] 2604 | } 2605 | ], 2606 | "metadata": { 2607 | "kernelspec": { 2608 | "display_name": "Python 3 (ipykernel)", 2609 | "language": "python", 2610 | "name": "python3" 2611 | }, 2612 | "language_info": { 2613 | "codemirror_mode": { 2614 | "name": "ipython", 2615 | "version": 3 2616 | }, 2617 | "file_extension": ".py", 2618 | "mimetype": "text/x-python", 2619 | "name": "python", 2620 | "nbconvert_exporter": "python", 2621 | "pygments_lexer": "ipython3", 2622 | "version": "3.9.12" 2623 | } 2624 | }, 2625 | "nbformat": 4, 2626 | "nbformat_minor": 5 2627 | } 2628 | --------------------------------------------------------------------------------