├── 3.Decision Tree Using ID3.ipynb ├── ID3+-+Algorithm+ID3(Examples,+TargetAttribute,+Attributes).jpg ├── PlayTennis.csv ├── README.md └── _config.yml /3.Decision Tree Using ID3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "raw", 5 | "metadata": {}, 6 | "source": [ 7 | "Write a program to demonstrate the working of the decision tree based ID3\n", 8 | "algorithm. Use an appropriate data set for building the decision tree and \n", 9 | "apply this knowledge to classify a new sample." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Import Play Tennis Data " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 149, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "\n", 29 | " Given Play Tennis Data Set:\n", 30 | "\n", 31 | " PlayTennis Outlook Temperature Humidity Wind\n", 32 | "0 No Sunny Hot High Weak\n", 33 | "1 No Sunny Hot High Strong\n", 34 | "2 Yes Overcast Hot High Weak\n", 35 | "3 Yes Rain Mild High Weak\n", 36 | "4 Yes Rain Cool Normal Weak\n", 37 | "5 No Rain Cool Normal Strong\n", 38 | "6 Yes Overcast Cool Normal Strong\n", 39 | "7 No Sunny Mild High Weak\n", 40 | "8 Yes Sunny Cool Normal Weak\n", 41 | "9 Yes Rain Mild Normal Weak\n", 42 | "10 Yes Sunny Mild Normal Strong\n", 43 | "11 Yes Overcast Mild High Strong\n", 44 | "12 Yes Overcast Hot Normal Weak\n", 45 | "13 No Rain Mild High Strong\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "# Author : Dr.Thyagaraju G S , Context Innovations Lab , DEpt of CSE , SDMIT - Ujire \n", 51 | "# Date : July 11 2018 \n", 52 | "import pandas as pd\n", 53 | "from pandas import DataFrame \n", 54 | "df_tennis = DataFrame.from_csv('C:\\\\Users\\\\Dr.Thyagaraju\\\\Desktop\\\\Data\\\\PlayTennis.csv')\n", 55 | "print(\"\\n Given Play Tennis Data Set:\\n\\n\", df_tennis)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 206, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "'PlayTennis'" 67 | ] 68 | }, 69 | "execution_count": 206, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "#df_tennis.columns[0]\n", 76 | "df_tennis.keys()[0]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# Entropy of the Training Data Set" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 215, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "\n", 96 | " INPUT DATA SET FOR ENTROPY CALCULATION:\n", 97 | " 0 No\n", 98 | "1 No\n", 99 | "2 Yes\n", 100 | "3 Yes\n", 101 | "4 Yes\n", 102 | "5 No\n", 103 | "6 Yes\n", 104 | "7 No\n", 105 | "8 Yes\n", 106 | "9 Yes\n", 107 | "10 Yes\n", 108 | "11 Yes\n", 109 | "12 Yes\n", 110 | "13 No\n", 111 | "Name: PlayTennis, dtype: object\n", 112 | "\n", 113 | " Number of Instances of the Current Sub Class is 14.0:\n", 114 | "\n", 115 | " Classes: No Yes\n", 116 | " \n", 117 | " Probabilities of Class No is 0.35714285714285715:\n", 118 | " \n", 119 | " Probabilities of Class Yes is 0.6428571428571429:\n", 120 | "\n", 121 | " Total Entropy of PlayTennis Data Set: 0.9402859586706309\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "#Function to calculate the entropy of probaility of observations\n", 127 | "# -p*log2*p\n", 128 | "\n", 129 | "def entropy(probs): \n", 130 | " import math\n", 131 | " return sum( [-prob*math.log(prob, 2) for prob in probs] )\n", 132 | "\n", 133 | "#Function to calulate the entropy of the given Data Sets/List with respect to target attributes\n", 134 | "def entropy_of_list(a_list): \n", 135 | " #print(\"A-list\",a_list)\n", 136 | " from collections import Counter\n", 137 | " cnt = Counter(x for x in a_list) # Counter calculates the propotion of class\n", 138 | " # print(\"\\nClasses:\",cnt)\n", 139 | " #print(\"No and Yes Classes:\",a_list.name,cnt)\n", 140 | " num_instances = len(a_list)*1.0 # = 14\n", 141 | " print(\"\\n Number of Instances of the Current Sub Class is {0}:\".format(num_instances ))\n", 142 | " probs = [x / num_instances for x in cnt.values()] # x means no of YES/NO\n", 143 | " print(\"\\n Classes:\",min(cnt),max(cnt))\n", 144 | " print(\" \\n Probabilities of Class {0} is {1}:\".format(min(cnt),min(probs)))\n", 145 | " print(\" \\n Probabilities of Class {0} is {1}:\".format(max(cnt),max(probs)))\n", 146 | " return entropy(probs) # Call Entropy :\n", 147 | " \n", 148 | "# The initial entropy of the YES/NO attribute for our dataset.\n", 149 | "print(\"\\n INPUT DATA SET FOR ENTROPY CALCULATION:\\n\", df_tennis['PlayTennis'])\n", 150 | "\n", 151 | "total_entropy = entropy_of_list(df_tennis['PlayTennis'])\n", 152 | "\n", 153 | "print(\"\\n Total Entropy of PlayTennis Data Set:\",total_entropy)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "# Information Gain of Attributes " 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 216, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "Information Gain Calculation of Outlook\n", 173 | "\n", 174 | " Number of Instances of the Current Sub Class is 4.0:\n", 175 | "\n", 176 | " Classes: Yes Yes\n", 177 | " \n", 178 | " Probabilities of Class Yes is 1.0:\n", 179 | " \n", 180 | " Probabilities of Class Yes is 1.0:\n", 181 | "\n", 182 | " Number of Instances of the Current Sub Class is 5.0:\n", 183 | "\n", 184 | " Classes: No Yes\n", 185 | " \n", 186 | " Probabilities of Class No is 0.4:\n", 187 | " \n", 188 | " Probabilities of Class Yes is 0.6:\n", 189 | "\n", 190 | " Number of Instances of the Current Sub Class is 5.0:\n", 191 | "\n", 192 | " Classes: No Yes\n", 193 | " \n", 194 | " Probabilities of Class No is 0.4:\n", 195 | " \n", 196 | " Probabilities of Class Yes is 0.6:\n", 197 | "\n", 198 | " Number of Instances of the Current Sub Class is 14.0:\n", 199 | "\n", 200 | " Classes: No Yes\n", 201 | " \n", 202 | " Probabilities of Class No is 0.35714285714285715:\n", 203 | " \n", 204 | " Probabilities of Class Yes is 0.6428571428571429:\n", 205 | "Info-gain for Outlook is :0.246749819774 \n", 206 | "\n", 207 | "Information Gain Calculation of Humidity\n", 208 | "\n", 209 | " Number of Instances of the Current Sub Class is 7.0:\n", 210 | "\n", 211 | " Classes: No Yes\n", 212 | " \n", 213 | " Probabilities of Class No is 0.42857142857142855:\n", 214 | " \n", 215 | " Probabilities of Class Yes is 0.5714285714285714:\n", 216 | "\n", 217 | " Number of Instances of the Current Sub Class is 7.0:\n", 218 | "\n", 219 | " Classes: No Yes\n", 220 | " \n", 221 | " Probabilities of Class No is 0.14285714285714285:\n", 222 | " \n", 223 | " Probabilities of Class Yes is 0.8571428571428571:\n", 224 | "\n", 225 | " Number of Instances of the Current Sub Class is 14.0:\n", 226 | "\n", 227 | " Classes: No Yes\n", 228 | " \n", 229 | " Probabilities of Class No is 0.35714285714285715:\n", 230 | " \n", 231 | " Probabilities of Class Yes is 0.6428571428571429:\n", 232 | "\n", 233 | " Info-gain for Humidity is: 0.151835501362 \n", 234 | "\n", 235 | "Information Gain Calculation of Wind\n", 236 | "\n", 237 | " Number of Instances of the Current Sub Class is 6.0:\n", 238 | "\n", 239 | " Classes: No Yes\n", 240 | " \n", 241 | " Probabilities of Class No is 0.5:\n", 242 | " \n", 243 | " Probabilities of Class Yes is 0.5:\n", 244 | "\n", 245 | " Number of Instances of the Current Sub Class is 8.0:\n", 246 | "\n", 247 | " Classes: No Yes\n", 248 | " \n", 249 | " Probabilities of Class No is 0.25:\n", 250 | " \n", 251 | " Probabilities of Class Yes is 0.75:\n", 252 | "\n", 253 | " Number of Instances of the Current Sub Class is 14.0:\n", 254 | "\n", 255 | " Classes: No Yes\n", 256 | " \n", 257 | " Probabilities of Class No is 0.35714285714285715:\n", 258 | " \n", 259 | " Probabilities of Class Yes is 0.6428571428571429:\n", 260 | "\n", 261 | " Info-gain for Wind is:0.0481270304083 \n", 262 | "\n", 263 | "Information Gain Calculation of Temperature\n", 264 | "\n", 265 | " Number of Instances of the Current Sub Class is 4.0:\n", 266 | "\n", 267 | " Classes: No Yes\n", 268 | " \n", 269 | " Probabilities of Class No is 0.25:\n", 270 | " \n", 271 | " Probabilities of Class Yes is 0.75:\n", 272 | "\n", 273 | " Number of Instances of the Current Sub Class is 4.0:\n", 274 | "\n", 275 | " Classes: No Yes\n", 276 | " \n", 277 | " Probabilities of Class No is 0.5:\n", 278 | " \n", 279 | " Probabilities of Class Yes is 0.5:\n", 280 | "\n", 281 | " Number of Instances of the Current Sub Class is 6.0:\n", 282 | "\n", 283 | " Classes: No Yes\n", 284 | " \n", 285 | " Probabilities of Class No is 0.3333333333333333:\n", 286 | " \n", 287 | " Probabilities of Class Yes is 0.6666666666666666:\n", 288 | "\n", 289 | " Number of Instances of the Current Sub Class is 14.0:\n", 290 | "\n", 291 | " Classes: No Yes\n", 292 | " \n", 293 | " Probabilities of Class No is 0.35714285714285715:\n", 294 | " \n", 295 | " Probabilities of Class Yes is 0.6428571428571429:\n", 296 | "\n", 297 | " Info-gain for Temperature is:0.029222565659 \n", 298 | "\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "def information_gain(df, split_attribute_name, target_attribute_name, trace=0):\n", 304 | " print(\"Information Gain Calculation of \",split_attribute_name)\n", 305 | " '''\n", 306 | " Takes a DataFrame of attributes, and quantifies the entropy of a target\n", 307 | " attribute after performing a split along the values of another attribute.\n", 308 | " '''\n", 309 | " # Split Data by Possible Vals of Attribute:\n", 310 | " df_split = df.groupby(split_attribute_name)\n", 311 | " # for name,group in df_split:\n", 312 | " # print(\"Name:\\n\",name)\n", 313 | " # print(\"Group:\\n\",group)\n", 314 | " \n", 315 | " # Calculate Entropy for Target Attribute, as well as\n", 316 | " # Proportion of Obs in Each Data-Split\n", 317 | " nobs = len(df.index) * 1.0\n", 318 | " # print(\"NOBS\",nobs)\n", 319 | " df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]\n", 320 | " #print([target_attribute_name])\n", 321 | " #print(\" Entropy List \",entropy_of_list)\n", 322 | " #print(\"DFAGGENT\",df_agg_ent)\n", 323 | " df_agg_ent.columns = ['Entropy', 'PropObservations']\n", 324 | " #if trace: # helps understand what fxn is doing:\n", 325 | " # print(df_agg_ent)\n", 326 | " \n", 327 | " # Calculate Information Gain:\n", 328 | " new_entropy = sum( df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )\n", 329 | " old_entropy = entropy_of_list(df[target_attribute_name])\n", 330 | " return old_entropy - new_entropy\n", 331 | "\n", 332 | "\n", 333 | "print('Info-gain for Outlook is :'+str( information_gain(df_tennis, 'Outlook', 'PlayTennis')),\"\\n\")\n", 334 | "print('\\n Info-gain for Humidity is: ' + str( information_gain(df_tennis, 'Humidity', 'PlayTennis')),\"\\n\")\n", 335 | "print('\\n Info-gain for Wind is:' + str( information_gain(df_tennis, 'Wind', 'PlayTennis')),\"\\n\")\n", 336 | "print('\\n Info-gain for Temperature is:' + str( information_gain(df_tennis, 'Temperature','PlayTennis')),\"\\n\")" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "# ID3 Algorithm" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 217, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "def id3(df, target_attribute_name, attribute_names, default_class=None):\n", 355 | " \n", 356 | " ## Tally target attribute:\n", 357 | " from collections import Counter\n", 358 | " cnt = Counter(x for x in df[target_attribute_name])# class of YES /NO\n", 359 | " \n", 360 | " ## First check: Is this split of the dataset homogeneous?\n", 361 | " if len(cnt) == 1:\n", 362 | " return next(iter(cnt)) # next input data set, or raises StopIteration when EOF is hit.\n", 363 | " \n", 364 | " ## Second check: Is this split of the dataset empty?\n", 365 | " # if yes, return a default value\n", 366 | " elif df.empty or (not attribute_names):\n", 367 | " return default_class # Return None for Empty Data Set\n", 368 | " \n", 369 | " ## Otherwise: This dataset is ready to be devied up!\n", 370 | " else:\n", 371 | " # Get Default Value for next recursive call of this function:\n", 372 | " default_class = max(cnt.keys()) #No of YES and NO Class\n", 373 | " # Compute the Information Gain of the attributes:\n", 374 | " gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] #\n", 375 | " index_of_max = gainz.index(max(gainz)) # Index of Best Attribute\n", 376 | " # Choose Best Attribute to split on:\n", 377 | " best_attr = attribute_names[index_of_max]\n", 378 | " \n", 379 | " # Create an empty tree, to be populated in a moment\n", 380 | " tree = {best_attr:{}} # Iniiate the tree with best attribute as a node \n", 381 | " remaining_attribute_names = [i for i in attribute_names if i != best_attr]\n", 382 | " \n", 383 | " # Split dataset\n", 384 | " # On each split, recursively call this algorithm.\n", 385 | " # populate the empty tree with subtrees, which\n", 386 | " # are the result of the recursive call\n", 387 | " for attr_val, data_subset in df.groupby(best_attr):\n", 388 | " subtree = id3(data_subset,\n", 389 | " target_attribute_name,\n", 390 | " remaining_attribute_names,\n", 391 | " default_class)\n", 392 | " tree[best_attr][attr_val] = subtree\n", 393 | " return tree" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "# Predicting Attributes" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 218, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "List of Attributes: ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']\n", 413 | "Predicting Attributes: ['Outlook', 'Temperature', 'Humidity', 'Wind']\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "# Get Predictor Names (all but 'class')\n", 419 | "attribute_names = list(df_tennis.columns)\n", 420 | "print(\"List of Attributes:\", attribute_names) \n", 421 | "attribute_names.remove('PlayTennis') #Remove the class attribute \n", 422 | "print(\"Predicting Attributes:\", attribute_names)" 423 | ] 424 | }, 425 | { 426 | "cell_type": "raw", 427 | "metadata": {}, 428 | "source": [ 429 | "# Tree Construction" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 219, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "Information Gain Calculation of Outlook\n", 442 | "\n", 443 | " Number of Instances of the Current Sub Class is 4.0:\n", 444 | "\n", 445 | " Classes: Yes Yes\n", 446 | " \n", 447 | " Probabilities of Class Yes is 1.0:\n", 448 | " \n", 449 | " Probabilities of Class Yes is 1.0:\n", 450 | "\n", 451 | " Number of Instances of the Current Sub Class is 5.0:\n", 452 | "\n", 453 | " Classes: No Yes\n", 454 | " \n", 455 | " Probabilities of Class No is 0.4:\n", 456 | " \n", 457 | " Probabilities of Class Yes is 0.6:\n", 458 | "\n", 459 | " Number of Instances of the Current Sub Class is 5.0:\n", 460 | "\n", 461 | " Classes: No Yes\n", 462 | " \n", 463 | " Probabilities of Class No is 0.4:\n", 464 | " \n", 465 | " Probabilities of Class Yes is 0.6:\n", 466 | "\n", 467 | " Number of Instances of the Current Sub Class is 14.0:\n", 468 | "\n", 469 | " Classes: No Yes\n", 470 | " \n", 471 | " Probabilities of Class No is 0.35714285714285715:\n", 472 | " \n", 473 | " Probabilities of Class Yes is 0.6428571428571429:\n", 474 | "Information Gain Calculation of Temperature\n", 475 | "\n", 476 | " Number of Instances of the Current Sub Class is 4.0:\n", 477 | "\n", 478 | " Classes: No Yes\n", 479 | " \n", 480 | " Probabilities of Class No is 0.25:\n", 481 | " \n", 482 | " Probabilities of Class Yes is 0.75:\n", 483 | "\n", 484 | " Number of Instances of the Current Sub Class is 4.0:\n", 485 | "\n", 486 | " Classes: No Yes\n", 487 | " \n", 488 | " Probabilities of Class No is 0.5:\n", 489 | " \n", 490 | " Probabilities of Class Yes is 0.5:\n", 491 | "\n", 492 | " Number of Instances of the Current Sub Class is 6.0:\n", 493 | "\n", 494 | " Classes: No Yes\n", 495 | " \n", 496 | " Probabilities of Class No is 0.3333333333333333:\n", 497 | " \n", 498 | " Probabilities of Class Yes is 0.6666666666666666:\n", 499 | "\n", 500 | " Number of Instances of the Current Sub Class is 14.0:\n", 501 | "\n", 502 | " Classes: No Yes\n", 503 | " \n", 504 | " Probabilities of Class No is 0.35714285714285715:\n", 505 | " \n", 506 | " Probabilities of Class Yes is 0.6428571428571429:\n", 507 | "Information Gain Calculation of Humidity\n", 508 | "\n", 509 | " Number of Instances of the Current Sub Class is 7.0:\n", 510 | "\n", 511 | " Classes: No Yes\n", 512 | " \n", 513 | " Probabilities of Class No is 0.42857142857142855:\n", 514 | " \n", 515 | " Probabilities of Class Yes is 0.5714285714285714:\n", 516 | "\n", 517 | " Number of Instances of the Current Sub Class is 7.0:\n", 518 | "\n", 519 | " Classes: No Yes\n", 520 | " \n", 521 | " Probabilities of Class No is 0.14285714285714285:\n", 522 | " \n", 523 | " Probabilities of Class Yes is 0.8571428571428571:\n", 524 | "\n", 525 | " Number of Instances of the Current Sub Class is 14.0:\n", 526 | "\n", 527 | " Classes: No Yes\n", 528 | " \n", 529 | " Probabilities of Class No is 0.35714285714285715:\n", 530 | " \n", 531 | " Probabilities of Class Yes is 0.6428571428571429:\n", 532 | "Information Gain Calculation of Wind\n", 533 | "\n", 534 | " Number of Instances of the Current Sub Class is 6.0:\n", 535 | "\n", 536 | " Classes: No Yes\n", 537 | " \n", 538 | " Probabilities of Class No is 0.5:\n", 539 | " \n", 540 | " Probabilities of Class Yes is 0.5:\n", 541 | "\n", 542 | " Number of Instances of the Current Sub Class is 8.0:\n", 543 | "\n", 544 | " Classes: No Yes\n", 545 | " \n", 546 | " Probabilities of Class No is 0.25:\n", 547 | " \n", 548 | " Probabilities of Class Yes is 0.75:\n", 549 | "\n", 550 | " Number of Instances of the Current Sub Class is 14.0:\n", 551 | "\n", 552 | " Classes: No Yes\n", 553 | " \n", 554 | " Probabilities of Class No is 0.35714285714285715:\n", 555 | " \n", 556 | " Probabilities of Class Yes is 0.6428571428571429:\n", 557 | "Information Gain Calculation of Temperature\n", 558 | "\n", 559 | " Number of Instances of the Current Sub Class is 2.0:\n", 560 | "\n", 561 | " Classes: No Yes\n", 562 | " \n", 563 | " Probabilities of Class No is 0.5:\n", 564 | " \n", 565 | " Probabilities of Class Yes is 0.5:\n", 566 | "\n", 567 | " Number of Instances of the Current Sub Class is 3.0:\n", 568 | "\n", 569 | " Classes: No Yes\n", 570 | " \n", 571 | " Probabilities of Class No is 0.3333333333333333:\n", 572 | " \n", 573 | " Probabilities of Class Yes is 0.6666666666666666:\n", 574 | "\n", 575 | " Number of Instances of the Current Sub Class is 5.0:\n", 576 | "\n", 577 | " Classes: No Yes\n", 578 | " \n", 579 | " Probabilities of Class No is 0.4:\n", 580 | " \n", 581 | " Probabilities of Class Yes is 0.6:\n", 582 | "Information Gain Calculation of Humidity\n", 583 | "\n", 584 | " Number of Instances of the Current Sub Class is 2.0:\n", 585 | "\n", 586 | " Classes: No Yes\n", 587 | " \n", 588 | " Probabilities of Class No is 0.5:\n", 589 | " \n", 590 | " Probabilities of Class Yes is 0.5:\n", 591 | "\n", 592 | " Number of Instances of the Current Sub Class is 3.0:\n", 593 | "\n", 594 | " Classes: No Yes\n", 595 | " \n", 596 | " Probabilities of Class No is 0.3333333333333333:\n", 597 | " \n", 598 | " Probabilities of Class Yes is 0.6666666666666666:\n", 599 | "\n", 600 | " Number of Instances of the Current Sub Class is 5.0:\n", 601 | "\n", 602 | " Classes: No Yes\n", 603 | " \n", 604 | " Probabilities of Class No is 0.4:\n", 605 | " \n", 606 | " Probabilities of Class Yes is 0.6:\n", 607 | "Information Gain Calculation of Wind\n", 608 | "\n", 609 | " Number of Instances of the Current Sub Class is 2.0:\n", 610 | "\n", 611 | " Classes: No No\n", 612 | " \n", 613 | " Probabilities of Class No is 1.0:\n", 614 | " \n", 615 | " Probabilities of Class No is 1.0:\n", 616 | "\n", 617 | " Number of Instances of the Current Sub Class is 3.0:\n", 618 | "\n", 619 | " Classes: Yes Yes\n", 620 | " \n", 621 | " Probabilities of Class Yes is 1.0:\n", 622 | " \n", 623 | " Probabilities of Class Yes is 1.0:\n", 624 | "\n", 625 | " Number of Instances of the Current Sub Class is 5.0:\n", 626 | "\n", 627 | " Classes: No Yes\n", 628 | " \n", 629 | " Probabilities of Class No is 0.4:\n", 630 | " \n", 631 | " Probabilities of Class Yes is 0.6:\n", 632 | "Information Gain Calculation of Temperature\n", 633 | "\n", 634 | " Number of Instances of the Current Sub Class is 1.0:\n", 635 | "\n", 636 | " Classes: Yes Yes\n", 637 | " \n", 638 | " Probabilities of Class Yes is 1.0:\n", 639 | " \n", 640 | " Probabilities of Class Yes is 1.0:\n", 641 | "\n", 642 | " Number of Instances of the Current Sub Class is 2.0:\n", 643 | "\n", 644 | " Classes: No No\n", 645 | " \n", 646 | " Probabilities of Class No is 1.0:\n", 647 | " \n", 648 | " Probabilities of Class No is 1.0:\n", 649 | "\n", 650 | " Number of Instances of the Current Sub Class is 2.0:\n", 651 | "\n", 652 | " Classes: No Yes\n", 653 | " \n", 654 | " Probabilities of Class No is 0.5:\n", 655 | " \n", 656 | " Probabilities of Class Yes is 0.5:\n", 657 | "\n", 658 | " Number of Instances of the Current Sub Class is 5.0:\n", 659 | "\n", 660 | " Classes: No Yes\n", 661 | " \n", 662 | " Probabilities of Class No is 0.4:\n", 663 | " \n", 664 | " Probabilities of Class Yes is 0.6:\n", 665 | "Information Gain Calculation of Humidity\n", 666 | "\n", 667 | " Number of Instances of the Current Sub Class is 3.0:\n", 668 | "\n", 669 | " Classes: No No\n", 670 | " \n", 671 | " Probabilities of Class No is 1.0:\n", 672 | " \n", 673 | " Probabilities of Class No is 1.0:\n", 674 | "\n", 675 | " Number of Instances of the Current Sub Class is 2.0:\n", 676 | "\n", 677 | " Classes: Yes Yes\n", 678 | " \n", 679 | " Probabilities of Class Yes is 1.0:\n", 680 | " \n", 681 | " Probabilities of Class Yes is 1.0:\n", 682 | "\n", 683 | " Number of Instances of the Current Sub Class is 5.0:\n", 684 | "\n", 685 | " Classes: No Yes\n", 686 | " \n", 687 | " Probabilities of Class No is 0.4:\n", 688 | " \n", 689 | " Probabilities of Class Yes is 0.6:\n", 690 | "Information Gain Calculation of Wind\n", 691 | "\n", 692 | " Number of Instances of the Current Sub Class is 2.0:\n", 693 | "\n", 694 | " Classes: No Yes\n", 695 | " \n", 696 | " Probabilities of Class No is 0.5:\n", 697 | " \n", 698 | " Probabilities of Class Yes is 0.5:\n", 699 | "\n", 700 | " Number of Instances of the Current Sub Class is 3.0:\n", 701 | "\n", 702 | " Classes: No Yes\n", 703 | " \n", 704 | " Probabilities of Class No is 0.3333333333333333:\n", 705 | " \n", 706 | " Probabilities of Class Yes is 0.6666666666666666:\n", 707 | "\n", 708 | " Number of Instances of the Current Sub Class is 5.0:\n", 709 | "\n", 710 | " Classes: No Yes\n", 711 | " \n", 712 | " Probabilities of Class No is 0.4:\n", 713 | " \n", 714 | " Probabilities of Class Yes is 0.6:\n", 715 | "\n", 716 | "\n", 717 | "The Resultant Decision Tree is :\n", 718 | "\n", 719 | "{'Outlook': {'Overcast': 'Yes',\n", 720 | " 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},\n", 721 | " 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}\n", 722 | "Best Attribute :\n", 723 | " Outlook\n", 724 | "Tree Keys:\n", 725 | " dict_keys(['Overcast', 'Rain', 'Sunny'])\n" 726 | ] 727 | } 728 | ], 729 | "source": [ 730 | "# Run Algorithm:\n", 731 | "from pprint import pprint\n", 732 | "tree = id3(df_tennis,'PlayTennis',attribute_names)\n", 733 | "print(\"\\n\\nThe Resultant Decision Tree is :\\n\")\n", 734 | "#print(tree)\n", 735 | "pprint(tree)\n", 736 | "attribute = next(iter(tree))\n", 737 | "print(\"Best Attribute :\\n\",attribute)\n", 738 | "print(\"Tree Keys:\\n\",tree[attribute].keys())" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": {}, 744 | "source": [ 745 | "# Classification Accuracy" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 220, 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "def classify(instance, tree, default=None): # Instance of Play Tennis with Predicted \n", 755 | " \n", 756 | " #print(\"Instance:\",instance)\n", 757 | " attribute = next(iter(tree)) # Outlook/Humidity/Wind \n", 758 | " print(\"Key:\",tree.keys()) # [Outlook,Humidity,Wind ]\n", 759 | " print(\"Attribute:\",attribute) # [Key /Attribute Both are same ]\n", 760 | " \n", 761 | " # print(\"Insance of Attribute :\",instance[attribute],attribute)\n", 762 | " if instance[attribute] in tree[attribute].keys(): # Value of the attributs in set of Tree keys \n", 763 | " result = tree[attribute][instance[attribute]]\n", 764 | " print(\"Instance Attribute:\",instance[attribute],\"TreeKeys :\",tree[attribute].keys())\n", 765 | " if isinstance(result, dict): # this is a tree, delve deeper\n", 766 | " return classify(instance, result)\n", 767 | " else:\n", 768 | " return result # this is a label\n", 769 | " else:\n", 770 | " return default" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 138, 776 | "metadata": {}, 777 | "outputs": [ 778 | { 779 | "name": "stdout", 780 | "output_type": "stream", 781 | "text": [ 782 | "Key: dict_keys(['Outlook'])\n", 783 | "Attribute: Outlook\n", 784 | "Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 785 | "Key: dict_keys(['Humidity'])\n", 786 | "Attribute: Humidity\n", 787 | "Instance Attribute: High TreeKeys : dict_keys(['High', 'Normal'])\n", 788 | "Key: dict_keys(['Outlook'])\n", 789 | "Attribute: Outlook\n", 790 | "Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 791 | "Key: dict_keys(['Humidity'])\n", 792 | "Attribute: Humidity\n", 793 | "Instance Attribute: High TreeKeys : dict_keys(['High', 'Normal'])\n", 794 | "Key: dict_keys(['Outlook'])\n", 795 | "Attribute: Outlook\n", 796 | "Instance Attribute: Overcast TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 797 | "Key: dict_keys(['Outlook'])\n", 798 | "Attribute: Outlook\n", 799 | "Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 800 | "Key: dict_keys(['Wind'])\n", 801 | "Attribute: Wind\n", 802 | "Instance Attribute: Weak TreeKeys : dict_keys(['Strong', 'Weak'])\n", 803 | "Key: dict_keys(['Outlook'])\n", 804 | "Attribute: Outlook\n", 805 | "Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 806 | "Key: dict_keys(['Wind'])\n", 807 | "Attribute: Wind\n", 808 | "Instance Attribute: Weak TreeKeys : dict_keys(['Strong', 'Weak'])\n", 809 | "Key: dict_keys(['Outlook'])\n", 810 | "Attribute: Outlook\n", 811 | "Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 812 | "Key: dict_keys(['Wind'])\n", 813 | "Attribute: Wind\n", 814 | "Instance Attribute: Strong TreeKeys : dict_keys(['Strong', 'Weak'])\n", 815 | "Key: dict_keys(['Outlook'])\n", 816 | "Attribute: Outlook\n", 817 | "Instance Attribute: Overcast TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 818 | "Key: dict_keys(['Outlook'])\n", 819 | "Attribute: Outlook\n", 820 | "Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 821 | "Key: dict_keys(['Humidity'])\n", 822 | "Attribute: Humidity\n", 823 | "Instance Attribute: High TreeKeys : dict_keys(['High', 'Normal'])\n", 824 | "Key: dict_keys(['Outlook'])\n", 825 | "Attribute: Outlook\n", 826 | "Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 827 | "Key: dict_keys(['Humidity'])\n", 828 | "Attribute: Humidity\n", 829 | "Instance Attribute: Normal TreeKeys : dict_keys(['High', 'Normal'])\n", 830 | "Key: dict_keys(['Outlook'])\n", 831 | "Attribute: Outlook\n", 832 | "Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 833 | "Key: dict_keys(['Wind'])\n", 834 | "Attribute: Wind\n", 835 | "Instance Attribute: Weak TreeKeys : dict_keys(['Strong', 'Weak'])\n", 836 | "Key: dict_keys(['Outlook'])\n", 837 | "Attribute: Outlook\n", 838 | "Instance Attribute: Sunny TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 839 | "Key: dict_keys(['Humidity'])\n", 840 | "Attribute: Humidity\n", 841 | "Instance Attribute: Normal TreeKeys : dict_keys(['High', 'Normal'])\n", 842 | "Key: dict_keys(['Outlook'])\n", 843 | "Attribute: Outlook\n", 844 | "Instance Attribute: Overcast TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 845 | "Key: dict_keys(['Outlook'])\n", 846 | "Attribute: Outlook\n", 847 | "Instance Attribute: Overcast TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 848 | "Key: dict_keys(['Outlook'])\n", 849 | "Attribute: Outlook\n", 850 | "Instance Attribute: Rain TreeKeys : dict_keys(['Overcast', 'Rain', 'Sunny'])\n", 851 | "Key: dict_keys(['Wind'])\n", 852 | "Attribute: Wind\n", 853 | "Instance Attribute: Strong TreeKeys : dict_keys(['Strong', 'Weak'])\n", 854 | "0 No\n", 855 | "1 No\n", 856 | "2 Yes\n", 857 | "3 Yes\n", 858 | "4 Yes\n", 859 | "5 No\n", 860 | "6 Yes\n", 861 | "7 No\n", 862 | "8 Yes\n", 863 | "9 Yes\n", 864 | "10 Yes\n", 865 | "11 Yes\n", 866 | "12 Yes\n", 867 | "13 No\n", 868 | "Name: predicted, dtype: object\n", 869 | "\n", 870 | " Accuracy is:\n", 871 | "1.0\n" 872 | ] 873 | }, 874 | { 875 | "data": { 876 | "text/html": [ 877 | "
\n", 895 | " | PlayTennis | \n", 896 | "predicted | \n", 897 | "
---|---|---|
0 | \n", 902 | "No | \n", 903 | "No | \n", 904 | "
1 | \n", 907 | "No | \n", 908 | "No | \n", 909 | "
2 | \n", 912 | "Yes | \n", 913 | "Yes | \n", 914 | "
3 | \n", 917 | "Yes | \n", 918 | "Yes | \n", 919 | "
4 | \n", 922 | "Yes | \n", 923 | "Yes | \n", 924 | "
5 | \n", 927 | "No | \n", 928 | "No | \n", 929 | "
6 | \n", 932 | "Yes | \n", 933 | "Yes | \n", 934 | "
7 | \n", 937 | "No | \n", 938 | "No | \n", 939 | "
8 | \n", 942 | "Yes | \n", 943 | "Yes | \n", 944 | "
9 | \n", 947 | "Yes | \n", 948 | "Yes | \n", 949 | "
10 | \n", 952 | "Yes | \n", 953 | "Yes | \n", 954 | "
11 | \n", 957 | "Yes | \n", 958 | "Yes | \n", 959 | "
12 | \n", 962 | "Yes | \n", 963 | "Yes | \n", 964 | "
13 | \n", 967 | "No | \n", 968 | "No | \n", 969 | "