├── README.md ├── LICENSE ├── Feature_Preprocessing_for_Categorical_and_Ordinal_Features_The_Most_Important_Step.ipynb └── BreastTissue_Multiclass_Classification.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # blog-posts 2 | This repo contains code from the blog posts that I publish. 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Sabina 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Feature_Preprocessing_for_Categorical_and_Ordinal_Features_The_Most_Important_Step.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Feature Preprocessing for Categorical and Ordinal Features- The Most Important Step.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "qIrbn_FgFkbX", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Feature Preprocessing for Categorical and Ordinal Features- The Most Important Step" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "JPGkbECOFO2r", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "\n", 45 | "\n", 46 | "Dataset from http://archive.ics.uci.edu/ml/datasets/Automobile obtained from UCI Machine Learning Repository.\n", 47 | "\n", 48 | "\n", 49 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "metadata": { 55 | "id": "rBFE9nxxB2qz", 56 | "colab_type": "code", 57 | "colab": {} 58 | }, 59 | "source": [ 60 | "# import pandas\n", 61 | "import pandas as pd\n", 62 | "\n", 63 | "# define column names for the dataset as the dataset we will be importing does not have column names\n", 64 | "columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height',\n", 65 | " 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']\n", 66 | "\n", 67 | "# read the dataset\n", 68 | "df = pd.read_csv('https://query.data.world/s/sdhzwzf6n2ivkvgabugicfo6oxiais', header=None, names=columns)" 69 | ], 70 | "execution_count": 0, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "fDW15YqYB6KJ", 77 | "colab_type": "code", 78 | "colab": { 79 | "base_uri": "https://localhost:8080/", 80 | "height": 309 81 | }, 82 | "outputId": "6e98ee8c-71b5-4daa-b544-05b7bcd3043d" 83 | }, 84 | "source": [ 85 | "df.head()" 86 | ], 87 | "execution_count": 128, 88 | "outputs": [ 89 | { 90 | "output_type": "execute_result", 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " 
\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | "
symbolingnormalized-lossesmakefuel-typeaspirationnum-of-doorsbody-styledrive-wheelsengine-locationwheel-baselengthwidthheightcurb-weightengine-typenum-of-cylindersengine-sizefuel-systemborestrokecompression-ratiohorsepowerpeak-rpmcity-mpghighway-mpgprice
03?alfa-romerogasstdtwoconvertiblerwdfront88.6168.864.148.82548dohcfour130mpfi3.472.689.01115000212713495
13?alfa-romerogasstdtwoconvertiblerwdfront88.6168.864.148.82548dohcfour130mpfi3.472.689.01115000212716500
21?alfa-romerogasstdtwohatchbackrwdfront94.5171.265.552.42823ohcvsix152mpfi2.683.479.01545000192616500
32164audigasstdfoursedanfwdfront99.8176.666.254.32337ohcfour109mpfi3.193.4010.01025500243013950
42164audigasstdfoursedan4wdfront99.4176.666.454.32824ohcfive136mpfi3.193.408.01155500182217450
\n", 287 | "
" 288 | ], 289 | "text/plain": [ 290 | " symboling normalized-losses make ... city-mpg highway-mpg price\n", 291 | "0 3 ? alfa-romero ... 21 27 13495\n", 292 | "1 3 ? alfa-romero ... 21 27 16500\n", 293 | "2 1 ? alfa-romero ... 19 26 16500\n", 294 | "3 2 164 audi ... 24 30 13950\n", 295 | "4 2 164 audi ... 18 22 17450\n", 296 | "\n", 297 | "[5 rows x 26 columns]" 298 | ] 299 | }, 300 | "metadata": { 301 | "tags": [] 302 | }, 303 | "execution_count": 128 304 | } 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "metadata": { 310 | "id": "tmN3VGkPB8l_", 311 | "colab_type": "code", 312 | "colab": { 313 | "base_uri": "https://localhost:8080/", 314 | "height": 203 315 | }, 316 | "outputId": "28c07121-59d3-492c-a0d3-a6d603bb30f0" 317 | }, 318 | "source": [ 319 | "# As this dataset contains a lot of features, let us select a few categorical features for the purpose of demonstration.\n", 320 | "select_columns = ['fuel-type','engine-location','num-of-cylinders']\n", 321 | "\n", 322 | "df = df[select_columns]\n", 323 | "df.head()" 324 | ], 325 | "execution_count": 129, 326 | "outputs": [ 327 | { 328 | "output_type": "execute_result", 329 | "data": { 330 | "text/html": [ 331 | "
\n", 332 | "\n", 345 | "\n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | "
fuel-typeengine-locationnum-of-cylinders
0gasfrontfour
1gasfrontfour
2gasfrontsix
3gasfrontfour
4gasfrontfive
\n", 387 | "
" 388 | ], 389 | "text/plain": [ 390 | " fuel-type engine-location num-of-cylinders\n", 391 | "0 gas front four\n", 392 | "1 gas front four\n", 393 | "2 gas front six\n", 394 | "3 gas front four\n", 395 | "4 gas front five" 396 | ] 397 | }, 398 | "metadata": { 399 | "tags": [] 400 | }, 401 | "execution_count": 129 402 | } 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "QxFIqfzvF1LR", 409 | "colab_type": "code", 410 | "colab": { 411 | "base_uri": "https://localhost:8080/", 412 | "height": 71 413 | }, 414 | "outputId": "42c57254-7936-48b0-d13e-6555834833eb" 415 | }, 416 | "source": [ 417 | "# find unique values for feature fuel-type\n", 418 | "print(df['fuel-type'].unique())\n", 419 | "\n", 420 | "# find unique values for feature engine-location\n", 421 | "print(df['engine-location'].unique())\n", 422 | "\n", 423 | "# find unique values for feature num-of-cylinders\n", 424 | "print(df['num-of-cylinders'].unique())" 425 | ], 426 | "execution_count": 130, 427 | "outputs": [ 428 | { 429 | "output_type": "stream", 430 | "text": [ 431 | "['gas' 'diesel']\n", 432 | "['front' 'rear']\n", 433 | "['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']\n" 434 | ], 435 | "name": "stdout" 436 | } 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "peck2hzVF-hK", 443 | "colab_type": "code", 444 | "colab": { 445 | "base_uri": "https://localhost:8080/", 446 | "height": 203 447 | }, 448 | "outputId": "ff55a7cc-91bd-49ca-fe8e-f346463b6bba" 449 | }, 450 | "source": [ 451 | "# import Label encoder\n", 452 | "from sklearn.preprocessing import LabelEncoder\n", 453 | "\n", 454 | "# create label encoder\n", 455 | "label_encoder = LabelEncoder()\n", 456 | "\n", 457 | "# create a copy of dataset\n", 458 | "df_le = df.copy()\n", 459 | "\n", 460 | "# fit the label encoder and transform the labels in the dataset to create new label encoded features\n", 461 | "df_le['enc-fuel-type'] = 
label_encoder.fit_transform(df_le['fuel-type'])\n", 462 | "df_le['enc-engine-location'] = label_encoder.fit_transform(df_le['engine-location'])\n", 463 | "df_le['enc-num-of-cylinders'] = label_encoder.fit_transform(df_le['num-of-cylinders'])\n", 464 | "\n", 465 | "# drop original categorical features\n", 466 | "columns_to_drop = ['fuel-type','engine-location','num-of-cylinders']\n", 467 | "df_le = df_le.drop(columns=columns_to_drop)\n", 468 | "\n", 469 | "df_le.head()" 470 | ], 471 | "execution_count": 131, 472 | "outputs": [ 473 | { 474 | "output_type": "execute_result", 475 | "data": { 476 | "text/html": [ 477 | "
\n", 478 | "\n", 491 | "\n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | "
enc-fuel-typeenc-engine-locationenc-num-of-cylinders
0102
1102
2103
3102
4101
\n", 533 | "
" 534 | ], 535 | "text/plain": [ 536 | " enc-fuel-type enc-engine-location enc-num-of-cylinders\n", 537 | "0 1 0 2\n", 538 | "1 1 0 2\n", 539 | "2 1 0 3\n", 540 | "3 1 0 2\n", 541 | "4 1 0 1" 542 | ] 543 | }, 544 | "metadata": { 545 | "tags": [] 546 | }, 547 | "execution_count": 131 548 | } 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "metadata": { 554 | "id": "yeUMeFhqKugN", 555 | "colab_type": "code", 556 | "colab": { 557 | "base_uri": "https://localhost:8080/", 558 | "height": 203 559 | }, 560 | "outputId": "435e70c7-6fd2-44c8-d3c9-7ac02f1e3d5d" 561 | }, 562 | "source": [ 563 | "# import OneHotEncoder\n", 564 | "from sklearn.preprocessing import OneHotEncoder\n", 565 | "\n", 566 | "# create one hot encoder\n", 567 | "one_hot_encoder = OneHotEncoder()\n", 568 | "\n", 569 | "# create a copy of the dataset\n", 570 | "df_ohe = df.copy()\n", 571 | "\n", 572 | "# fit one hot encoder\n", 573 | "one_hot_encoder = one_hot_encoder.fit(df_ohe)\n", 574 | "\n", 575 | "# transform dataset \n", 576 | "ohelabels = one_hot_encoder.transform(df_ohe).toarray()\n", 577 | "df_ohe = pd.DataFrame(ohelabels, columns=one_hot_encoder.get_feature_names())\n", 578 | "\n", 579 | "df_ohe.head()\n" 580 | ], 581 | "execution_count": 132, 582 | "outputs": [ 583 | { 584 | "output_type": "execute_result", 585 | "data": { 586 | "text/html": [ 587 | "
\n", 588 | "\n", 601 | "\n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | "
x0_dieselx0_gasx1_frontx1_rearx2_eightx2_fivex2_fourx2_sixx2_threex2_twelvex2_two
00.01.01.00.00.00.01.00.00.00.00.0
10.01.01.00.00.00.01.00.00.00.00.0
20.01.01.00.00.00.00.01.00.00.00.0
30.01.01.00.00.00.01.00.00.00.00.0
40.01.01.00.00.01.00.00.00.00.00.0
\n", 691 | "
" 692 | ], 693 | "text/plain": [ 694 | " x0_diesel x0_gas x1_front x1_rear ... x2_six x2_three x2_twelve x2_two\n", 695 | "0 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n", 696 | "1 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n", 697 | "2 0.0 1.0 1.0 0.0 ... 1.0 0.0 0.0 0.0\n", 698 | "3 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n", 699 | "4 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n", 700 | "\n", 701 | "[5 rows x 11 columns]" 702 | ] 703 | }, 704 | "metadata": { 705 | "tags": [] 706 | }, 707 | "execution_count": 132 708 | } 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "metadata": { 714 | "id": "uCV-EhaBqTxO", 715 | "colab_type": "code", 716 | "colab": {} 717 | }, 718 | "source": [ 719 | "" 720 | ], 721 | "execution_count": 0, 722 | "outputs": [] 723 | } 724 | ] 725 | } -------------------------------------------------------------------------------- /BreastTissue_Multiclass_Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "BreastTissue - Multiclass Classification.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "GCe-2DR1O39Z", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Multiclass Classification Model for Breast Tissue" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "nyjq0183Dltz", 41 | "colab_type": "text" 42 | }, 43 | "source": [ 44 | "Data Source: https://archive.ics.uci.edu/ml/datasets/Breast+Tissue\n", 45 | "\n", 46 | "Dua, D. and Graff, C. (2019). 
UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "metadata": { 52 | "id": "y3WuFJBmC8bU", 53 | "colab_type": "code", 54 | "outputId": "05f61a62-898c-4d6c-b3d1-bbc730a4e7b2", 55 | "colab": { 56 | "resources": { 57 | "http://localhost:8080/nbextensions/google.colab/files.js": { 58 | "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzI
D0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc
2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3Bhb
ignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5LgogICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=", 59 | "ok": true, 60 | "headers": [ 61 | [ 62 | "content-type", 63 | "application/javascript" 64 | ] 65 | ], 66 | "status": 200, 67 | "status_text": "" 68 | } 69 | }, 70 | "base_uri": "https://localhost:8080/", 71 | "height": 75 
72 | } 73 | }, 74 | "source": [ 75 | "from google.colab import files\n", 76 | "uploaded = files.upload()" 77 | ], 78 | "execution_count": 1, 79 | "outputs": [ 80 | { 81 | "output_type": "display_data", 82 | "data": { 83 | "text/html": [ 84 | "\n", 85 | " \n", 86 | " \n", 87 | " Upload widget is only available when the cell has been executed in the\n", 88 | " current browser session. Please rerun this cell to enable.\n", 89 | " \n", 90 | " " 91 | ], 92 | "text/plain": [ 93 | "" 94 | ] 95 | }, 96 | "metadata": { 97 | "tags": [] 98 | } 99 | }, 100 | { 101 | "output_type": "stream", 102 | "text": [ 103 | "Saving BreastTissue.csv to BreastTissue.csv\n" 104 | ], 105 | "name": "stdout" 106 | } 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "ktwxF3kc9haU", 113 | "colab_type": "code", 114 | "colab": { 115 | "base_uri": "https://localhost:8080/", 116 | "height": 203 117 | }, 118 | "outputId": "a91382eb-d766-4088-9296-7d8cab7fe6ac" 119 | }, 120 | "source": [ 121 | "# import pandas\n", 122 | "import pandas as pd\n", 123 | "\n", 124 | "# read the dataset\n", 125 | "df = pd.read_csv('BreastTissue.csv')\n", 126 | "\n", 127 | "df.head()" 128 | ], 129 | "execution_count": 2, 130 | "outputs": [ 131 | { 132 | "output_type": "execute_result", 133 | "data": { 134 | "text/html": [ 135 | "
\n", 136 | "\n", 149 | "\n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | "
Case #ClassI0PA500HFSDAAreaA/DAMax IPDRP
01car524.7940720.1874480.032114228.8002286843.59848129.91080360.204880220.737212556.828334
12car330.0000000.2268930.265290121.1542013163.23947226.10920269.71736199.084964400.225776
23car551.8792870.2324780.063530264.80493511888.39183044.89490377.793297253.785300656.769449
34car380.0000000.2408550.286234137.6401115402.17118039.24852488.758446105.198568493.701813
45car362.8312660.2007130.244346124.9125593290.46244626.34212769.389389103.866552424.796503
\n", 239 | "
" 240 | ], 241 | "text/plain": [ 242 | " Case # Class I0 ... Max IP DR P\n", 243 | "0 1 car 524.794072 ... 60.204880 220.737212 556.828334\n", 244 | "1 2 car 330.000000 ... 69.717361 99.084964 400.225776\n", 245 | "2 3 car 551.879287 ... 77.793297 253.785300 656.769449\n", 246 | "3 4 car 380.000000 ... 88.758446 105.198568 493.701813\n", 247 | "4 5 car 362.831266 ... 69.389389 103.866552 424.796503\n", 248 | "\n", 249 | "[5 rows x 11 columns]" 250 | ] 251 | }, 252 | "metadata": { 253 | "tags": [] 254 | }, 255 | "execution_count": 2 256 | } 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "metadata": { 262 | "id": "F4I9l_YmEdrn", 263 | "colab_type": "code", 264 | "outputId": "0367968e-2fa1-48a6-f7bc-870f25e12f77", 265 | "colab": { 266 | "base_uri": "https://localhost:8080/", 267 | "height": 331 268 | } 269 | }, 270 | "source": [ 271 | "print(df.shape)\n", 272 | "\n", 273 | "num_of_classes = len(df.Class.unique())\n", 274 | "print(num_of_classes)\n", 275 | "\n", 276 | "df.describe()" 277 | ], 278 | "execution_count": 3, 279 | "outputs": [ 280 | { 281 | "output_type": "stream", 282 | "text": [ 283 | "(106, 11)\n", 284 | "6\n" 285 | ], 286 | "name": "stdout" 287 | }, 288 | { 289 | "output_type": "execute_result", 290 | "data": { 291 | "text/html": [ 292 | "
\n", 293 | "\n", 306 | "\n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | "
Case #I0PA500HFSDAAreaA/DAMax IPDRP
count106.000000106.000000106.000000106.000000106.000000106.000000106.000000106.000000106.000000106.000000
mean53.500000784.2516180.1201330.114691190.5686427335.15516123.47378475.381258166.710575810.638127
std30.743563753.9500750.0685960.101347190.80144818580.31421223.35467281.345838181.309580763.019135
min1.000000103.0000000.012392-0.06632319.64767070.4262391.5957427.968783-9.257696124.978561
25%27.250000250.0000000.0674130.04398253.845470409.6471418.18032126.89377341.781258270.215238
50%53.500000384.9364890.1054180.086568120.7773032219.58116316.13365744.21604097.832557454.108153
75%79.7500001487.9896260.1696020.166504255.3348097615.20496830.95329483.671755232.9900701301.559438
max106.0000002800.0000000.3583160.4677481063.441427174480.476200164.071543436.099640977.5523672896.582483
\n", 429 | "
" 430 | ], 431 | "text/plain": [ 432 | " Case # I0 PA500 ... Max IP DR P\n", 433 | "count 106.000000 106.000000 106.000000 ... 106.000000 106.000000 106.000000\n", 434 | "mean 53.500000 784.251618 0.120133 ... 75.381258 166.710575 810.638127\n", 435 | "std 30.743563 753.950075 0.068596 ... 81.345838 181.309580 763.019135\n", 436 | "min 1.000000 103.000000 0.012392 ... 7.968783 -9.257696 124.978561\n", 437 | "25% 27.250000 250.000000 0.067413 ... 26.893773 41.781258 270.215238\n", 438 | "50% 53.500000 384.936489 0.105418 ... 44.216040 97.832557 454.108153\n", 439 | "75% 79.750000 1487.989626 0.169602 ... 83.671755 232.990070 1301.559438\n", 440 | "max 106.000000 2800.000000 0.358316 ... 436.099640 977.552367 2896.582483\n", 441 | "\n", 442 | "[8 rows x 10 columns]" 443 | ] 444 | }, 445 | "metadata": { 446 | "tags": [] 447 | }, 448 | "execution_count": 3 449 | } 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "metadata": { 455 | "id": "iBr_P8exEFLa", 456 | "colab_type": "code", 457 | "outputId": "b326530e-aa71-4caa-81c0-af98a1931c8f", 458 | "colab": { 459 | "base_uri": "https://localhost:8080/", 460 | "height": 53 461 | } 462 | }, 463 | "source": [ 464 | "# split train input and output data\n", 465 | "X = df.drop(axis=0, columns=['Class', 'Case #'])\n", 466 | "Y = df.Class\n", 467 | "\n", 468 | "#Print the shape of X and Y\n", 469 | "print(X.shape)\n", 470 | "print(Y.shape)" 471 | ], 472 | "execution_count": 4, 473 | "outputs": [ 474 | { 475 | "output_type": "stream", 476 | "text": [ 477 | "(106, 9)\n", 478 | "(106,)\n" 479 | ], 480 | "name": "stdout" 481 | } 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "QCV1u4tqFIqm", 488 | "colab_type": "code", 489 | "colab": {} 490 | }, 491 | "source": [ 492 | "from sklearn.model_selection import train_test_split\n", 493 | "\n", 494 | "# Split into training and test sets\n", 495 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)" 496 | ], 
497 | "execution_count": 0, 498 | "outputs": [] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "metadata": { 503 | "id": "3oMEHRVgF640", 504 | "colab_type": "code", 505 | "outputId": "9136f352-5b1d-40f4-9ba1-26c066f21f55", 506 | "colab": { 507 | "base_uri": "https://localhost:8080/", 508 | "height": 35 509 | } 510 | }, 511 | "source": [ 512 | "from xgboost import XGBClassifier\n", 513 | "from sklearn.metrics import roc_auc_score\n", 514 | "from sklearn import preprocessing\n", 515 | "\n", 516 | "xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric=\"auc\", num_class=num_of_classes)\n", 517 | "\n", 518 | "xgb.fit(X_train,y_train)\n", 519 | "\n", 520 | "pred = xgb.predict(X_test)\n", 521 | "val = xgb.predict(X_test)\n", 522 | "\n", 523 | "lb = preprocessing.LabelBinarizer()\n", 524 | "lb.fit(y_test)\n", 525 | "\n", 526 | "y_test_lb = lb.transform(y_test)\n", 527 | "val_lb = lb.transform(val)\n", 528 | "\n", 529 | "roc_auc_score(y_test_lb, val_lb, average='macro')" 530 | ], 531 | "execution_count": 9, 532 | "outputs": [ 533 | { 534 | "output_type": "execute_result", 535 | "data": { 536 | "text/plain": [ 537 | "0.835727969348659" 538 | ] 539 | }, 540 | "metadata": { 541 | "tags": [] 542 | }, 543 | "execution_count": 9 544 | } 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "81Qqa0BzrEvL", 551 | "colab_type": "code", 552 | "colab": { 553 | "base_uri": "https://localhost:8080/", 554 | "height": 203 555 | }, 556 | "outputId": "8dde22fc-0597-480e-d5d0-92a2559bda4c" 557 | }, 558 | "source": [ 559 | "output = pd.DataFrame()\n", 560 | "output['Expected Output'] = y_test\n", 561 | "output['Predicted Output'] = val\n", 562 | "output.tail()" 563 | ], 564 | "execution_count": 10, 565 | "outputs": [ 566 | { 567 | "output_type": "execute_result", 568 | "data": { 569 | "text/html": [ 570 | "
\n", 571 | "\n", 584 | "\n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | "
Expected OutputPredicted Output
36masfad
88adiadi
9carcar
53masfad
95adiadi
\n", 620 | "
" 621 | ], 622 | "text/plain": [ 623 | " Expected Output Predicted Output\n", 624 | "36 mas fad\n", 625 | "88 adi adi\n", 626 | "9 car car\n", 627 | "53 mas fad\n", 628 | "95 adi adi" 629 | ] 630 | }, 631 | "metadata": { 632 | "tags": [] 633 | }, 634 | "execution_count": 10 635 | } 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "metadata": { 641 | "id": "euLoXe4LrnnQ", 642 | "colab_type": "code", 643 | "colab": {} 644 | }, 645 | "source": [ 646 | "" 647 | ], 648 | "execution_count": 0, 649 | "outputs": [] 650 | } 651 | ] 652 | } --------------------------------------------------------------------------------