├── README.md
├── LICENSE
├── Feature_Preprocessing_for_Categorical_and_Ordinal_Features_The_Most_Important_Step.ipynb
└── BreastTissue_Multiclass_Classification.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # blog-posts
2 | This repo contains code from the blog posts that I publish.
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Sabina
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Feature_Preprocessing_for_Categorical_and_Ordinal_Features_The_Most_Important_Step.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Feature Preprocessing for Categorical and Ordinal Features- The Most Important Step.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "id": "view-in-github",
21 | "colab_type": "text"
22 | },
23 | "source": [
24 | "
"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {
30 | "id": "qIrbn_FgFkbX",
31 | "colab_type": "text"
32 | },
33 | "source": [
34 | "# Feature Preprocessing for Categorical and Ordinal Features- The Most Important Step"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {
40 | "id": "JPGkbECOFO2r",
41 | "colab_type": "text"
42 | },
43 | "source": [
44 | "\n",
45 | "\n",
46 | "Dataset obtained from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/Automobile\n",
47 | "\n",
48 | "\n",
49 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "metadata": {
55 | "id": "rBFE9nxxB2qz",
56 | "colab_type": "code",
57 | "colab": {}
58 | },
59 | "source": [
60 | "# pandas handles loading and tabular manipulation\n",
61 | "import pandas as pd\n",
62 | "\n",
63 | "# the file being imported carries no header row, so the 26 column names are supplied explicitly\n",
64 | "columns = ('symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base length width height '\n",
65 | "           'curb-weight engine-type num-of-cylinders engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price').split()\n",
66 | "\n",
67 | "# load the data; header=None keeps the first record as data instead of using it as column names\n",
68 | "df = pd.read_csv('https://query.data.world/s/sdhzwzf6n2ivkvgabugicfo6oxiais', names=columns, header=None)"
69 | ],
70 | "execution_count": 0,
71 | "outputs": []
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "fDW15YqYB6KJ",
77 | "colab_type": "code",
78 | "colab": {
79 | "base_uri": "https://localhost:8080/",
80 | "height": 309
81 | },
82 | "outputId": "6e98ee8c-71b5-4daa-b544-05b7bcd3043d"
83 | },
84 | "source": [
85 | "df.head()"
86 | ],
87 | "execution_count": 128,
88 | "outputs": [
89 | {
90 | "output_type": "execute_result",
91 | "data": {
92 | "text/html": [
93 | "
\n",
94 | "\n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " | \n",
111 | " symboling | \n",
112 | " normalized-losses | \n",
113 | " make | \n",
114 | " fuel-type | \n",
115 | " aspiration | \n",
116 | " num-of-doors | \n",
117 | " body-style | \n",
118 | " drive-wheels | \n",
119 | " engine-location | \n",
120 | " wheel-base | \n",
121 | " length | \n",
122 | " width | \n",
123 | " height | \n",
124 | " curb-weight | \n",
125 | " engine-type | \n",
126 | " num-of-cylinders | \n",
127 | " engine-size | \n",
128 | " fuel-system | \n",
129 | " bore | \n",
130 | " stroke | \n",
131 | " compression-ratio | \n",
132 | " horsepower | \n",
133 | " peak-rpm | \n",
134 | " city-mpg | \n",
135 | " highway-mpg | \n",
136 | " price | \n",
137 | "
\n",
138 | " \n",
139 | " \n",
140 | " \n",
141 | " | 0 | \n",
142 | " 3 | \n",
143 | " ? | \n",
144 | " alfa-romero | \n",
145 | " gas | \n",
146 | " std | \n",
147 | " two | \n",
148 | " convertible | \n",
149 | " rwd | \n",
150 | " front | \n",
151 | " 88.6 | \n",
152 | " 168.8 | \n",
153 | " 64.1 | \n",
154 | " 48.8 | \n",
155 | " 2548 | \n",
156 | " dohc | \n",
157 | " four | \n",
158 | " 130 | \n",
159 | " mpfi | \n",
160 | " 3.47 | \n",
161 | " 2.68 | \n",
162 | " 9.0 | \n",
163 | " 111 | \n",
164 | " 5000 | \n",
165 | " 21 | \n",
166 | " 27 | \n",
167 | " 13495 | \n",
168 | "
\n",
169 | " \n",
170 | " | 1 | \n",
171 | " 3 | \n",
172 | " ? | \n",
173 | " alfa-romero | \n",
174 | " gas | \n",
175 | " std | \n",
176 | " two | \n",
177 | " convertible | \n",
178 | " rwd | \n",
179 | " front | \n",
180 | " 88.6 | \n",
181 | " 168.8 | \n",
182 | " 64.1 | \n",
183 | " 48.8 | \n",
184 | " 2548 | \n",
185 | " dohc | \n",
186 | " four | \n",
187 | " 130 | \n",
188 | " mpfi | \n",
189 | " 3.47 | \n",
190 | " 2.68 | \n",
191 | " 9.0 | \n",
192 | " 111 | \n",
193 | " 5000 | \n",
194 | " 21 | \n",
195 | " 27 | \n",
196 | " 16500 | \n",
197 | "
\n",
198 | " \n",
199 | " | 2 | \n",
200 | " 1 | \n",
201 | " ? | \n",
202 | " alfa-romero | \n",
203 | " gas | \n",
204 | " std | \n",
205 | " two | \n",
206 | " hatchback | \n",
207 | " rwd | \n",
208 | " front | \n",
209 | " 94.5 | \n",
210 | " 171.2 | \n",
211 | " 65.5 | \n",
212 | " 52.4 | \n",
213 | " 2823 | \n",
214 | " ohcv | \n",
215 | " six | \n",
216 | " 152 | \n",
217 | " mpfi | \n",
218 | " 2.68 | \n",
219 | " 3.47 | \n",
220 | " 9.0 | \n",
221 | " 154 | \n",
222 | " 5000 | \n",
223 | " 19 | \n",
224 | " 26 | \n",
225 | " 16500 | \n",
226 | "
\n",
227 | " \n",
228 | " | 3 | \n",
229 | " 2 | \n",
230 | " 164 | \n",
231 | " audi | \n",
232 | " gas | \n",
233 | " std | \n",
234 | " four | \n",
235 | " sedan | \n",
236 | " fwd | \n",
237 | " front | \n",
238 | " 99.8 | \n",
239 | " 176.6 | \n",
240 | " 66.2 | \n",
241 | " 54.3 | \n",
242 | " 2337 | \n",
243 | " ohc | \n",
244 | " four | \n",
245 | " 109 | \n",
246 | " mpfi | \n",
247 | " 3.19 | \n",
248 | " 3.40 | \n",
249 | " 10.0 | \n",
250 | " 102 | \n",
251 | " 5500 | \n",
252 | " 24 | \n",
253 | " 30 | \n",
254 | " 13950 | \n",
255 | "
\n",
256 | " \n",
257 | " | 4 | \n",
258 | " 2 | \n",
259 | " 164 | \n",
260 | " audi | \n",
261 | " gas | \n",
262 | " std | \n",
263 | " four | \n",
264 | " sedan | \n",
265 | " 4wd | \n",
266 | " front | \n",
267 | " 99.4 | \n",
268 | " 176.6 | \n",
269 | " 66.4 | \n",
270 | " 54.3 | \n",
271 | " 2824 | \n",
272 | " ohc | \n",
273 | " five | \n",
274 | " 136 | \n",
275 | " mpfi | \n",
276 | " 3.19 | \n",
277 | " 3.40 | \n",
278 | " 8.0 | \n",
279 | " 115 | \n",
280 | " 5500 | \n",
281 | " 18 | \n",
282 | " 22 | \n",
283 | " 17450 | \n",
284 | "
\n",
285 | " \n",
286 | "
\n",
287 | "
"
288 | ],
289 | "text/plain": [
290 | " symboling normalized-losses make ... city-mpg highway-mpg price\n",
291 | "0 3 ? alfa-romero ... 21 27 13495\n",
292 | "1 3 ? alfa-romero ... 21 27 16500\n",
293 | "2 1 ? alfa-romero ... 19 26 16500\n",
294 | "3 2 164 audi ... 24 30 13950\n",
295 | "4 2 164 audi ... 18 22 17450\n",
296 | "\n",
297 | "[5 rows x 26 columns]"
298 | ]
299 | },
300 | "metadata": {
301 | "tags": []
302 | },
303 | "execution_count": 128
304 | }
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "metadata": {
310 | "id": "tmN3VGkPB8l_",
311 | "colab_type": "code",
312 | "colab": {
313 | "base_uri": "https://localhost:8080/",
314 | "height": 203
315 | },
316 | "outputId": "28c07121-59d3-492c-a0d3-a6d603bb30f0"
317 | },
318 | "source": [
319 | "# The dataset has many features, so the demonstration keeps only a few categorical ones.\n",
320 | "select_columns = ['fuel-type', 'engine-location', 'num-of-cylinders']\n",
321 | "\n",
322 | "df = df.loc[:, select_columns]\n",
323 | "df.head()"
324 | ],
325 | "execution_count": 129,
326 | "outputs": [
327 | {
328 | "output_type": "execute_result",
329 | "data": {
330 | "text/html": [
331 | "\n",
332 | "\n",
345 | "
\n",
346 | " \n",
347 | " \n",
348 | " | \n",
349 | " fuel-type | \n",
350 | " engine-location | \n",
351 | " num-of-cylinders | \n",
352 | "
\n",
353 | " \n",
354 | " \n",
355 | " \n",
356 | " | 0 | \n",
357 | " gas | \n",
358 | " front | \n",
359 | " four | \n",
360 | "
\n",
361 | " \n",
362 | " | 1 | \n",
363 | " gas | \n",
364 | " front | \n",
365 | " four | \n",
366 | "
\n",
367 | " \n",
368 | " | 2 | \n",
369 | " gas | \n",
370 | " front | \n",
371 | " six | \n",
372 | "
\n",
373 | " \n",
374 | " | 3 | \n",
375 | " gas | \n",
376 | " front | \n",
377 | " four | \n",
378 | "
\n",
379 | " \n",
380 | " | 4 | \n",
381 | " gas | \n",
382 | " front | \n",
383 | " five | \n",
384 | "
\n",
385 | " \n",
386 | "
\n",
387 | "
"
388 | ],
389 | "text/plain": [
390 | " fuel-type engine-location num-of-cylinders\n",
391 | "0 gas front four\n",
392 | "1 gas front four\n",
393 | "2 gas front six\n",
394 | "3 gas front four\n",
395 | "4 gas front five"
396 | ]
397 | },
398 | "metadata": {
399 | "tags": []
400 | },
401 | "execution_count": 129
402 | }
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "metadata": {
408 | "id": "QxFIqfzvF1LR",
409 | "colab_type": "code",
410 | "colab": {
411 | "base_uri": "https://localhost:8080/",
412 | "height": 71
413 | },
414 | "outputId": "42c57254-7936-48b0-d13e-6555834833eb"
415 | },
416 | "source": [
417 | "# the three categorical features under study\n",
418 | "features_to_inspect = ['fuel-type', 'engine-location', 'num-of-cylinders']\n",
419 | "\n",
420 | "# print the unique values of every feature,\n",
421 | "# one array per feature, in the order listed above\n",
422 | "for feature in features_to_inspect:\n",
423 | "    unique_values = df[feature].unique()\n",
424 | "    print(unique_values)"
425 | ],
426 | "execution_count": 130,
427 | "outputs": [
428 | {
429 | "output_type": "stream",
430 | "text": [
431 | "['gas' 'diesel']\n",
432 | "['front' 'rear']\n",
433 | "['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']\n"
434 | ],
435 | "name": "stdout"
436 | }
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "metadata": {
442 | "id": "peck2hzVF-hK",
443 | "colab_type": "code",
444 | "colab": {
445 | "base_uri": "https://localhost:8080/",
446 | "height": 203
447 | },
448 | "outputId": "ff55a7cc-91bd-49ca-fe8e-f346463b6bba"
449 | },
450 | "source": [
451 | "# LabelEncoder turns each distinct label into an integer code\n",
452 | "from sklearn.preprocessing import LabelEncoder\n",
453 | "\n",
454 | "# one encoder object, re-fitted on every feature in turn\n",
455 | "label_encoder = LabelEncoder()\n",
456 | "\n",
457 | "# original categorical features; each one is encoded into a new\n",
458 | "# 'enc-' prefixed column and then removed from the copy\n",
459 | "columns_to_drop = ['fuel-type','engine-location','num-of-cylinders']\n",
460 | "\n",
461 | "# encode on a copy so that df keeps its original columns\n",
462 | "df_le = df.copy()\n",
463 | "for feature in columns_to_drop:\n",
464 | "    df_le['enc-' + feature] = label_encoder.fit_transform(df_le[feature])\n",
465 | "\n",
466 | "# leave only the encoded columns\n",
467 | "df_le = df_le.drop(columns=columns_to_drop)\n",
468 | "\n",
469 | "df_le.head()"
470 | ],
471 | "execution_count": 131,
472 | "outputs": [
473 | {
474 | "output_type": "execute_result",
475 | "data": {
476 | "text/html": [
477 | "\n",
478 | "\n",
491 | "
\n",
492 | " \n",
493 | " \n",
494 | " | \n",
495 | " enc-fuel-type | \n",
496 | " enc-engine-location | \n",
497 | " enc-num-of-cylinders | \n",
498 | "
\n",
499 | " \n",
500 | " \n",
501 | " \n",
502 | " | 0 | \n",
503 | " 1 | \n",
504 | " 0 | \n",
505 | " 2 | \n",
506 | "
\n",
507 | " \n",
508 | " | 1 | \n",
509 | " 1 | \n",
510 | " 0 | \n",
511 | " 2 | \n",
512 | "
\n",
513 | " \n",
514 | " | 2 | \n",
515 | " 1 | \n",
516 | " 0 | \n",
517 | " 3 | \n",
518 | "
\n",
519 | " \n",
520 | " | 3 | \n",
521 | " 1 | \n",
522 | " 0 | \n",
523 | " 2 | \n",
524 | "
\n",
525 | " \n",
526 | " | 4 | \n",
527 | " 1 | \n",
528 | " 0 | \n",
529 | " 1 | \n",
530 | "
\n",
531 | " \n",
532 | "
\n",
533 | "
"
534 | ],
535 | "text/plain": [
536 | " enc-fuel-type enc-engine-location enc-num-of-cylinders\n",
537 | "0 1 0 2\n",
538 | "1 1 0 2\n",
539 | "2 1 0 3\n",
540 | "3 1 0 2\n",
541 | "4 1 0 1"
542 | ]
543 | },
544 | "metadata": {
545 | "tags": []
546 | },
547 | "execution_count": 131
548 | }
549 | ]
550 | },
551 | {
552 | "cell_type": "code",
553 | "metadata": {
554 | "id": "yeUMeFhqKugN",
555 | "colab_type": "code",
556 | "colab": {
557 | "base_uri": "https://localhost:8080/",
558 | "height": 203
559 | },
560 | "outputId": "435e70c7-6fd2-44c8-d3c9-7ac02f1e3d5d"
561 | },
562 | "source": [
563 | "# import OneHotEncoder\n",
564 | "from sklearn.preprocessing import OneHotEncoder\n",
565 | "\n",
566 | "# create one hot encoder\n",
567 | "one_hot_encoder = OneHotEncoder()\n",
568 | "\n",
569 | "# create a copy of the dataset\n",
570 | "df_ohe = df.copy()\n",
571 | "\n",
572 | "# fit one hot encoder (learns the categories of every column)\n",
573 | "one_hot_encoder = one_hot_encoder.fit(df_ohe)\n",
574 | "\n",
575 | "# transform dataset; columns are named <feature>_<category> (get_feature_names() was removed in scikit-learn 1.2)\n",
576 | "ohelabels = one_hot_encoder.transform(df_ohe).toarray()\n",
577 | "df_ohe = pd.DataFrame(ohelabels, columns=one_hot_encoder.get_feature_names_out())\n",
578 | "\n",
579 | "df_ohe.head()\n"
580 | ],
581 | "execution_count": 132,
582 | "outputs": [
583 | {
584 | "output_type": "execute_result",
585 | "data": {
586 | "text/html": [
587 | "\n",
588 | "\n",
601 | "
\n",
602 | " \n",
603 | " \n",
604 | " | \n",
605 | " x0_diesel | \n",
606 | " x0_gas | \n",
607 | " x1_front | \n",
608 | " x1_rear | \n",
609 | " x2_eight | \n",
610 | " x2_five | \n",
611 | " x2_four | \n",
612 | " x2_six | \n",
613 | " x2_three | \n",
614 | " x2_twelve | \n",
615 | " x2_two | \n",
616 | "
\n",
617 | " \n",
618 | " \n",
619 | " \n",
620 | " | 0 | \n",
621 | " 0.0 | \n",
622 | " 1.0 | \n",
623 | " 1.0 | \n",
624 | " 0.0 | \n",
625 | " 0.0 | \n",
626 | " 0.0 | \n",
627 | " 1.0 | \n",
628 | " 0.0 | \n",
629 | " 0.0 | \n",
630 | " 0.0 | \n",
631 | " 0.0 | \n",
632 | "
\n",
633 | " \n",
634 | " | 1 | \n",
635 | " 0.0 | \n",
636 | " 1.0 | \n",
637 | " 1.0 | \n",
638 | " 0.0 | \n",
639 | " 0.0 | \n",
640 | " 0.0 | \n",
641 | " 1.0 | \n",
642 | " 0.0 | \n",
643 | " 0.0 | \n",
644 | " 0.0 | \n",
645 | " 0.0 | \n",
646 | "
\n",
647 | " \n",
648 | " | 2 | \n",
649 | " 0.0 | \n",
650 | " 1.0 | \n",
651 | " 1.0 | \n",
652 | " 0.0 | \n",
653 | " 0.0 | \n",
654 | " 0.0 | \n",
655 | " 0.0 | \n",
656 | " 1.0 | \n",
657 | " 0.0 | \n",
658 | " 0.0 | \n",
659 | " 0.0 | \n",
660 | "
\n",
661 | " \n",
662 | " | 3 | \n",
663 | " 0.0 | \n",
664 | " 1.0 | \n",
665 | " 1.0 | \n",
666 | " 0.0 | \n",
667 | " 0.0 | \n",
668 | " 0.0 | \n",
669 | " 1.0 | \n",
670 | " 0.0 | \n",
671 | " 0.0 | \n",
672 | " 0.0 | \n",
673 | " 0.0 | \n",
674 | "
\n",
675 | " \n",
676 | " | 4 | \n",
677 | " 0.0 | \n",
678 | " 1.0 | \n",
679 | " 1.0 | \n",
680 | " 0.0 | \n",
681 | " 0.0 | \n",
682 | " 1.0 | \n",
683 | " 0.0 | \n",
684 | " 0.0 | \n",
685 | " 0.0 | \n",
686 | " 0.0 | \n",
687 | " 0.0 | \n",
688 | "
\n",
689 | " \n",
690 | "
\n",
691 | "
"
692 | ],
693 | "text/plain": [
694 | " x0_diesel x0_gas x1_front x1_rear ... x2_six x2_three x2_twelve x2_two\n",
695 | "0 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n",
696 | "1 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n",
697 | "2 0.0 1.0 1.0 0.0 ... 1.0 0.0 0.0 0.0\n",
698 | "3 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n",
699 | "4 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0\n",
700 | "\n",
701 | "[5 rows x 11 columns]"
702 | ]
703 | },
704 | "metadata": {
705 | "tags": []
706 | },
707 | "execution_count": 132
708 | }
709 | ]
710 | },
711 | {
712 | "cell_type": "code",
713 | "metadata": {
714 | "id": "uCV-EhaBqTxO",
715 | "colab_type": "code",
716 | "colab": {}
717 | },
718 | "source": [
719 | ""
720 | ],
721 | "execution_count": 0,
722 | "outputs": []
723 | }
724 | ]
725 | }
--------------------------------------------------------------------------------
/BreastTissue_Multiclass_Classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "BreastTissue - Multiclass Classification.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "id": "view-in-github",
21 | "colab_type": "text"
22 | },
23 | "source": [
24 | "
"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {
30 | "id": "GCe-2DR1O39Z",
31 | "colab_type": "text"
32 | },
33 | "source": [
34 | "# Multiclass Classification Model for Breast Tissue"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {
40 | "id": "nyjq0183Dltz",
41 | "colab_type": "text"
42 | },
43 | "source": [
44 | "Data Source: https://archive.ics.uci.edu/ml/datasets/Breast+Tissue\n",
45 | "\n",
46 | "Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "metadata": {
52 | "id": "y3WuFJBmC8bU",
53 | "colab_type": "code",
54 | "outputId": "05f61a62-898c-4d6c-b3d1-bbc730a4e7b2",
55 | "colab": {
56 | "resources": {
57 | "http://localhost:8080/nbextensions/google.colab/files.js": {
58 | "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW
5lcmF0b3IgKG5vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW
1lbnQuYXBwZW5kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIG
JlIHJlYWR5LgogICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogICAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=",
59 | "ok": true,
60 | "headers": [
61 | [
62 | "content-type",
63 | "application/javascript"
64 | ]
65 | ],
66 | "status": 200,
67 | "status_text": ""
68 | }
69 | },
70 | "base_uri": "https://localhost:8080/",
71 | "height": 75
72 | }
73 | },
74 | "source": [
75 | "from google.colab import files\n",
76 | "uploaded = files.upload()"
77 | ],
78 | "execution_count": 1,
79 | "outputs": [
80 | {
81 | "output_type": "display_data",
82 | "data": {
83 | "text/html": [
84 | "\n",
85 | " \n",
86 | " \n",
90 | " "
91 | ],
92 | "text/plain": [
93 | ""
94 | ]
95 | },
96 | "metadata": {
97 | "tags": []
98 | }
99 | },
100 | {
101 | "output_type": "stream",
102 | "text": [
103 | "Saving BreastTissue.csv to BreastTissue.csv\n"
104 | ],
105 | "name": "stdout"
106 | }
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "metadata": {
112 | "id": "ktwxF3kc9haU",
113 | "colab_type": "code",
114 | "colab": {
115 | "base_uri": "https://localhost:8080/",
116 | "height": 203
117 | },
118 | "outputId": "a91382eb-d766-4088-9296-7d8cab7fe6ac"
119 | },
120 | "source": [
121 | "# import pandas\n",
122 | "import pandas as pd\n",
123 | "\n",
124 | "# read the dataset\n",
125 | "df = pd.read_csv('BreastTissue.csv')\n",
126 | "\n",
127 | "df.head()"
128 | ],
129 | "execution_count": 2,
130 | "outputs": [
131 | {
132 | "output_type": "execute_result",
133 | "data": {
134 | "text/html": [
135 | "\n",
136 | "\n",
149 | "
\n",
150 | " \n",
151 | " \n",
152 | " | \n",
153 | " Case # | \n",
154 | " Class | \n",
155 | " I0 | \n",
156 | " PA500 | \n",
157 | " HFS | \n",
158 | " DA | \n",
159 | " Area | \n",
160 | " A/DA | \n",
161 | " Max IP | \n",
162 | " DR | \n",
163 | " P | \n",
164 | "
\n",
165 | " \n",
166 | " \n",
167 | " \n",
168 | " | 0 | \n",
169 | " 1 | \n",
170 | " car | \n",
171 | " 524.794072 | \n",
172 | " 0.187448 | \n",
173 | " 0.032114 | \n",
174 | " 228.800228 | \n",
175 | " 6843.598481 | \n",
176 | " 29.910803 | \n",
177 | " 60.204880 | \n",
178 | " 220.737212 | \n",
179 | " 556.828334 | \n",
180 | "
\n",
181 | " \n",
182 | " | 1 | \n",
183 | " 2 | \n",
184 | " car | \n",
185 | " 330.000000 | \n",
186 | " 0.226893 | \n",
187 | " 0.265290 | \n",
188 | " 121.154201 | \n",
189 | " 3163.239472 | \n",
190 | " 26.109202 | \n",
191 | " 69.717361 | \n",
192 | " 99.084964 | \n",
193 | " 400.225776 | \n",
194 | "
\n",
195 | " \n",
196 | " | 2 | \n",
197 | " 3 | \n",
198 | " car | \n",
199 | " 551.879287 | \n",
200 | " 0.232478 | \n",
201 | " 0.063530 | \n",
202 | " 264.804935 | \n",
203 | " 11888.391830 | \n",
204 | " 44.894903 | \n",
205 | " 77.793297 | \n",
206 | " 253.785300 | \n",
207 | " 656.769449 | \n",
208 | "
\n",
209 | " \n",
210 | " | 3 | \n",
211 | " 4 | \n",
212 | " car | \n",
213 | " 380.000000 | \n",
214 | " 0.240855 | \n",
215 | " 0.286234 | \n",
216 | " 137.640111 | \n",
217 | " 5402.171180 | \n",
218 | " 39.248524 | \n",
219 | " 88.758446 | \n",
220 | " 105.198568 | \n",
221 | " 493.701813 | \n",
222 | "
\n",
223 | " \n",
224 | " | 4 | \n",
225 | " 5 | \n",
226 | " car | \n",
227 | " 362.831266 | \n",
228 | " 0.200713 | \n",
229 | " 0.244346 | \n",
230 | " 124.912559 | \n",
231 | " 3290.462446 | \n",
232 | " 26.342127 | \n",
233 | " 69.389389 | \n",
234 | " 103.866552 | \n",
235 | " 424.796503 | \n",
236 | "
\n",
237 | " \n",
238 | "
\n",
239 | "
"
240 | ],
241 | "text/plain": [
242 | " Case # Class I0 ... Max IP DR P\n",
243 | "0 1 car 524.794072 ... 60.204880 220.737212 556.828334\n",
244 | "1 2 car 330.000000 ... 69.717361 99.084964 400.225776\n",
245 | "2 3 car 551.879287 ... 77.793297 253.785300 656.769449\n",
246 | "3 4 car 380.000000 ... 88.758446 105.198568 493.701813\n",
247 | "4 5 car 362.831266 ... 69.389389 103.866552 424.796503\n",
248 | "\n",
249 | "[5 rows x 11 columns]"
250 | ]
251 | },
252 | "metadata": {
253 | "tags": []
254 | },
255 | "execution_count": 2
256 | }
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "metadata": {
262 | "id": "F4I9l_YmEdrn",
263 | "colab_type": "code",
264 | "outputId": "0367968e-2fa1-48a6-f7bc-870f25e12f77",
265 | "colab": {
266 | "base_uri": "https://localhost:8080/",
267 | "height": 331
268 | }
269 | },
270 | "source": [
271 | "# dimensions of the dataset as (rows, columns)\n",
272 | "print(df.shape)\n",
273 | "# number of distinct values in the Class column\n",
274 | "num_of_classes = len(df['Class'].unique())\n",
275 | "print(num_of_classes)\n",
276 | "df.describe()"
277 | ],
278 | "execution_count": 3,
279 | "outputs": [
280 | {
281 | "output_type": "stream",
282 | "text": [
283 | "(106, 11)\n",
284 | "6\n"
285 | ],
286 | "name": "stdout"
287 | },
288 | {
289 | "output_type": "execute_result",
290 | "data": {
291 | "text/html": [
292 | "\n",
293 | "\n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " | \n",
310 | " Case # | \n",
311 | " I0 | \n",
312 | " PA500 | \n",
313 | " HFS | \n",
314 | " DA | \n",
315 | " Area | \n",
316 | " A/DA | \n",
317 | " Max IP | \n",
318 | " DR | \n",
319 | " P | \n",
320 | "
\n",
321 | " \n",
322 | " \n",
323 | " \n",
324 | " | count | \n",
325 | " 106.000000 | \n",
326 | " 106.000000 | \n",
327 | " 106.000000 | \n",
328 | " 106.000000 | \n",
329 | " 106.000000 | \n",
330 | " 106.000000 | \n",
331 | " 106.000000 | \n",
332 | " 106.000000 | \n",
333 | " 106.000000 | \n",
334 | " 106.000000 | \n",
335 | "
\n",
336 | " \n",
337 | " | mean | \n",
338 | " 53.500000 | \n",
339 | " 784.251618 | \n",
340 | " 0.120133 | \n",
341 | " 0.114691 | \n",
342 | " 190.568642 | \n",
343 | " 7335.155161 | \n",
344 | " 23.473784 | \n",
345 | " 75.381258 | \n",
346 | " 166.710575 | \n",
347 | " 810.638127 | \n",
348 | "
\n",
349 | " \n",
350 | " | std | \n",
351 | " 30.743563 | \n",
352 | " 753.950075 | \n",
353 | " 0.068596 | \n",
354 | " 0.101347 | \n",
355 | " 190.801448 | \n",
356 | " 18580.314212 | \n",
357 | " 23.354672 | \n",
358 | " 81.345838 | \n",
359 | " 181.309580 | \n",
360 | " 763.019135 | \n",
361 | "
\n",
362 | " \n",
363 | " | min | \n",
364 | " 1.000000 | \n",
365 | " 103.000000 | \n",
366 | " 0.012392 | \n",
367 | " -0.066323 | \n",
368 | " 19.647670 | \n",
369 | " 70.426239 | \n",
370 | " 1.595742 | \n",
371 | " 7.968783 | \n",
372 | " -9.257696 | \n",
373 | " 124.978561 | \n",
374 | "
\n",
375 | " \n",
376 | " | 25% | \n",
377 | " 27.250000 | \n",
378 | " 250.000000 | \n",
379 | " 0.067413 | \n",
380 | " 0.043982 | \n",
381 | " 53.845470 | \n",
382 | " 409.647141 | \n",
383 | " 8.180321 | \n",
384 | " 26.893773 | \n",
385 | " 41.781258 | \n",
386 | " 270.215238 | \n",
387 | "
\n",
388 | " \n",
389 | " | 50% | \n",
390 | " 53.500000 | \n",
391 | " 384.936489 | \n",
392 | " 0.105418 | \n",
393 | " 0.086568 | \n",
394 | " 120.777303 | \n",
395 | " 2219.581163 | \n",
396 | " 16.133657 | \n",
397 | " 44.216040 | \n",
398 | " 97.832557 | \n",
399 | " 454.108153 | \n",
400 | "
\n",
401 | " \n",
402 | " | 75% | \n",
403 | " 79.750000 | \n",
404 | " 1487.989626 | \n",
405 | " 0.169602 | \n",
406 | " 0.166504 | \n",
407 | " 255.334809 | \n",
408 | " 7615.204968 | \n",
409 | " 30.953294 | \n",
410 | " 83.671755 | \n",
411 | " 232.990070 | \n",
412 | " 1301.559438 | \n",
413 | "
\n",
414 | " \n",
415 | " | max | \n",
416 | " 106.000000 | \n",
417 | " 2800.000000 | \n",
418 | " 0.358316 | \n",
419 | " 0.467748 | \n",
420 | " 1063.441427 | \n",
421 | " 174480.476200 | \n",
422 | " 164.071543 | \n",
423 | " 436.099640 | \n",
424 | " 977.552367 | \n",
425 | " 2896.582483 | \n",
426 | "
\n",
427 | " \n",
428 | "
\n",
429 | "
"
430 | ],
431 | "text/plain": [
432 | " Case # I0 PA500 ... Max IP DR P\n",
433 | "count 106.000000 106.000000 106.000000 ... 106.000000 106.000000 106.000000\n",
434 | "mean 53.500000 784.251618 0.120133 ... 75.381258 166.710575 810.638127\n",
435 | "std 30.743563 753.950075 0.068596 ... 81.345838 181.309580 763.019135\n",
436 | "min 1.000000 103.000000 0.012392 ... 7.968783 -9.257696 124.978561\n",
437 | "25% 27.250000 250.000000 0.067413 ... 26.893773 41.781258 270.215238\n",
438 | "50% 53.500000 384.936489 0.105418 ... 44.216040 97.832557 454.108153\n",
439 | "75% 79.750000 1487.989626 0.169602 ... 83.671755 232.990070 1301.559438\n",
440 | "max 106.000000 2800.000000 0.358316 ... 436.099640 977.552367 2896.582483\n",
441 | "\n",
442 | "[8 rows x 10 columns]"
443 | ]
444 | },
445 | "metadata": {
446 | "tags": []
447 | },
448 | "execution_count": 3
449 | }
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "metadata": {
455 | "id": "iBr_P8exEFLa",
456 | "colab_type": "code",
457 | "outputId": "b326530e-aa71-4caa-81c0-af98a1931c8f",
458 | "colab": {
459 | "base_uri": "https://localhost:8080/",
460 | "height": 53
461 | }
462 | },
463 | "source": [
464 | "# Separate features from the target; 'Case #' looks like a sequential row id, so it is dropped too\n",
465 | "X = df.drop(columns=['Class', 'Case #'])\n",
466 | "Y = df['Class']\n",
467 | "\n",
468 | "# Print the shape of X and Y\n",
469 | "print(X.shape)\n",
470 | "print(Y.shape)"
471 | ],
472 | "execution_count": 4,
473 | "outputs": [
474 | {
475 | "output_type": "stream",
476 | "text": [
477 | "(106, 9)\n",
478 | "(106,)\n"
479 | ],
480 | "name": "stdout"
481 | }
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "metadata": {
487 | "id": "QCV1u4tqFIqm",
488 | "colab_type": "code",
489 | "colab": {}
490 | },
491 | "source": [
492 | "from sklearn.model_selection import train_test_split\n",
493 | "\n",
494 | "# Reserve 33% of the rows for testing; the fixed seed makes the split repeatable\n",
495 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.33)"
496 | ],
497 | "execution_count": 0,
498 | "outputs": []
499 | },
500 | {
501 | "cell_type": "code",
502 | "metadata": {
503 | "id": "3oMEHRVgF640",
504 | "colab_type": "code",
505 | "outputId": "9136f352-5b1d-40f4-9ba1-26c066f21f55",
506 | "colab": {
507 | "base_uri": "https://localhost:8080/",
508 | "height": 35
509 | }
510 | },
511 | "source": [
512 | "from xgboost import XGBClassifier\n",
513 | "from sklearn.metrics import roc_auc_score\n",
514 | "from sklearn import preprocessing\n",
515 | "\n",
516 | "xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric=\"auc\", num_class=num_of_classes)\n",
517 | "xgb.fit(X_train, y_train)\n",
518 | "\n",
519 | "# Hard labels feed the comparison table in the next cell; ROC AUC ranks scores, so also take class probabilities\n",
520 | "val = xgb.predict(X_test)\n",
521 | "val_proba = xgb.predict_proba(X_test)\n",
522 | "\n",
523 | "# One-hot encode the true labels (one column per class that occurs in y_test)\n",
524 | "lb = preprocessing.LabelBinarizer()\n",
525 | "y_test_lb = lb.fit_transform(y_test)\n",
526 | "# Pick the probability columns matching lb's classes (predict_proba columns follow xgb.classes_ order)\n",
527 | "proba_cols = [list(xgb.classes_).index(label) for label in lb.classes_]\n",
528 | "\n",
529 | "roc_auc_score(y_test_lb, val_proba[:, proba_cols], average='macro')"
530 | ],
531 | "execution_count": 9,
532 | "outputs": [
533 | {
534 | "output_type": "execute_result",
535 | "data": {
536 | "text/plain": [
537 | "0.835727969348659"
538 | ]
539 | },
540 | "metadata": {
541 | "tags": []
542 | },
543 | "execution_count": 9
544 | }
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "metadata": {
550 | "id": "81Qqa0BzrEvL",
551 | "colab_type": "code",
552 | "colab": {
553 | "base_uri": "https://localhost:8080/",
554 | "height": 203
555 | },
556 | "outputId": "8dde22fc-0597-480e-d5d0-92a2559bda4c"
557 | },
558 | "source": [
559 | "# Side-by-side view of true vs. predicted labels for the last few test rows\n",
560 | "output = pd.DataFrame({'Expected Output': y_test, 'Predicted Output': val})\n",
561 | "\n",
562 | "output.tail()"
563 | ],
564 | "execution_count": 10,
565 | "outputs": [
566 | {
567 | "output_type": "execute_result",
568 | "data": {
569 | "text/html": [
570 | "\n",
571 | "\n",
584 | "
\n",
585 | " \n",
586 | " \n",
587 | " | \n",
588 | " Expected Output | \n",
589 | " Predicted Output | \n",
590 | "
\n",
591 | " \n",
592 | " \n",
593 | " \n",
594 | " | 36 | \n",
595 | " mas | \n",
596 | " fad | \n",
597 | "
\n",
598 | " \n",
599 | " | 88 | \n",
600 | " adi | \n",
601 | " adi | \n",
602 | "
\n",
603 | " \n",
604 | " | 9 | \n",
605 | " car | \n",
606 | " car | \n",
607 | "
\n",
608 | " \n",
609 | " | 53 | \n",
610 | " mas | \n",
611 | " fad | \n",
612 | "
\n",
613 | " \n",
614 | " | 95 | \n",
615 | " adi | \n",
616 | " adi | \n",
617 | "
\n",
618 | " \n",
619 | "
\n",
620 | "
"
621 | ],
622 | "text/plain": [
623 | " Expected Output Predicted Output\n",
624 | "36 mas fad\n",
625 | "88 adi adi\n",
626 | "9 car car\n",
627 | "53 mas fad\n",
628 | "95 adi adi"
629 | ]
630 | },
631 | "metadata": {
632 | "tags": []
633 | },
634 | "execution_count": 10
635 | }
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "metadata": {
641 | "id": "euLoXe4LrnnQ",
642 | "colab_type": "code",
643 | "colab": {}
644 | },
645 | "source": [
646 | ""
647 | ],
648 | "execution_count": 0,
649 | "outputs": []
650 | }
651 | ]
652 | }
--------------------------------------------------------------------------------