├── IQR method for handling outliers.ipynb ├── Iterative Imputer demo.ipynb ├── README.md ├── Winsorizing outliers.ipynb ├── column transformer part 1.ipynb ├── column transformer part 2.ipynb ├── confusion matrix metrics + ROC & PR curves.ipynb ├── cross validation.ipynb ├── data ├── churn modelling.csv └── income_evaluation.csv ├── exponential and log transformer.ipynb ├── grid search.ipynb ├── knn imputer.ipynb ├── label encoder.ipynb ├── maxabs scaler.ipynb ├── min max scaler.ipynb ├── missing indicator.ipynb ├── normalizer.ipynb ├── one hot encoder.ipynb ├── ordinal encoder.ipynb ├── pipeline.ipynb ├── power transformer.ipynb ├── ppts ├── CM metrics 1-4.pptx ├── CM metrics 5-6.pptx ├── Feature scaling.pptx ├── Grid Search.pptx ├── KNN Imputer Algorithm.pptx ├── README.md ├── bias variance.pptx ├── confusion matrix.pptx ├── cross validation.pptx ├── mcc.pptx ├── mice.pptx ├── outlier.pptx └── roc pr auc.pptx ├── quantile transformer.ipynb ├── robust scaler.ipynb ├── simple imputer.ipynb ├── standard scaler.ipynb ├── train_test_split.ipynb ├── why NEVER use pd.get_dummies.ipynb └── z score for handling outliers.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # machineLearning 2 | A repo for all the relevant code notebooks and datasets used in my Machine Learning tutorial videos on YouTube, accessible here: https://www.youtube.com/playlist?list=PLlg4M31xJeYa7XcJZWypot8l7R-0E65Ls 3 | -------------------------------------------------------------------------------- /knn imputer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.model_selection import train_test_split" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | 
"outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 150 | "
" 151 | ], 152 | "text/plain": [ 153 | " age workclass fnlwgt education education-num \\\n", 154 | "0 39 State-gov 77516 Bachelors 13 \n", 155 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", 156 | "2 38 Private 215646 HS-grad 9 \n", 157 | "3 53 Private 234721 11th 7 \n", 158 | "4 28 Private 338409 Bachelors 13 \n", 159 | "\n", 160 | " marital-status occupation relationship race sex \\\n", 161 | "0 Never-married Adm-clerical Not-in-family White Male \n", 162 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 163 | "2 Divorced Handlers-cleaners Not-in-family White Male \n", 164 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", 165 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n", 166 | "\n", 167 | " capital-gain capital-loss hours-per-week native-country income \n", 168 | "0 2174 0 40 United-States <=50K \n", 169 | "1 0 0 13 United-States <=50K \n", 170 | "2 0 0 40 United-States <=50K \n", 171 | "3 0 0 40 United-States <=50K \n", 172 | "4 0 0 40 Cuba <=50K " 173 | ] 174 | }, 175 | "execution_count": 2, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')\n", 182 | "df.head()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',\n", 194 | " ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',\n", 195 | " ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',\n", 196 | " ' Preschool', ' 12th'], dtype=object)" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df[' education'].unique()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 3, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 
| "text/plain": [ 216 | "age 0\n", 217 | " workclass 1836\n", 218 | " fnlwgt 0\n", 219 | " education 0\n", 220 | " education-num 0\n", 221 | " marital-status 0\n", 222 | " occupation 1843\n", 223 | " relationship 0\n", 224 | " race 0\n", 225 | " sex 0\n", 226 | " capital-gain 0\n", 227 | " capital-loss 0\n", 228 | " hours-per-week 0\n", 229 | " native-country 583\n", 230 | " income 0\n", 231 | "dtype: int64" 232 | ] 233 | }, 234 | "execution_count": 3, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "df.isna().sum()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 4, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# hours per week\n", 250 | "np.random.seed(seed=0)\n", 251 | "h = np.random.choice(a=df.index, replace=False, size=20)\n", 252 | "df.loc[h, ' hours-per-week'] = np.nan" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 5, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# age\n", 262 | "np.random.seed(seed=10)\n", 263 | "a = np.random.choice(a=df.index, replace=False, size=28)\n", 264 | "df.loc[a, 'age'] = np.nan" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 6, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "age 28\n", 276 | " workclass 1836\n", 277 | " fnlwgt 0\n", 278 | " education 0\n", 279 | " education-num 0\n", 280 | " marital-status 0\n", 281 | " occupation 1843\n", 282 | " relationship 0\n", 283 | " race 0\n", 284 | " sex 0\n", 285 | " capital-gain 0\n", 286 | " capital-loss 0\n", 287 | " hours-per-week 20\n", 288 | " native-country 583\n", 289 | " income 0\n", 290 | "dtype: int64" 291 | ] 292 | }, 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "df.isna().sum()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 7, 305 
| "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1), df[' income'],\n", 309 | " test_size=0.2, random_state=5)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 10, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "from sklearn.impute import KNNImputer" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 21, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "knn = KNNImputer(n_neighbors=5, add_indicator=True)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 14, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "dtype('float64')" 339 | ] 340 | }, 341 | "execution_count": 14, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "X_train['age'].dtypes" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 15, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',\n", 359 | " ' marital-status', ' occupation', ' relationship', ' race', ' sex',\n", 360 | " ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country'],\n", 361 | " dtype='object')" 362 | ] 363 | }, 364 | "execution_count": 15, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "X_train.columns" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 17, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "num = [col for col in X_train.columns if X_train[col].dtypes != 'O']" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 19, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/html": [ 390 | "
\n", 391 | "\n", 404 | "\n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | "
agefnlwgteducation-numcapital-gaincapital-losshours-per-week
2142555.023821690040.0
2870724.030646090040.0
445548.021314040040.0
223136.0127306130040.0
1886453.0103586130055.0
\n", 464 | "
" 465 | ], 466 | "text/plain": [ 467 | " age fnlwgt education-num capital-gain capital-loss \\\n", 468 | "21425 55.0 238216 9 0 0 \n", 469 | "28707 24.0 306460 9 0 0 \n", 470 | "4455 48.0 213140 4 0 0 \n", 471 | "2231 36.0 127306 13 0 0 \n", 472 | "18864 53.0 103586 13 0 0 \n", 473 | "\n", 474 | " hours-per-week \n", 475 | "21425 40.0 \n", 476 | "28707 40.0 \n", 477 | "4455 40.0 \n", 478 | "2231 40.0 \n", 479 | "18864 55.0 " 480 | ] 481 | }, 482 | "execution_count": 19, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "X_train[num].head()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 22, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "data": { 498 | "text/plain": [ 499 | "KNNImputer(add_indicator=True)" 500 | ] 501 | }, 502 | "execution_count": 22, 503 | "metadata": {}, 504 | "output_type": "execute_result" 505 | } 506 | ], 507 | "source": [ 508 | "knn.fit(X_train[num])" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 23, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "array([[5.50000e+01, 2.38216e+05, 9.00000e+00, ..., 4.00000e+01,\n", 520 | " 0.00000e+00, 0.00000e+00],\n", 521 | " [2.40000e+01, 3.06460e+05, 9.00000e+00, ..., 4.00000e+01,\n", 522 | " 0.00000e+00, 0.00000e+00],\n", 523 | " [4.80000e+01, 2.13140e+05, 4.00000e+00, ..., 4.00000e+01,\n", 524 | " 0.00000e+00, 0.00000e+00],\n", 525 | " ...,\n", 526 | " [8.50000e+01, 1.66027e+05, 9.00000e+00, ..., 5.00000e+01,\n", 527 | " 0.00000e+00, 0.00000e+00],\n", 528 | " [3.60000e+01, 4.69056e+05, 9.00000e+00, ..., 2.50000e+01,\n", 529 | " 0.00000e+00, 0.00000e+00],\n", 530 | " [2.60000e+01, 1.98163e+05, 1.40000e+01, ..., 4.00000e+01,\n", 531 | " 0.00000e+00, 0.00000e+00]])" 532 | ] 533 | }, 534 | "execution_count": 23, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "knn.transform(X_train[num])" 541 
| ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 27, 546 | "metadata": { 547 | "scrolled": true 548 | }, 549 | "outputs": [ 550 | { 551 | "data": { 552 | "text/html": [ 553 | "
\n", 554 | "\n", 567 | "\n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | "
01234567
055.0238216.09.00.00.040.00.00.0
124.0306460.09.00.00.040.00.00.0
248.0213140.04.00.00.040.00.00.0
336.0127306.013.00.00.040.00.00.0
453.0103586.013.00.00.055.00.00.0
\n", 639 | "
" 640 | ], 641 | "text/plain": [ 642 | " 0 1 2 3 4 5 6 7\n", 643 | "0 55.0 238216.0 9.0 0.0 0.0 40.0 0.0 0.0\n", 644 | "1 24.0 306460.0 9.0 0.0 0.0 40.0 0.0 0.0\n", 645 | "2 48.0 213140.0 4.0 0.0 0.0 40.0 0.0 0.0\n", 646 | "3 36.0 127306.0 13.0 0.0 0.0 40.0 0.0 0.0\n", 647 | "4 53.0 103586.0 13.0 0.0 0.0 55.0 0.0 0.0" 648 | ] 649 | }, 650 | "execution_count": 27, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "pd.DataFrame(knn.transform(X_train[num])).head()" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 30, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/plain": [ 667 | "age 5\n", 668 | " fnlwgt 0\n", 669 | " education-num 0\n", 670 | " capital-gain 0\n", 671 | " capital-loss 0\n", 672 | " hours-per-week 1\n", 673 | "dtype: int64" 674 | ] 675 | }, 676 | "execution_count": 30, 677 | "metadata": {}, 678 | "output_type": "execute_result" 679 | } 680 | ], 681 | "source": [ 682 | "X_test[num].isna().sum()" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 31, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "data": { 692 | "text/plain": [ 693 | "array([[3.20000e+01, 2.60954e+05, 7.00000e+00, ..., 3.00000e+01,\n", 694 | " 0.00000e+00, 0.00000e+00],\n", 695 | " [3.10000e+01, 2.36391e+05, 1.00000e+01, ..., 4.00000e+01,\n", 696 | " 0.00000e+00, 0.00000e+00],\n", 697 | " [5.90000e+01, 1.75689e+05, 1.00000e+01, ..., 1.40000e+01,\n", 698 | " 0.00000e+00, 0.00000e+00],\n", 699 | " ...,\n", 700 | " [2.60000e+01, 1.77482e+05, 1.20000e+01, ..., 4.50000e+01,\n", 701 | " 0.00000e+00, 0.00000e+00],\n", 702 | " [4.70000e+01, 2.58498e+05, 1.00000e+01, ..., 5.20000e+01,\n", 703 | " 0.00000e+00, 0.00000e+00],\n", 704 | " [4.50000e+01, 1.60962e+05, 1.00000e+01, ..., 3.50000e+01,\n", 705 | " 0.00000e+00, 0.00000e+00]])" 706 | ] 707 | }, 708 | "execution_count": 31, 709 | "metadata": {}, 710 | "output_type": "execute_result" 711 | } 712 | ], 713 
| "source": [ 714 | "knn.transform(X_test[num])" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": 33, 720 | "metadata": {}, 721 | "outputs": [ 722 | { 723 | "data": { 724 | "text/plain": [ 725 | "0" 726 | ] 727 | }, 728 | "execution_count": 33, 729 | "metadata": {}, 730 | "output_type": "execute_result" 731 | } 732 | ], 733 | "source": [ 734 | "pd.DataFrame(knn.transform(X_test[num])).isna().sum().sum()" 735 | ] 736 | } 737 | ], 738 | "metadata": { 739 | "kernelspec": { 740 | "display_name": "Python 3", 741 | "language": "python", 742 | "name": "python3" 743 | }, 744 | "language_info": { 745 | "codemirror_mode": { 746 | "name": "ipython", 747 | "version": 3 748 | }, 749 | "file_extension": ".py", 750 | "mimetype": "text/x-python", 751 | "name": "python", 752 | "nbconvert_exporter": "python", 753 | "pygments_lexer": "ipython3", 754 | "version": "3.7.3" 755 | } 756 | }, 757 | "nbformat": 4, 758 | "nbformat_minor": 2 759 | } 760 | -------------------------------------------------------------------------------- /label encoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/html": [ 21 | "
\n", 22 | "\n", 35 | "\n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " age workclass fnlwgt education education-num \\\n", 153 | "0 39 State-gov 77516 Bachelors 13 \n", 154 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", 155 | "2 38 Private 215646 HS-grad 9 \n", 156 | "3 53 Private 234721 11th 7 \n", 157 | "4 28 Private 338409 Bachelors 13 \n", 158 | "\n", 159 | " marital-status occupation relationship race sex \\\n", 160 | "0 Never-married Adm-clerical Not-in-family White Male \n", 161 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 162 | "2 Divorced Handlers-cleaners Not-in-family White Male \n", 163 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", 164 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n", 165 | "\n", 166 | " capital-gain capital-loss hours-per-week native-country income \n", 167 | "0 2174 0 40 United-States <=50K \n", 168 | "1 0 0 13 United-States <=50K \n", 169 | "2 0 0 40 United-States <=50K \n", 170 | "3 0 0 40 United-States <=50K \n", 171 | "4 0 0 40 Cuba <=50K " 172 | ] 173 | }, 174 | "execution_count": 2, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "df = pd.read_csv('data/income_evaluation.csv')\n", 181 | "df.head()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 28, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',\n", 193 | " ' marital-status', ' occupation', ' relationship', ' race', ' sex',\n", 194 | " ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',\n", 195 | " ' income'],\n", 196 | " dtype='object')" 197 | ] 198 | }, 199 | "execution_count": 28, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df.columns" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 25, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 
215 | "text/plain": [ 216 | " <=50K 24720\n", 217 | " >50K 7841\n", 218 | "Name: income, dtype: int64" 219 | ] 220 | }, 221 | "execution_count": 25, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "df[' income'].value_counts()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 26, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from sklearn.model_selection import train_test_split" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 29, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1), df[' income'],\n", 246 | " test_size=0.2, random_state=0)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 30, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "from sklearn.preprocessing import LabelEncoder" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 31, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "le = LabelEncoder()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 32, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "LabelEncoder()" 276 | ] 277 | }, 278 | "execution_count": 32, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "le.fit(y_train)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 33, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "array([' <=50K', ' >50K'], dtype=object)" 296 | ] 297 | }, 298 | "execution_count": 33, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "le.classes_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 37, 310 | "metadata": { 311 | "scrolled": true 
312 | }, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "15282 <=50K\n", 318 | "24870 <=50K\n", 319 | "18822 <=50K\n", 320 | "26404 <=50K\n", 321 | "7842 <=50K\n", 322 | "4890 <=50K\n", 323 | "3243 <=50K\n", 324 | "17470 <=50K\n", 325 | "14211 <=50K\n", 326 | "22453 <=50K\n", 327 | "631 <=50K\n", 328 | "29051 <=50K\n", 329 | "21478 <=50K\n", 330 | "26565 <=50K\n", 331 | "25140 <=50K\n", 332 | "15497 >50K\n", 333 | "14689 <=50K\n", 334 | "18726 <=50K\n", 335 | "28105 <=50K\n", 336 | "6965 <=50K\n", 337 | "4343 >50K\n", 338 | "24308 <=50K\n", 339 | "11380 <=50K\n", 340 | "26087 <=50K\n", 341 | "5679 <=50K\n", 342 | "13019 <=50K\n", 343 | "24049 >50K\n", 344 | "32119 >50K\n", 345 | "25586 >50K\n", 346 | "26959 <=50K\n", 347 | " ... \n", 348 | "6216 >50K\n", 349 | "27469 <=50K\n", 350 | "16921 <=50K\n", 351 | "26277 >50K\n", 352 | "2897 <=50K\n", 353 | "24152 <=50K\n", 354 | "18606 <=50K\n", 355 | "10327 >50K\n", 356 | "18983 <=50K\n", 357 | "32230 >50K\n", 358 | "17089 <=50K\n", 359 | "14650 >50K\n", 360 | "19852 <=50K\n", 361 | "6744 <=50K\n", 362 | "15832 >50K\n", 363 | "15430 <=50K\n", 364 | "14935 <=50K\n", 365 | "14116 <=50K\n", 366 | "22258 <=50K\n", 367 | "20757 <=50K\n", 368 | "24275 <=50K\n", 369 | "9225 <=50K\n", 370 | "32103 <=50K\n", 371 | "30403 <=50K\n", 372 | "21243 <=50K\n", 373 | "13123 >50K\n", 374 | "19648 <=50K\n", 375 | "9845 <=50K\n", 376 | "10799 >50K\n", 377 | "2732 <=50K\n", 378 | "Name: income, Length: 26048, dtype: object" 379 | ] 380 | }, 381 | "execution_count": 37, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "y_train" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 35, 393 | "metadata": { 394 | "scrolled": true 395 | }, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "0 0\n", 401 | "1 0\n", 402 | "2 0\n", 403 | "3 0\n", 404 | "4 0\n", 405 | "5 0\n", 406 | "6 0\n", 407 | "7 0\n", 408 | "8 0\n", 409 | "9 0\n", 
410 | "10 0\n", 411 | "11 0\n", 412 | "12 0\n", 413 | "13 0\n", 414 | "14 0\n", 415 | "15 1\n", 416 | "16 0\n", 417 | "17 0\n", 418 | "18 0\n", 419 | "19 0\n", 420 | "20 1\n", 421 | "21 0\n", 422 | "22 0\n", 423 | "23 0\n", 424 | "24 0\n", 425 | "25 0\n", 426 | "26 1\n", 427 | "27 1\n", 428 | "28 1\n", 429 | "29 0\n", 430 | " ..\n", 431 | "26018 1\n", 432 | "26019 0\n", 433 | "26020 0\n", 434 | "26021 1\n", 435 | "26022 0\n", 436 | "26023 0\n", 437 | "26024 0\n", 438 | "26025 1\n", 439 | "26026 0\n", 440 | "26027 1\n", 441 | "26028 0\n", 442 | "26029 1\n", 443 | "26030 0\n", 444 | "26031 0\n", 445 | "26032 1\n", 446 | "26033 0\n", 447 | "26034 0\n", 448 | "26035 0\n", 449 | "26036 0\n", 450 | "26037 0\n", 451 | "26038 0\n", 452 | "26039 0\n", 453 | "26040 0\n", 454 | "26041 0\n", 455 | "26042 0\n", 456 | "26043 1\n", 457 | "26044 0\n", 458 | "26045 0\n", 459 | "26046 1\n", 460 | "26047 0\n", 461 | "Length: 26048, dtype: int32" 462 | ] 463 | }, 464 | "execution_count": 35, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "pd.Series(le.transform(y_train))" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 38, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "array([0, 0, 0, ..., 1, 0, 1])" 482 | ] 483 | }, 484 | "execution_count": 38, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "le.transform(y_test)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 41, 496 | "metadata": { 497 | "scrolled": true 498 | }, 499 | "outputs": [ 500 | { 501 | "data": { 502 | "text/plain": [ 503 | "15282 United-States\n", 504 | "24870 United-States\n", 505 | "18822 United-States\n", 506 | "26404 United-States\n", 507 | "7842 United-States\n", 508 | "4890 United-States\n", 509 | "3243 Mexico\n", 510 | "17470 United-States\n", 511 | "14211 United-States\n", 512 | "22453 United-States\n", 
513 | "631 United-States\n", 514 | "29051 United-States\n", 515 | "21478 United-States\n", 516 | "26565 United-States\n", 517 | "25140 United-States\n", 518 | "15497 United-States\n", 519 | "14689 United-States\n", 520 | "18726 Mexico\n", 521 | "28105 United-States\n", 522 | "6965 United-States\n", 523 | "4343 United-States\n", 524 | "24308 United-States\n", 525 | "11380 United-States\n", 526 | "26087 Ireland\n", 527 | "5679 United-States\n", 528 | "13019 United-States\n", 529 | "24049 United-States\n", 530 | "32119 United-States\n", 531 | "25586 United-States\n", 532 | "26959 United-States\n", 533 | " ... \n", 534 | "6216 United-States\n", 535 | "27469 United-States\n", 536 | "16921 United-States\n", 537 | "26277 England\n", 538 | "2897 United-States\n", 539 | "24152 United-States\n", 540 | "18606 Nicaragua\n", 541 | "10327 United-States\n", 542 | "18983 United-States\n", 543 | "32230 United-States\n", 544 | "17089 United-States\n", 545 | "14650 United-States\n", 546 | "19852 United-States\n", 547 | "6744 United-States\n", 548 | "15832 United-States\n", 549 | "15430 United-States\n", 550 | "14935 United-States\n", 551 | "14116 United-States\n", 552 | "22258 United-States\n", 553 | "20757 United-States\n", 554 | "24275 United-States\n", 555 | "9225 England\n", 556 | "32103 United-States\n", 557 | "30403 United-States\n", 558 | "21243 United-States\n", 559 | "13123 United-States\n", 560 | "19648 United-States\n", 561 | "9845 United-States\n", 562 | "10799 United-States\n", 563 | "2732 United-States\n", 564 | "Name: native-country, Length: 26048, dtype: object" 565 | ] 566 | }, 567 | "execution_count": 41, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "X_train[' native-country']" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 40, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/plain": [ 584 | "array([39, 39, 39, ..., 39, 39, 39])" 585 | ] 586 | }, 587 | 
"execution_count": 40, 588 | "metadata": {}, 589 | "output_type": "execute_result" 590 | } 591 | ], 592 | "source": [ 593 | "le1 = LabelEncoder()\n", 594 | "le1.fit_transform(X_train[' native-country'])" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 42, 600 | "metadata": {}, 601 | "outputs": [ 602 | { 603 | "data": { 604 | "text/plain": [ 605 | "0 39\n", 606 | "1 39\n", 607 | "2 39\n", 608 | "3 39\n", 609 | "4 39\n", 610 | "5 39\n", 611 | "6 26\n", 612 | "7 39\n", 613 | "8 39\n", 614 | "9 39\n", 615 | "10 39\n", 616 | "11 39\n", 617 | "12 39\n", 618 | "13 39\n", 619 | "14 39\n", 620 | "15 39\n", 621 | "16 39\n", 622 | "17 26\n", 623 | "18 39\n", 624 | "19 39\n", 625 | "20 39\n", 626 | "21 39\n", 627 | "22 39\n", 628 | "23 21\n", 629 | "24 39\n", 630 | "25 39\n", 631 | "26 39\n", 632 | "27 39\n", 633 | "28 39\n", 634 | "29 39\n", 635 | " ..\n", 636 | "26018 39\n", 637 | "26019 39\n", 638 | "26020 39\n", 639 | "26021 9\n", 640 | "26022 39\n", 641 | "26023 39\n", 642 | "26024 27\n", 643 | "26025 39\n", 644 | "26026 39\n", 645 | "26027 39\n", 646 | "26028 39\n", 647 | "26029 39\n", 648 | "26030 39\n", 649 | "26031 39\n", 650 | "26032 39\n", 651 | "26033 39\n", 652 | "26034 39\n", 653 | "26035 39\n", 654 | "26036 39\n", 655 | "26037 39\n", 656 | "26038 39\n", 657 | "26039 9\n", 658 | "26040 39\n", 659 | "26041 39\n", 660 | "26042 39\n", 661 | "26043 39\n", 662 | "26044 39\n", 663 | "26045 39\n", 664 | "26046 39\n", 665 | "26047 39\n", 666 | "Length: 26048, dtype: int32" 667 | ] 668 | }, 669 | "execution_count": 42, 670 | "metadata": {}, 671 | "output_type": "execute_result" 672 | } 673 | ], 674 | "source": [ 675 | "pd.Series(le1.fit_transform(X_train[' native-country']))" 676 | ] 677 | } 678 | ], 679 | "metadata": { 680 | "kernelspec": { 681 | "display_name": "Python 3", 682 | "language": "python", 683 | "name": "python3" 684 | }, 685 | "language_info": { 686 | "codemirror_mode": { 687 | "name": "ipython", 688 | "version": 3 689 | }, 690 | 
"file_extension": ".py", 691 | "mimetype": "text/x-python", 692 | "name": "python", 693 | "nbconvert_exporter": "python", 694 | "pygments_lexer": "ipython3", 695 | "version": "3.7.3" 696 | } 697 | }, 698 | "nbformat": 4, 699 | "nbformat_minor": 2 700 | } 701 | -------------------------------------------------------------------------------- /missing indicator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.model_selection import train_test_split" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 150 | "
" 151 | ], 152 | "text/plain": [ 153 | " age workclass fnlwgt education education-num \\\n", 154 | "0 39 State-gov 77516 Bachelors 13 \n", 155 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", 156 | "2 38 Private 215646 HS-grad 9 \n", 157 | "3 53 Private 234721 11th 7 \n", 158 | "4 28 Private 338409 Bachelors 13 \n", 159 | "\n", 160 | " marital-status occupation relationship race sex \\\n", 161 | "0 Never-married Adm-clerical Not-in-family White Male \n", 162 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 163 | "2 Divorced Handlers-cleaners Not-in-family White Male \n", 164 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", 165 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n", 166 | "\n", 167 | " capital-gain capital-loss hours-per-week native-country income \n", 168 | "0 2174 0 40 United-States <=50K \n", 169 | "1 0 0 13 United-States <=50K \n", 170 | "2 0 0 40 United-States <=50K \n", 171 | "3 0 0 40 United-States <=50K \n", 172 | "4 0 0 40 Cuba <=50K " 173 | ] 174 | }, 175 | "execution_count": 2, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')\n", 182 | "df.head()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 3, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "age 0\n", 194 | " workclass 1836\n", 195 | " fnlwgt 0\n", 196 | " education 0\n", 197 | " education-num 0\n", 198 | " marital-status 0\n", 199 | " occupation 1843\n", 200 | " relationship 0\n", 201 | " race 0\n", 202 | " sex 0\n", 203 | " capital-gain 0\n", 204 | " capital-loss 0\n", 205 | " hours-per-week 0\n", 206 | " native-country 583\n", 207 | " income 0\n", 208 | "dtype: int64" 209 | ] 210 | }, 211 | "execution_count": 3, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "df.isna().sum()" 218 | ] 219 | }, 220 | { 221 
| "cell_type": "code", 222 | "execution_count": 4, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1), df[' income'],\n", 227 | " test_size=0.2, random_state=5)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 5, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from sklearn.impute import MissingIndicator" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 6, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "mi = MissingIndicator()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 7, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "MissingIndicator()" 257 | ] 258 | }, 259 | "execution_count": 7, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "mi.fit(X_train)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 10, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/html": [ 276 | "
\n", 277 | "\n", 290 | "\n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | "
012
0FalseFalseFalse
1FalseFalseFalse
2FalseFalseFalse
3FalseFalseFalse
4FalseFalseFalse
\n", 332 | "
" 333 | ], 334 | "text/plain": [ 335 | " 0 1 2\n", 336 | "0 False False False\n", 337 | "1 False False False\n", 338 | "2 False False False\n", 339 | "3 False False False\n", 340 | "4 False False False" 341 | ] 342 | }, 343 | "execution_count": 10, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "pd.DataFrame(mi.transform(X_train)).head()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 16, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "age 0\n", 361 | " workclass 1468\n", 362 | " fnlwgt 0\n", 363 | " education 0\n", 364 | " education-num 0\n", 365 | " marital-status 0\n", 366 | " occupation 1475\n", 367 | " relationship 0\n", 368 | " race 0\n", 369 | " sex 0\n", 370 | " capital-gain 0\n", 371 | " capital-loss 0\n", 372 | " hours-per-week 0\n", 373 | " native-country 474\n", 374 | "dtype: int64" 375 | ] 376 | }, 377 | "execution_count": 16, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "X_train.isna().sum()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 11, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "mi1 = MissingIndicator(features='all')" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 15, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/html": [ 403 | "
\n", 404 | "\n", 417 | "\n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | "
012345678910111213
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", 525 | "
" 526 | ], 527 | "text/plain": [ 528 | " 0 1 2 3 4 5 6 7 8 9 \\\n", 529 | "0 False False False False False False False False False False \n", 530 | "1 False False False False False False False False False False \n", 531 | "2 False False False False False False False False False False \n", 532 | "3 False False False False False False False False False False \n", 533 | "4 False False False False False False False False False False \n", 534 | "\n", 535 | " 10 11 12 13 \n", 536 | "0 False False False False \n", 537 | "1 False False False False \n", 538 | "2 False False False False \n", 539 | "3 False False False False \n", 540 | "4 False False False False " 541 | ] 542 | }, 543 | "execution_count": 15, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "pd.DataFrame(mi1.fit_transform(X_train)).head()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "array([ 1, 6, 13], dtype=int64)" 561 | ] 562 | }, 563 | "execution_count": 18, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "mi.features_" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 19, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/plain": [ 580 | "Index([' workclass', ' occupation', ' native-country'], dtype='object')" 581 | ] 582 | }, 583 | "execution_count": 19, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "X_train.columns[mi.features_]" 590 | ] 591 | } 592 | ], 593 | "metadata": { 594 | "kernelspec": { 595 | "display_name": "Python 3", 596 | "language": "python", 597 | "name": "python3" 598 | }, 599 | "language_info": { 600 | "codemirror_mode": { 601 | "name": "ipython", 602 | "version": 3 603 | }, 604 | "file_extension": ".py", 605 | "mimetype": "text/x-python", 606 | 
"name": "python", 607 | "nbconvert_exporter": "python", 608 | "pygments_lexer": "ipython3", 609 | "version": "3.7.3" 610 | } 611 | }, 612 | "nbformat": 4, 613 | "nbformat_minor": 2 614 | } 615 | -------------------------------------------------------------------------------- /ordinal encoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.model_selection import train_test_split" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/html": [ 21 | "
\n", 22 | "\n", 35 | "\n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " age workclass fnlwgt education education-num \\\n", 153 | "0 39 State-gov 77516 Bachelors 13 \n", 154 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", 155 | "2 38 Private 215646 HS-grad 9 \n", 156 | "3 53 Private 234721 11th 7 \n", 157 | "4 28 Private 338409 Bachelors 13 \n", 158 | "\n", 159 | " marital-status occupation relationship race sex \\\n", 160 | "0 Never-married Adm-clerical Not-in-family White Male \n", 161 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 162 | "2 Divorced Handlers-cleaners Not-in-family White Male \n", 163 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", 164 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n", 165 | "\n", 166 | " capital-gain capital-loss hours-per-week native-country income \n", 167 | "0 2174 0 40 United-States <=50K \n", 168 | "1 0 0 13 United-States <=50K \n", 169 | "2 0 0 40 United-States <=50K \n", 170 | "3 0 0 40 United-States <=50K \n", 171 | "4 0 0 40 Cuba <=50K " 172 | ] 173 | }, 174 | "execution_count": 2, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "df = pd.read_csv('data/income_evaluation.csv')\n", 181 | "df.head()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 4, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "X_train, X_test, y_train, y_test = train_test_split(df.drop([' income'], axis=1), df[' income'],\n", 191 | " test_size=0.2, random_state=0)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 6, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | " HS-grad 8450\n", 203 | " Some-college 5832\n", 204 | " Bachelors 4242\n", 205 | " Masters 1414\n", 206 | " Assoc-voc 1110\n", 207 | " 11th 920\n", 208 | " Assoc-acdm 817\n", 209 | " 10th 752\n", 210 | " 7th-8th 526\n", 211 | " Prof-school 459\n", 212 | " 9th 419\n", 213 | " 12th 360\n", 214 | " Doctorate 
306\n", 215 | " 5th-6th 259\n", 216 | " 1st-4th 139\n", 217 | " Preschool 43\n", 218 | "Name: education, dtype: int64" 219 | ] 220 | }, 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "X_train[' education'].value_counts()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "array([' 11th', ' HS-grad', ' Bachelors', ' Assoc-voc', ' Some-college',\n", 239 | " ' 9th', ' 10th', ' 12th', ' Doctorate', ' Prof-school', ' Masters',\n", 240 | " ' Assoc-acdm', ' 7th-8th', ' 5th-6th', ' Preschool', ' 1st-4th'],\n", 241 | " dtype=object)" 242 | ] 243 | }, 244 | "execution_count": 7, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "X_train[' education'].unique()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 18, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "array([' Male', ' Female'], dtype=object)" 262 | ] 263 | }, 264 | "execution_count": 18, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "X_train[' sex'].unique()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 26, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "gender = [' Male', ' Female']" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 9, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "edu = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th',\n", 289 | " ' 11th', ' 12th', ' HS-grad', ' Prof-school', ' Some-college',\n", 290 | " ' Assoc-acdm', ' Assoc-voc',' Bachelors', ' Masters', ' Doctorate' ]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 10, 296 | "metadata": {}, 297 | "outputs": [], 298 | 
"source": [ 299 | "from sklearn.preprocessing import OrdinalEncoder" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 27, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "ordi = OrdinalEncoder(categories=[edu, gender])" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 28, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "OrdinalEncoder(categories=[[' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th',\n", 320 | " ' 9th', ' 10th', ' 11th', ' 12th', ' HS-grad',\n", 321 | " ' Prof-school', ' Some-college', ' Assoc-acdm',\n", 322 | " ' Assoc-voc', ' Bachelors', ' Masters',\n", 323 | " ' Doctorate'],\n", 324 | " [' Male', ' Female']])" 325 | ] 326 | }, 327 | "execution_count": 28, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "ordi.fit(X_train[[' education', ' sex']])" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 29, 339 | "metadata": { 340 | "scrolled": true 341 | }, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/html": [ 346 | "
\n", 347 | "\n", 360 | "\n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " 
\n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | 
" \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | "
educationsex
1528211thMale
24870HS-gradFemale
18822BachelorsFemale
26404HS-gradFemale
7842Assoc-vocMale
4890Some-collegeMale
32439thMale
17470Some-collegeFemale
14211Assoc-vocFemale
22453Some-collegeFemale
63110thFemale
29051Some-collegeMale
2147812thMale
26565HS-gradMale
25140HS-gradMale
15497DoctorateMale
14689Assoc-vocFemale
18726Some-collegeFemale
28105Some-collegeMale
6965Prof-schoolMale
4343MastersMale
24308Some-collegeFemale
11380Some-collegeFemale
26087HS-gradMale
5679BachelorsMale
13019HS-gradFemale
24049MastersMale
32119Some-collegeMale
25586Some-collegeMale
26959MastersMale
.........
6216BachelorsMale
27469Some-collegeFemale
169219thMale
26277HS-gradMale
289710thFemale
2415210thMale
18606Some-collegeMale
10327Prof-schoolMale
18983BachelorsMale
32230Prof-schoolMale
17089HS-gradFemale
14650MastersMale
19852HS-gradMale
6744HS-gradMale
15832DoctorateMale
15430Some-collegeFemale
149355th-6thMale
14116HS-gradFemale
2225810thFemale
2075710thMale
24275HS-gradMale
9225Assoc-acdmMale
321037th-8thFemale
30403HS-gradFemale
21243HS-gradMale
13123MastersMale
1964810thMale
9845Some-collegeFemale
10799DoctorateMale
2732Some-collegeMale
\n", 676 | "

26048 rows × 2 columns

\n", 677 | "
" 678 | ], 679 | "text/plain": [ 680 | " education sex\n", 681 | "15282 11th Male\n", 682 | "24870 HS-grad Female\n", 683 | "18822 Bachelors Female\n", 684 | "26404 HS-grad Female\n", 685 | "7842 Assoc-voc Male\n", 686 | "4890 Some-college Male\n", 687 | "3243 9th Male\n", 688 | "17470 Some-college Female\n", 689 | "14211 Assoc-voc Female\n", 690 | "22453 Some-college Female\n", 691 | "631 10th Female\n", 692 | "29051 Some-college Male\n", 693 | "21478 12th Male\n", 694 | "26565 HS-grad Male\n", 695 | "25140 HS-grad Male\n", 696 | "15497 Doctorate Male\n", 697 | "14689 Assoc-voc Female\n", 698 | "18726 Some-college Female\n", 699 | "28105 Some-college Male\n", 700 | "6965 Prof-school Male\n", 701 | "4343 Masters Male\n", 702 | "24308 Some-college Female\n", 703 | "11380 Some-college Female\n", 704 | "26087 HS-grad Male\n", 705 | "5679 Bachelors Male\n", 706 | "13019 HS-grad Female\n", 707 | "24049 Masters Male\n", 708 | "32119 Some-college Male\n", 709 | "25586 Some-college Male\n", 710 | "26959 Masters Male\n", 711 | "... ... 
...\n", 712 | "6216 Bachelors Male\n", 713 | "27469 Some-college Female\n", 714 | "16921 9th Male\n", 715 | "26277 HS-grad Male\n", 716 | "2897 10th Female\n", 717 | "24152 10th Male\n", 718 | "18606 Some-college Male\n", 719 | "10327 Prof-school Male\n", 720 | "18983 Bachelors Male\n", 721 | "32230 Prof-school Male\n", 722 | "17089 HS-grad Female\n", 723 | "14650 Masters Male\n", 724 | "19852 HS-grad Male\n", 725 | "6744 HS-grad Male\n", 726 | "15832 Doctorate Male\n", 727 | "15430 Some-college Female\n", 728 | "14935 5th-6th Male\n", 729 | "14116 HS-grad Female\n", 730 | "22258 10th Female\n", 731 | "20757 10th Male\n", 732 | "24275 HS-grad Male\n", 733 | "9225 Assoc-acdm Male\n", 734 | "32103 7th-8th Female\n", 735 | "30403 HS-grad Female\n", 736 | "21243 HS-grad Male\n", 737 | "13123 Masters Male\n", 738 | "19648 10th Male\n", 739 | "9845 Some-college Female\n", 740 | "10799 Doctorate Male\n", 741 | "2732 Some-college Male\n", 742 | "\n", 743 | "[26048 rows x 2 columns]" 744 | ] 745 | }, 746 | "execution_count": 29, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "X_train[[' education', ' sex']]" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 30, 758 | "metadata": { 759 | "scrolled": true 760 | }, 761 | "outputs": [ 762 | { 763 | "data": { 764 | "text/html": [ 765 | "
\n", 766 | "\n", 779 | "\n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " 
\n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 
| " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | "
01
06.00.0
18.01.0
213.01.0
38.01.0
412.00.0
510.00.0
64.00.0
710.01.0
812.01.0
910.01.0
105.01.0
1110.00.0
127.00.0
138.00.0
148.00.0
1515.00.0
1612.01.0
1710.01.0
1810.00.0
199.00.0
2014.00.0
2110.01.0
2210.01.0
238.00.0
2413.00.0
258.01.0
2614.00.0
2710.00.0
2810.00.0
2914.00.0
.........
2601813.00.0
2601910.01.0
260204.00.0
260218.00.0
260225.01.0
260235.00.0
2602410.00.0
260259.00.0
2602613.00.0
260279.00.0
260288.01.0
2602914.00.0
260308.00.0
260318.00.0
2603215.00.0
2603310.01.0
260342.00.0
260358.01.0
260365.01.0
260375.00.0
260388.00.0
2603911.00.0
260403.01.0
260418.01.0
260428.00.0
2604314.00.0
260445.00.0
2604510.01.0
2604615.00.0
2604710.00.0
\n", 1095 | "

26048 rows × 2 columns

\n", 1096 | "
" 1097 | ], 1098 | "text/plain": [ 1099 | " 0 1\n", 1100 | "0 6.0 0.0\n", 1101 | "1 8.0 1.0\n", 1102 | "2 13.0 1.0\n", 1103 | "3 8.0 1.0\n", 1104 | "4 12.0 0.0\n", 1105 | "5 10.0 0.0\n", 1106 | "6 4.0 0.0\n", 1107 | "7 10.0 1.0\n", 1108 | "8 12.0 1.0\n", 1109 | "9 10.0 1.0\n", 1110 | "10 5.0 1.0\n", 1111 | "11 10.0 0.0\n", 1112 | "12 7.0 0.0\n", 1113 | "13 8.0 0.0\n", 1114 | "14 8.0 0.0\n", 1115 | "15 15.0 0.0\n", 1116 | "16 12.0 1.0\n", 1117 | "17 10.0 1.0\n", 1118 | "18 10.0 0.0\n", 1119 | "19 9.0 0.0\n", 1120 | "20 14.0 0.0\n", 1121 | "21 10.0 1.0\n", 1122 | "22 10.0 1.0\n", 1123 | "23 8.0 0.0\n", 1124 | "24 13.0 0.0\n", 1125 | "25 8.0 1.0\n", 1126 | "26 14.0 0.0\n", 1127 | "27 10.0 0.0\n", 1128 | "28 10.0 0.0\n", 1129 | "29 14.0 0.0\n", 1130 | "... ... ...\n", 1131 | "26018 13.0 0.0\n", 1132 | "26019 10.0 1.0\n", 1133 | "26020 4.0 0.0\n", 1134 | "26021 8.0 0.0\n", 1135 | "26022 5.0 1.0\n", 1136 | "26023 5.0 0.0\n", 1137 | "26024 10.0 0.0\n", 1138 | "26025 9.0 0.0\n", 1139 | "26026 13.0 0.0\n", 1140 | "26027 9.0 0.0\n", 1141 | "26028 8.0 1.0\n", 1142 | "26029 14.0 0.0\n", 1143 | "26030 8.0 0.0\n", 1144 | "26031 8.0 0.0\n", 1145 | "26032 15.0 0.0\n", 1146 | "26033 10.0 1.0\n", 1147 | "26034 2.0 0.0\n", 1148 | "26035 8.0 1.0\n", 1149 | "26036 5.0 1.0\n", 1150 | "26037 5.0 0.0\n", 1151 | "26038 8.0 0.0\n", 1152 | "26039 11.0 0.0\n", 1153 | "26040 3.0 1.0\n", 1154 | "26041 8.0 1.0\n", 1155 | "26042 8.0 0.0\n", 1156 | "26043 14.0 0.0\n", 1157 | "26044 5.0 0.0\n", 1158 | "26045 10.0 1.0\n", 1159 | "26046 15.0 0.0\n", 1160 | "26047 10.0 0.0\n", 1161 | "\n", 1162 | "[26048 rows x 2 columns]" 1163 | ] 1164 | }, 1165 | "execution_count": 30, 1166 | "metadata": {}, 1167 | "output_type": "execute_result" 1168 | } 1169 | ], 1170 | "source": [ 1171 | "pd.DataFrame(ordi.transform(X_train[[' education', ' sex']]))" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "code", 1176 | "execution_count": 31, 1177 | "metadata": {}, 1178 | "outputs": [ 1179 | { 1180 | "data": { 1181 | 
"text/plain": [ 1182 | "array([[10., 1.],\n", 1183 | " [13., 1.],\n", 1184 | " [11., 0.],\n", 1185 | " ...,\n", 1186 | " [13., 0.],\n", 1187 | " [ 8., 0.],\n", 1188 | " [ 8., 0.]])" 1189 | ] 1190 | }, 1191 | "execution_count": 31, 1192 | "metadata": {}, 1193 | "output_type": "execute_result" 1194 | } 1195 | ], 1196 | "source": [ 1197 | "ordi.transform(X_test[[' education', ' sex']])" 1198 | ] 1199 | } 1200 | ], 1201 | "metadata": { 1202 | "kernelspec": { 1203 | "display_name": "Python 3", 1204 | "language": "python", 1205 | "name": "python3" 1206 | }, 1207 | "language_info": { 1208 | "codemirror_mode": { 1209 | "name": "ipython", 1210 | "version": 3 1211 | }, 1212 | "file_extension": ".py", 1213 | "mimetype": "text/x-python", 1214 | "name": "python", 1215 | "nbconvert_exporter": "python", 1216 | "pygments_lexer": "ipython3", 1217 | "version": "3.7.3" 1218 | } 1219 | }, 1220 | "nbformat": 4, 1221 | "nbformat_minor": 2 1222 | } 1223 | -------------------------------------------------------------------------------- /pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 27, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "\n", 12 | "from sklearn.model_selection import train_test_split\n", 13 | "from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, MinMaxScaler\n", 14 | "from sklearn.pipeline import Pipeline\n", 15 | "from sklearn.compose import ColumnTransformer\n", 16 | "from sklearn.tree import DecisionTreeClassifier" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 41 | "\n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " age workclass fnlwgt education education-num \\\n", 159 | "0 39 State-gov 77516 Bachelors 13 \n", 160 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", 161 | "2 38 Private 215646 HS-grad 9 \n", 162 | "3 53 Private 234721 11th 7 \n", 163 | "4 28 Private 338409 Bachelors 13 \n", 164 | "\n", 165 | " marital-status occupation relationship race sex \\\n", 166 | "0 Never-married Adm-clerical Not-in-family White Male \n", 167 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 168 | "2 Divorced Handlers-cleaners Not-in-family White Male \n", 169 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", 170 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n", 171 | "\n", 172 | " capital-gain capital-loss hours-per-week native-country income \n", 173 | "0 2174 0 40 United-States <=50K \n", 174 | "1 0 0 13 United-States <=50K \n", 175 | "2 0 0 40 United-States <=50K \n", 176 | "3 0 0 40 United-States <=50K \n", 177 | "4 0 0 40 Cuba <=50K " 178 | ] 179 | }, 180 | "execution_count": 2, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')\n", 187 | "df.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "age 0\n", 199 | " workclass 1836\n", 200 | " fnlwgt 0\n", 201 | " education 0\n", 202 | " education-num 0\n", 203 | " marital-status 0\n", 204 | " occupation 1843\n", 205 | " relationship 0\n", 206 | " race 0\n", 207 | " sex 0\n", 208 | " capital-gain 0\n", 209 | " capital-loss 0\n", 210 | " hours-per-week 0\n", 211 | " native-country 583\n", 212 | " income 0\n", 213 | "dtype: int64" 214 | ] 215 | }, 216 | "execution_count": 3, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "df.isna().sum()" 223 | ] 224 | }, 225 | { 226 
| "cell_type": "code", 227 | "execution_count": 4, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "(32561, 15)" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "df.shape" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "(30162, 15)" 254 | ] 255 | }, 256 | "execution_count": 5, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "df.dropna(inplace=True)\n", 263 | "df.shape" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 6, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "age 0\n", 275 | " workclass 0\n", 276 | " fnlwgt 0\n", 277 | " education 0\n", 278 | " education-num 0\n", 279 | " marital-status 0\n", 280 | " occupation 0\n", 281 | " relationship 0\n", 282 | " race 0\n", 283 | " sex 0\n", 284 | " capital-gain 0\n", 285 | " capital-loss 0\n", 286 | " hours-per-week 0\n", 287 | " native-country 0\n", 288 | " income 0\n", 289 | "dtype: int64" 290 | ] 291 | }, 292 | "execution_count": 6, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "df.isna().sum()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 7, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',\n", 310 | " ' marital-status', ' occupation', ' relationship', ' race', ' sex',\n", 311 | " ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',\n", 312 | " ' income'],\n", 313 | " dtype='object')" 314 | ] 315 | }, 316 | "execution_count": 7, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 
| "source": [ 322 | "df.columns" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 8, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',\n", 334 | " 'marital-status', 'occupation', 'relationship', 'race', 'sex',\n", 335 | " 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',\n", 336 | " 'income'],\n", 337 | " dtype='object')" 338 | ] 339 | }, 340 | "execution_count": 8, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "df.columns = df.columns.str.strip()\n", 347 | "df.columns" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 9, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df.income,\n", 357 | " test_size=0.2, random_state=0)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 19, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "['age',\n", 369 | " 'fnlwgt',\n", 370 | " 'education-num',\n", 371 | " 'capital-gain',\n", 372 | " 'capital-loss',\n", 373 | " 'hours-per-week']" 374 | ] 375 | }, 376 | "execution_count": 19, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "num_cols = [col for col in X_train.columns if X_train[col].dtypes!='O']\n", 383 | "num_cols" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 20, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "['workclass',\n", 395 | " 'education',\n", 396 | " 'marital-status',\n", 397 | " 'occupation',\n", 398 | " 'relationship',\n", 399 | " 'race',\n", 400 | " 'sex',\n", 401 | " 'native-country']" 402 | ] 403 | }, 404 | "execution_count": 20, 405 | "metadata": {}, 406 | "output_type": 
"execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "cat_cols = [col for col in X_train.columns if X_train[col].dtypes=='O']\n", 411 | "cat_cols" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 22, 417 | "metadata": { 418 | "scrolled": true 419 | }, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/html": [ 424 | "
\n", 425 | "\n", 438 | "\n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | "
educationeducation-num
0Bachelors13
1Bachelors13
2HS-grad9
311th7
4Bachelors13
\n", 474 | "
" 475 | ], 476 | "text/plain": [ 477 | " education education-num\n", 478 | "0 Bachelors 13\n", 479 | "1 Bachelors 13\n", 480 | "2 HS-grad 9\n", 481 | "3 11th 7\n", 482 | "4 Bachelors 13" 483 | ] 484 | }, 485 | "execution_count": 22, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "df[['education', 'education-num']].head()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 23, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "ct = ColumnTransformer([\n", 501 | " ('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),\n", 502 | " ('step2', StandardScaler(), ['capital-gain', 'capital-loss', 'education-num']),\n", 503 | " ('step3', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass', \n", 504 | " 'marital-status', 'occupation',\n", 505 | " 'relationship', 'race', \n", 506 | " 'sex', 'native-country'])\n", 507 | "], remainder='drop')" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "# pipeline use case 1 - with an 'estimator' as final step" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 52, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "p = Pipeline([\n", 531 | " ('coltf_step', ct),\n", 532 | " ('model', DecisionTreeClassifier()),\n", 533 | "])" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 53, 539 | "metadata": { 540 | "scrolled": true 541 | }, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "Pipeline(steps=[('coltf_step',\n", 547 | " ColumnTransformer(transformers=[('step1', RobustScaler(),\n", 548 | " ['age', 'fnlwgt',\n", 549 | " 'hours-per-week']),\n", 550 | " ('step2', StandardScaler(),\n", 551 | " ['capital-gain',\n", 552 | " 'capital-loss',\n", 553 | " 
'education-num']),\n", 554 | " ('step3',\n", 555 | " OneHotEncoder(handle_unknown='ignore',\n", 556 | " sparse=False),\n", 557 | " ['workclass',\n", 558 | " 'marital-status',\n", 559 | " 'occupation', 'relationship',\n", 560 | " 'race', 'sex',\n", 561 | " 'native-country'])])),\n", 562 | " ('model', DecisionTreeClassifier())])" 563 | ] 564 | }, 565 | "execution_count": 53, 566 | "metadata": {}, 567 | "output_type": "execute_result" 568 | } 569 | ], 570 | "source": [ 571 | "p.fit(X_train, y_train)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 54, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "data": { 581 | "text/plain": [ 582 | "array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' <=50K', ' <=50K'],\n", 583 | " dtype=object)" 584 | ] 585 | }, 586 | "execution_count": 54, 587 | "metadata": {}, 588 | "output_type": "execute_result" 589 | } 590 | ], 591 | "source": [ 592 | "p.predict(X_test)" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 55, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "data": { 602 | "text/plain": [ 603 | "0.8078899386706447" 604 | ] 605 | }, 606 | "execution_count": 55, 607 | "metadata": {}, 608 | "output_type": "execute_result" 609 | } 610 | ], 611 | "source": [ 612 | "p.score(X_test, y_test)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 56, 618 | "metadata": {}, 619 | "outputs": [ 620 | { 621 | "data": { 622 | "text/plain": [ 623 | "{'coltf_step': ColumnTransformer(transformers=[('step1', RobustScaler(),\n", 624 | " ['age', 'fnlwgt', 'hours-per-week']),\n", 625 | " ('step2', StandardScaler(),\n", 626 | " ['capital-gain', 'capital-loss',\n", 627 | " 'education-num']),\n", 628 | " ('step3',\n", 629 | " OneHotEncoder(handle_unknown='ignore',\n", 630 | " sparse=False),\n", 631 | " ['workclass', 'marital-status', 'occupation',\n", 632 | " 'relationship', 'race', 'sex',\n", 633 | " 'native-country'])]),\n", 634 | " 'model': 
DecisionTreeClassifier()}" 635 | ] 636 | }, 637 | "execution_count": 56, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "p.named_steps" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 57, 649 | "metadata": {}, 650 | "outputs": [ 651 | { 652 | "data": { 653 | "text/plain": [ 654 | "[('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),\n", 655 | " ('step2',\n", 656 | " StandardScaler(),\n", 657 | " ['capital-gain', 'capital-loss', 'education-num']),\n", 658 | " ('step3',\n", 659 | " OneHotEncoder(handle_unknown='ignore', sparse=False),\n", 660 | " ['workclass',\n", 661 | " 'marital-status',\n", 662 | " 'occupation',\n", 663 | " 'relationship',\n", 664 | " 'race',\n", 665 | " 'sex',\n", 666 | " 'native-country']),\n", 667 | " ('remainder', 'drop', [3])]" 668 | ] 669 | }, 670 | "execution_count": 57, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | } 674 | ], 675 | "source": [ 676 | "p.named_steps['coltf_step'].transformers_" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 58, 682 | "metadata": { 683 | "scrolled": true 684 | }, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "text/plain": [ 689 | "array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Private',\n", 690 | " 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc', 'x0_ State-gov',\n", 691 | " 'x0_ Without-pay', 'x1_ Divorced', 'x1_ Married-AF-spouse',\n", 692 | " 'x1_ Married-civ-spouse', 'x1_ Married-spouse-absent',\n", 693 | " 'x1_ Never-married', 'x1_ Separated', 'x1_ Widowed',\n", 694 | " 'x2_ Adm-clerical', 'x2_ Armed-Forces', 'x2_ Craft-repair',\n", 695 | " 'x2_ Exec-managerial', 'x2_ Farming-fishing',\n", 696 | " 'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',\n", 697 | " 'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',\n", 698 | " 'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',\n", 699 | " 'x2_ Transport-moving', 'x3_ Husband', 'x3_ 
Not-in-family',\n", 700 | " 'x3_ Other-relative', 'x3_ Own-child', 'x3_ Unmarried', 'x3_ Wife',\n", 701 | " 'x4_ Amer-Indian-Eskimo', 'x4_ Asian-Pac-Islander', 'x4_ Black',\n", 702 | " 'x4_ Other', 'x4_ White', 'x5_ Female', 'x5_ Male', 'x6_ Cambodia',\n", 703 | " 'x6_ Canada', 'x6_ China', 'x6_ Columbia', 'x6_ Cuba',\n", 704 | " 'x6_ Dominican-Republic', 'x6_ Ecuador', 'x6_ El-Salvador',\n", 705 | " 'x6_ England', 'x6_ France', 'x6_ Germany', 'x6_ Greece',\n", 706 | " 'x6_ Guatemala', 'x6_ Haiti', 'x6_ Holand-Netherlands',\n", 707 | " 'x6_ Honduras', 'x6_ Hong', 'x6_ Hungary', 'x6_ India', 'x6_ Iran',\n", 708 | " 'x6_ Ireland', 'x6_ Italy', 'x6_ Jamaica', 'x6_ Japan', 'x6_ Laos',\n", 709 | " 'x6_ Mexico', 'x6_ Nicaragua', 'x6_ Outlying-US(Guam-USVI-etc)',\n", 710 | " 'x6_ Peru', 'x6_ Philippines', 'x6_ Poland', 'x6_ Portugal',\n", 711 | " 'x6_ Puerto-Rico', 'x6_ Scotland', 'x6_ South', 'x6_ Taiwan',\n", 712 | " 'x6_ Thailand', 'x6_ Trinadad&Tobago', 'x6_ United-States',\n", 713 | " 'x6_ Vietnam', 'x6_ Yugoslavia'], dtype=object)" 714 | ] 715 | }, 716 | "execution_count": 58, 717 | "metadata": {}, 718 | "output_type": "execute_result" 719 | } 720 | ], 721 | "source": [ 722 | "p.named_steps['coltf_step'].transformers_[2][1].get_feature_names()" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 59, 735 | "metadata": {}, 736 | "outputs": [ 737 | { 738 | "data": { 739 | "text/plain": [ 740 | "array([3.70000e+01, 1.78319e+05, 4.00000e+01])" 741 | ] 742 | }, 743 | "execution_count": 59, 744 | "metadata": {}, 745 | "output_type": "execute_result" 746 | } 747 | ], 748 | "source": [ 749 | "p.named_steps['coltf_step'].transformers_[0][1].center_" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [] 758 | }, 759 | { 760 | 
"cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "# pipeline use case 2 - without an estimator as final step" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 48, 769 | "metadata": {}, 770 | "outputs": [], 771 | "source": [ 772 | "p1 = Pipeline([\n", 773 | " ('coltf_step', ct),\n", 774 | " ('minmax', MinMaxScaler())\n", 775 | "# ('model', DecisionTreeClassifier()),\n", 776 | "])" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 50, 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "Pipeline(steps=[('coltf_step',\n", 788 | " ColumnTransformer(transformers=[('step1', RobustScaler(),\n", 789 | " ['age', 'fnlwgt',\n", 790 | " 'hours-per-week']),\n", 791 | " ('step2', StandardScaler(),\n", 792 | " ['capital-gain',\n", 793 | " 'capital-loss',\n", 794 | " 'education-num']),\n", 795 | " ('step3',\n", 796 | " OneHotEncoder(handle_unknown='ignore',\n", 797 | " sparse=False),\n", 798 | " ['workclass',\n", 799 | " 'marital-status',\n", 800 | " 'occupation', 'relationship',\n", 801 | " 'race', 'sex',\n", 802 | " 'native-country'])])),\n", 803 | " ('minmax', MinMaxScaler())])" 804 | ] 805 | }, 806 | "execution_count": 50, 807 | "metadata": {}, 808 | "output_type": "execute_result" 809 | } 810 | ], 811 | "source": [ 812 | "p1.fit(X_train)" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 51, 818 | "metadata": {}, 819 | "outputs": [ 820 | { 821 | "data": { 822 | "text/plain": [ 823 | "array([[0.36986301, 0.04628617, 0.39795918, ..., 1. , 0. ,\n", 824 | " 0. ],\n", 825 | " [0.05479452, 0.1987476 , 0.19387755, ..., 1. , 0. ,\n", 826 | " 0. ],\n", 827 | " [0.26027397, 0.11716417, 0.39795918, ..., 1. , 0. ,\n", 828 | " 0. ],\n", 829 | " ...,\n", 830 | " [0.38356164, 0.17738365, 0.60204082, ..., 1. , 0. ,\n", 831 | " 0. ],\n", 832 | " [0.32876712, 0.23260631, 0.65306122, ..., 1. , 0. ,\n", 833 | " 0. 
],\n", 834 | " [0.10958904, 0.09400613, 0.39795918, ..., 0. , 0. ,\n", 835 | " 0. ]])" 836 | ] 837 | }, 838 | "execution_count": 51, 839 | "metadata": {}, 840 | "output_type": "execute_result" 841 | } 842 | ], 843 | "source": [ 844 | "p1.transform(X_test)" 845 | ] 846 | } 847 | ], 848 | "metadata": { 849 | "kernelspec": { 850 | "display_name": "Python 3", 851 | "language": "python", 852 | "name": "python3" 853 | }, 854 | "language_info": { 855 | "codemirror_mode": { 856 | "name": "ipython", 857 | "version": 3 858 | }, 859 | "file_extension": ".py", 860 | "mimetype": "text/x-python", 861 | "name": "python", 862 | "nbconvert_exporter": "python", 863 | "pygments_lexer": "ipython3", 864 | "version": "3.7.3" 865 | } 866 | }, 867 | "nbformat": 4, 868 | "nbformat_minor": 2 869 | } 870 | -------------------------------------------------------------------------------- /ppts/CM metrics 1-4.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/CM metrics 1-4.pptx -------------------------------------------------------------------------------- /ppts/CM metrics 5-6.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/CM metrics 5-6.pptx -------------------------------------------------------------------------------- /ppts/Feature scaling.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/Feature scaling.pptx -------------------------------------------------------------------------------- /ppts/Grid Search.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/Grid Search.pptx -------------------------------------------------------------------------------- /ppts/KNN Imputer Algorithm.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/KNN Imputer Algorithm.pptx -------------------------------------------------------------------------------- /ppts/README.md: -------------------------------------------------------------------------------- 1 |

This folder contains all the PowerPoint presentations I use in my Machine Learning tutorials on YouTube.

2 | 3 | You can access my Machine Learning playlist videos here. 4 | 5 | Thank you for your interest :) 6 | -------------------------------------------------------------------------------- /ppts/bias variance.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/bias variance.pptx -------------------------------------------------------------------------------- /ppts/confusion matrix.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/confusion matrix.pptx -------------------------------------------------------------------------------- /ppts/cross validation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/cross validation.pptx -------------------------------------------------------------------------------- /ppts/mcc.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/mcc.pptx -------------------------------------------------------------------------------- /ppts/mice.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/mice.pptx -------------------------------------------------------------------------------- /ppts/outlier.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/outlier.pptx 
-------------------------------------------------------------------------------- /ppts/roc pr auc.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/roc pr auc.pptx -------------------------------------------------------------------------------- /simple imputer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "from sklearn.impute import SimpleImputer\n", 12 | "from sklearn.model_selection import train_test_split" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "
\n", 24 | "\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryincome
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " age workclass fnlwgt education education-num \\\n", 155 | "0 39 State-gov 77516 Bachelors 13 \n", 156 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n", 157 | "2 38 Private 215646 HS-grad 9 \n", 158 | "3 53 Private 234721 11th 7 \n", 159 | "4 28 Private 338409 Bachelors 13 \n", 160 | "\n", 161 | " marital-status occupation relationship race sex \\\n", 162 | "0 Never-married Adm-clerical Not-in-family White Male \n", 163 | "1 Married-civ-spouse Exec-managerial Husband White Male \n", 164 | "2 Divorced Handlers-cleaners Not-in-family White Male \n", 165 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n", 166 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n", 167 | "\n", 168 | " capital-gain capital-loss hours-per-week native-country income \n", 169 | "0 2174 0 40 United-States <=50K \n", 170 | "1 0 0 13 United-States <=50K \n", 171 | "2 0 0 40 United-States <=50K \n", 172 | "3 0 0 40 United-States <=50K \n", 173 | "4 0 0 40 Cuba <=50K " 174 | ] 175 | }, 176 | "execution_count": 2, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "df = pd.read_csv('data/income_evaluation.csv', na_values = ' ?')\n", 183 | "df.head()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 4, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "age 0\n", 195 | " workclass 1836\n", 196 | " fnlwgt 0\n", 197 | " education 0\n", 198 | " education-num 0\n", 199 | " marital-status 0\n", 200 | " occupation 1843\n", 201 | " relationship 0\n", 202 | " race 0\n", 203 | " sex 0\n", 204 | " capital-gain 0\n", 205 | " capital-loss 0\n", 206 | " hours-per-week 0\n", 207 | " native-country 583\n", 208 | " income 0\n", 209 | "dtype: int64" 210 | ] 211 | }, 212 | "execution_count": 4, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "df.isna().sum()" 219 | ] 220 | }, 221 | { 
222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 4, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# hours per week missing values\n", 235 | "np.random.seed(seed=0)\n", 236 | "h = np.random.choice(a=df.index, replace=False, size=20)\n", 237 | "df.loc[h, ' hours-per-week'] = np.nan" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 5, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# age missing values\n", 247 | "np.random.seed(seed=10)\n", 248 | "a = np.random.choice(a=df.index, replace=False, size=28)\n", 249 | "df.loc[a, 'age'] = np.nan" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 6, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1),\n", 259 | " df[' income'], test_size=0.2,\n", 260 | " random_state=30)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 7, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "si_age = SimpleImputer(strategy='mean', add_indicator=True)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": { 276 | "scrolled": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "a = pd.DataFrame(si_age.fit_transform(X_train[['age']]))" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 11, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "array([38.54201729])" 292 | ] 293 | }, 294 | "execution_count": 11, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "si_age.statistics_" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 16, 306 | "metadata": { 307 | "scrolled": true 308 | }, 
309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/html": [ 313 | "
\n", 314 | "\n", 327 | "\n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | "
01
296938.5420171.0
321938.5420171.0
352238.5420171.0
492538.5420171.0
554338.5420171.0
575438.5420171.0
630538.5420171.0
723738.5420171.0
858738.5420171.0
1131438.5420171.0
1211238.5420171.0
1227438.5420171.0
1259138.5420171.0
1274638.5420171.0
1692738.5420171.0
1759438.5420171.0
1890938.5420171.0
2027138.5420171.0
2036638.5420171.0
2041438.5420171.0
2159838.5420171.0
2280738.5420171.0
2524038.5420171.0
\n", 453 | "
" 454 | ], 455 | "text/plain": [ 456 | " 0 1\n", 457 | "2969 38.542017 1.0\n", 458 | "3219 38.542017 1.0\n", 459 | "3522 38.542017 1.0\n", 460 | "4925 38.542017 1.0\n", 461 | "5543 38.542017 1.0\n", 462 | "5754 38.542017 1.0\n", 463 | "6305 38.542017 1.0\n", 464 | "7237 38.542017 1.0\n", 465 | "8587 38.542017 1.0\n", 466 | "11314 38.542017 1.0\n", 467 | "12112 38.542017 1.0\n", 468 | "12274 38.542017 1.0\n", 469 | "12591 38.542017 1.0\n", 470 | "12746 38.542017 1.0\n", 471 | "16927 38.542017 1.0\n", 472 | "17594 38.542017 1.0\n", 473 | "18909 38.542017 1.0\n", 474 | "20271 38.542017 1.0\n", 475 | "20366 38.542017 1.0\n", 476 | "20414 38.542017 1.0\n", 477 | "21598 38.542017 1.0\n", 478 | "22807 38.542017 1.0\n", 479 | "25240 38.542017 1.0" 480 | ] 481 | }, 482 | "execution_count": 16, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "a[a[1] == 1]" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 21, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "si_occ = SimpleImputer(strategy='constant', add_indicator=True, fill_value='not available')" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 22, 503 | "metadata": {}, 504 | "outputs": [ 505 | { 506 | "data": { 507 | "text/plain": [ 508 | "array([[' Exec-managerial', False],\n", 509 | " [' Transport-moving', False],\n", 510 | " [' Transport-moving', False],\n", 511 | " ...,\n", 512 | " [' Other-service', False],\n", 513 | " [' Sales', False],\n", 514 | " [' Tech-support', False]], dtype=object)" 515 | ] 516 | }, 517 | "execution_count": 22, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "si_occ.fit_transform(X_train[[' occupation']])" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 23, 529 | "metadata": { 530 | "scrolled": true 531 | }, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/html": [ 536 | "
\n", 537 | "\n", 550 | "\n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " 
\n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | 
" \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | "
01
0Exec-managerialFalse
1Transport-movingFalse
2Transport-movingFalse
3Craft-repairFalse
4Adm-clericalFalse
5SalesFalse
6Machine-op-inspctFalse
7Farming-fishingFalse
8Adm-clericalFalse
9Machine-op-inspctFalse
10Adm-clericalFalse
11Other-serviceFalse
12SalesFalse
13Exec-managerialFalse
14Craft-repairFalse
15Adm-clericalFalse
16Other-serviceFalse
17Tech-supportFalse
18SalesFalse
19Farming-fishingFalse
20Protective-servFalse
21Craft-repairFalse
22Prof-specialtyFalse
23Machine-op-inspctFalse
24Exec-managerialFalse
25Craft-repairFalse
26not availableTrue
27Handlers-cleanersFalse
28Other-serviceFalse
29not availableTrue
.........
26018Transport-movingFalse
26019Prof-specialtyFalse
26020SalesFalse
26021Adm-clericalFalse
26022Tech-supportFalse
26023Adm-clericalFalse
26024Craft-repairFalse
26025Adm-clericalFalse
26026Adm-clericalFalse
26027Machine-op-inspctFalse
26028not availableTrue
26029Machine-op-inspctFalse
26030Craft-repairFalse
26031Handlers-cleanersFalse
26032Machine-op-inspctFalse
26033SalesFalse
26034Protective-servFalse
26035Farming-fishingFalse
26036Exec-managerialFalse
26037Exec-managerialFalse
26038Farming-fishingFalse
26039Other-serviceFalse
26040Prof-specialtyFalse
26041Other-serviceFalse
26042Adm-clericalFalse
26043Farming-fishingFalse
26044Adm-clericalFalse
26045Other-serviceFalse
26046SalesFalse
26047Tech-supportFalse
\n", 866 | "

26048 rows × 2 columns

\n", 867 | "
" 868 | ], 869 | "text/plain": [ 870 | " 0 1\n", 871 | "0 Exec-managerial False\n", 872 | "1 Transport-moving False\n", 873 | "2 Transport-moving False\n", 874 | "3 Craft-repair False\n", 875 | "4 Adm-clerical False\n", 876 | "5 Sales False\n", 877 | "6 Machine-op-inspct False\n", 878 | "7 Farming-fishing False\n", 879 | "8 Adm-clerical False\n", 880 | "9 Machine-op-inspct False\n", 881 | "10 Adm-clerical False\n", 882 | "11 Other-service False\n", 883 | "12 Sales False\n", 884 | "13 Exec-managerial False\n", 885 | "14 Craft-repair False\n", 886 | "15 Adm-clerical False\n", 887 | "16 Other-service False\n", 888 | "17 Tech-support False\n", 889 | "18 Sales False\n", 890 | "19 Farming-fishing False\n", 891 | "20 Protective-serv False\n", 892 | "21 Craft-repair False\n", 893 | "22 Prof-specialty False\n", 894 | "23 Machine-op-inspct False\n", 895 | "24 Exec-managerial False\n", 896 | "25 Craft-repair False\n", 897 | "26 not available True\n", 898 | "27 Handlers-cleaners False\n", 899 | "28 Other-service False\n", 900 | "29 not available True\n", 901 | "... ... 
...\n", 902 | "26018 Transport-moving False\n", 903 | "26019 Prof-specialty False\n", 904 | "26020 Sales False\n", 905 | "26021 Adm-clerical False\n", 906 | "26022 Tech-support False\n", 907 | "26023 Adm-clerical False\n", 908 | "26024 Craft-repair False\n", 909 | "26025 Adm-clerical False\n", 910 | "26026 Adm-clerical False\n", 911 | "26027 Machine-op-inspct False\n", 912 | "26028 not available True\n", 913 | "26029 Machine-op-inspct False\n", 914 | "26030 Craft-repair False\n", 915 | "26031 Handlers-cleaners False\n", 916 | "26032 Machine-op-inspct False\n", 917 | "26033 Sales False\n", 918 | "26034 Protective-serv False\n", 919 | "26035 Farming-fishing False\n", 920 | "26036 Exec-managerial False\n", 921 | "26037 Exec-managerial False\n", 922 | "26038 Farming-fishing False\n", 923 | "26039 Other-service False\n", 924 | "26040 Prof-specialty False\n", 925 | "26041 Other-service False\n", 926 | "26042 Adm-clerical False\n", 927 | "26043 Farming-fishing False\n", 928 | "26044 Adm-clerical False\n", 929 | "26045 Other-service False\n", 930 | "26046 Sales False\n", 931 | "26047 Tech-support False\n", 932 | "\n", 933 | "[26048 rows x 2 columns]" 934 | ] 935 | }, 936 | "execution_count": 23, 937 | "metadata": {}, 938 | "output_type": "execute_result" 939 | } 940 | ], 941 | "source": [ 942 | "pd.DataFrame(si_occ.fit_transform(X_train[[' occupation']]))" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 24, 948 | "metadata": {}, 949 | "outputs": [ 950 | { 951 | "data": { 952 | "text/plain": [ 953 | "array([[48., 0.],\n", 954 | " [63., 0.],\n", 955 | " [33., 0.],\n", 956 | " ...,\n", 957 | " [48., 0.],\n", 958 | " [54., 0.],\n", 959 | " [58., 0.]])" 960 | ] 961 | }, 962 | "execution_count": 24, 963 | "metadata": {}, 964 | "output_type": "execute_result" 965 | } 966 | ], 967 | "source": [ 968 | "si_age.transform(X_test[['age']])" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 26, 974 | "metadata": {}, 975 | 
"outputs": [], 976 | "source": [ 977 | "b = pd.DataFrame(si_age.transform(X_test[['age']]))" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 29, 983 | "metadata": {}, 984 | "outputs": [ 985 | { 986 | "data": { 987 | "text/html": [ 988 | "
\n", 989 | "\n", 1002 | "\n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | "
01
252638.5420171.0
406838.5420171.0
411138.5420171.0
532438.5420171.0
593038.5420171.0
\n", 1038 | "
" 1039 | ], 1040 | "text/plain": [ 1041 | " 0 1\n", 1042 | "2526 38.542017 1.0\n", 1043 | "4068 38.542017 1.0\n", 1044 | "4111 38.542017 1.0\n", 1045 | "5324 38.542017 1.0\n", 1046 | "5930 38.542017 1.0" 1047 | ] 1048 | }, 1049 | "execution_count": 29, 1050 | "metadata": {}, 1051 | "output_type": "execute_result" 1052 | } 1053 | ], 1054 | "source": [ 1055 | "b[b[1] == 1]" 1056 | ] 1057 | } 1058 | ], 1059 | "metadata": { 1060 | "kernelspec": { 1061 | "display_name": "Python 3", 1062 | "language": "python", 1063 | "name": "python3" 1064 | }, 1065 | "language_info": { 1066 | "codemirror_mode": { 1067 | "name": "ipython", 1068 | "version": 3 1069 | }, 1070 | "file_extension": ".py", 1071 | "mimetype": "text/x-python", 1072 | "name": "python", 1073 | "nbconvert_exporter": "python", 1074 | "pygments_lexer": "ipython3", 1075 | "version": "3.7.3" 1076 | } 1077 | }, 1078 | "nbformat": 4, 1079 | "nbformat_minor": 2 1080 | } 1081 | -------------------------------------------------------------------------------- /why NEVER use pd.get_dummies.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
colorheightpetalsdays
0red4.036
1green9.0916
2red4.017
3green8.0815
4red4.018
5green7.01017
6red4.025
7green7.5812
8blue20.05040
9blue19.04745
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " color height petals days\n", 121 | "0 red 4.0 3 6\n", 122 | "1 green 9.0 9 16\n", 123 | "2 red 4.0 1 7\n", 124 | "3 green 8.0 8 15\n", 125 | "4 red 4.0 1 8\n", 126 | "5 green 7.0 10 17\n", 127 | "6 red 4.0 2 5\n", 128 | "7 green 7.5 8 12\n", 129 | "8 blue 20.0 50 40\n", 130 | "9 blue 19.0 47 45" 131 | ] 132 | }, 133 | "execution_count": 2, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "flowers = pd.DataFrame({\n", 140 | " 'color' : ['red', 'green', 'red', 'green', 'red', 'green', 'red', 'green', 'blue', 'blue'],\n", 141 | " 'height': [4,9,4,8,4,7,4,7.5,20,19],\n", 142 | " 'petals': [3,9,1,8,1,10,2,8,50,47],\n", 143 | " 'days' : [6,16,7,15,8,17,5,12,40,45]\n", 144 | "})\n", 145 | "flowers" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "from sklearn.model_selection import train_test_split\n", 155 | "X_train, X_test, y_train, y_test = train_test_split(flowers.drop('days', axis=1), flowers['days'],\n", 156 | " test_size=0.2, random_state=40\n", 157 | ")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 4, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/html": [ 168 | "
\n", 169 | "\n", 182 | "\n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | "
colorheightpetals
8blue20.050
1green9.09
2red4.01
9blue19.047
0red4.03
5green7.010
7green7.58
6red4.02
\n", 242 | "
" 243 | ], 244 | "text/plain": [ 245 | " color height petals\n", 246 | "8 blue 20.0 50\n", 247 | "1 green 9.0 9\n", 248 | "2 red 4.0 1\n", 249 | "9 blue 19.0 47\n", 250 | "0 red 4.0 3\n", 251 | "5 green 7.0 10\n", 252 | "7 green 7.5 8\n", 253 | "6 red 4.0 2" 254 | ] 255 | }, 256 | "execution_count": 4, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "X_train" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 7, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "\n", 275 | "Int64Index: 8 entries, 8 to 6\n", 276 | "Data columns (total 3 columns):\n", 277 | "color 8 non-null object\n", 278 | "height 8 non-null float64\n", 279 | "petals 8 non-null int64\n", 280 | "dtypes: float64(1), int64(1), object(1)\n", 281 | "memory usage: 256.0+ bytes\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "X_train.info()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 8, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "array(['blue', 'green', 'red'], dtype=object)" 298 | ] 299 | }, 300 | "execution_count": 8, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "X_train['color'].unique()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 10, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/html": [ 317 | "
\n", 318 | "\n", 331 | "\n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | "
heightpetalscolor_bluecolor_greencolor_redcolor
820.050100blue
19.09010green
24.01001red
919.047100blue
04.03001red
57.010010green
77.58010green
64.02001red
\n", 418 | "
" 419 | ], 420 | "text/plain": [ 421 | " height petals color_blue color_green color_red color\n", 422 | "8 20.0 50 1 0 0 blue\n", 423 | "1 9.0 9 0 1 0 green\n", 424 | "2 4.0 1 0 0 1 red\n", 425 | "9 19.0 47 1 0 0 blue\n", 426 | "0 4.0 3 0 0 1 red\n", 427 | "5 7.0 10 0 1 0 green\n", 428 | "7 7.5 8 0 1 0 green\n", 429 | "6 4.0 2 0 0 1 red" 430 | ] 431 | }, 432 | "execution_count": 10, 433 | "metadata": {}, 434 | "output_type": "execute_result" 435 | } 436 | ], 437 | "source": [ 438 | "pd.get_dummies(X_train).join(X_train['color'])" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 11, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/html": [ 449 | "
\n", 450 | "\n", 463 | "\n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | "
colorheightpetals
4red4.01
3green8.08
\n", 487 | "
" 488 | ], 489 | "text/plain": [ 490 | " color height petals\n", 491 | "4 red 4.0 1\n", 492 | "3 green 8.0 8" 493 | ] 494 | }, 495 | "execution_count": 11, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "X_test" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 12, 507 | "metadata": {}, 508 | "outputs": [ 509 | { 510 | "data": { 511 | "text/html": [ 512 | "
\n", 513 | "\n", 526 | "\n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | "
heightpetalscolor_greencolor_red
44.0101
38.0810
\n", 553 | "
" 554 | ], 555 | "text/plain": [ 556 | " height petals color_green color_red\n", 557 | "4 4.0 1 0 1\n", 558 | "3 8.0 8 1 0" 559 | ] 560 | }, 561 | "execution_count": 12, 562 | "metadata": {}, 563 | "output_type": "execute_result" 564 | } 565 | ], 566 | "source": [ 567 | "pd.get_dummies(X_test)" 568 | ] 569 | } 570 | ], 571 | "metadata": { 572 | "kernelspec": { 573 | "display_name": "Python 3", 574 | "language": "python", 575 | "name": "python3" 576 | }, 577 | "language_info": { 578 | "codemirror_mode": { 579 | "name": "ipython", 580 | "version": 3 581 | }, 582 | "file_extension": ".py", 583 | "mimetype": "text/x-python", 584 | "name": "python", 585 | "nbconvert_exporter": "python", 586 | "pygments_lexer": "ipython3", 587 | "version": "3.7.3" 588 | } 589 | }, 590 | "nbformat": 4, 591 | "nbformat_minor": 2 592 | } 593 | --------------------------------------------------------------------------------