├── time series └── prophet + lightgbm.png └── pipelines ├── Unleash the Power of Scikit-learn's Pipelines.ipynb └── .ipynb_checkpoints └── Unleash the Power of Scikit-learn's Pipelines-checkpoint.ipynb /time series/prophet + lightgbm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unaiLopez/towards-data-science-posts-notebooks/HEAD/time series/prophet + lightgbm.png -------------------------------------------------------------------------------- /pipelines/Unleash the Power of Scikit-learn's Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7c49b5f8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.model_selection import GridSearchCV\n", 15 | "from sklearn.neighbors import KNeighborsClassifier\n", 16 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler\n", 17 | "from sklearn.compose import ColumnTransformer\n", 18 | "from sklearn.compose import make_column_selector\n", 19 | "from sklearn.pipeline import Pipeline\n", 20 | "from sklearn.metrics import accuracy_score, make_scorer\n", 21 | "from sklearn.impute import SimpleImputer\n", 22 | "from sklearn.cluster import KMeans" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0dc901f", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "df = pd.read_csv('../datasets/adult.csv')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "id": "cddc6a54", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " 
\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
ageworkclassfnlwgteducationeducation.nummarital.statusoccupationrelationshipracesexcapital.gaincapital.losshours.per.weeknative.countryincome
090?77053HS-grad9Widowed?Not-in-familyWhiteFemale0435640United-States<=50K
182Private132870HS-grad9WidowedExec-managerialNot-in-familyWhiteFemale0435618United-States<=50K
266?186061Some-college10Widowed?UnmarriedBlackFemale0435640United-States<=50K
354Private1403597th-8th4DivorcedMachine-op-inspctUnmarriedWhiteFemale0390040United-States<=50K
441Private264663Some-college10SeparatedProf-specialtyOwn-childWhiteFemale0390040United-States<=50K
................................................
3255622Private310152Some-college10Never-marriedProtective-servNot-in-familyWhiteMale0040United-States<=50K
3255727Private257302Assoc-acdm12Married-civ-spouseTech-supportWifeWhiteFemale0038United-States<=50K
3255840Private154374HS-grad9Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040United-States>50K
3255958Private151910HS-grad9WidowedAdm-clericalUnmarriedWhiteFemale0040United-States<=50K
3256022Private201490HS-grad9Never-marriedAdm-clericalOwn-childWhiteMale0020United-States<=50K
\n", 280 | "

32561 rows × 15 columns

\n", 281 | "
" 282 | ], 283 | "text/plain": [ 284 | " age workclass fnlwgt education education.num marital.status \\\n", 285 | "0 90 ? 77053 HS-grad 9 Widowed \n", 286 | "1 82 Private 132870 HS-grad 9 Widowed \n", 287 | "2 66 ? 186061 Some-college 10 Widowed \n", 288 | "3 54 Private 140359 7th-8th 4 Divorced \n", 289 | "4 41 Private 264663 Some-college 10 Separated \n", 290 | "... ... ... ... ... ... ... \n", 291 | "32556 22 Private 310152 Some-college 10 Never-married \n", 292 | "32557 27 Private 257302 Assoc-acdm 12 Married-civ-spouse \n", 293 | "32558 40 Private 154374 HS-grad 9 Married-civ-spouse \n", 294 | "32559 58 Private 151910 HS-grad 9 Widowed \n", 295 | "32560 22 Private 201490 HS-grad 9 Never-married \n", 296 | "\n", 297 | " occupation relationship race sex capital.gain \\\n", 298 | "0 ? Not-in-family White Female 0 \n", 299 | "1 Exec-managerial Not-in-family White Female 0 \n", 300 | "2 ? Unmarried Black Female 0 \n", 301 | "3 Machine-op-inspct Unmarried White Female 0 \n", 302 | "4 Prof-specialty Own-child White Female 0 \n", 303 | "... ... ... ... ... ... \n", 304 | "32556 Protective-serv Not-in-family White Male 0 \n", 305 | "32557 Tech-support Wife White Female 0 \n", 306 | "32558 Machine-op-inspct Husband White Male 0 \n", 307 | "32559 Adm-clerical Unmarried White Female 0 \n", 308 | "32560 Adm-clerical Own-child White Male 0 \n", 309 | "\n", 310 | " capital.loss hours.per.week native.country income \n", 311 | "0 4356 40 United-States <=50K \n", 312 | "1 4356 18 United-States <=50K \n", 313 | "2 4356 40 United-States <=50K \n", 314 | "3 3900 40 United-States <=50K \n", 315 | "4 3900 40 United-States <=50K \n", 316 | "... ... ... ... ... 
\n", 317 | "32556 0 40 United-States <=50K \n", 318 | "32557 0 38 United-States <=50K \n", 319 | "32558 0 40 United-States >50K \n", 320 | "32559 0 40 United-States <=50K \n", 321 | "32560 0 20 United-States <=50K \n", 322 | "\n", 323 | "[32561 rows x 15 columns]" 324 | ] 325 | }, 326 | "execution_count": 3, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "df" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 5, 338 | "id": "d6809423", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "#convert question mark symbols '?' to NaN\n", 343 | "df.replace('?', np.nan, inplace=True)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 6, 349 | "id": "63af7124", 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "\n", 357 | "RangeIndex: 32561 entries, 0 to 32560\n", 358 | "Data columns (total 15 columns):\n", 359 | " # Column Non-Null Count Dtype \n", 360 | "--- ------ -------------- ----- \n", 361 | " 0 age 32561 non-null int64 \n", 362 | " 1 workclass 30725 non-null object\n", 363 | " 2 fnlwgt 32561 non-null int64 \n", 364 | " 3 education 32561 non-null object\n", 365 | " 4 education.num 32561 non-null int64 \n", 366 | " 5 marital.status 32561 non-null object\n", 367 | " 6 occupation 30718 non-null object\n", 368 | " 7 relationship 32561 non-null object\n", 369 | " 8 race 32561 non-null object\n", 370 | " 9 sex 32561 non-null object\n", 371 | " 10 capital.gain 32561 non-null int64 \n", 372 | " 11 capital.loss 32561 non-null int64 \n", 373 | " 12 hours.per.week 32561 non-null int64 \n", 374 | " 13 native.country 31978 non-null object\n", 375 | " 14 income 32561 non-null object\n", 376 | "dtypes: int64(6), object(9)\n", 377 | "memory usage: 3.7+ MB\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "df.info()" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | 
"execution_count": 7, 388 | "id": "b4b1c3b0", 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "#map the target column from string to number\n", 393 | "le = LabelEncoder()\n", 394 | "df.income = le.fit_transform(df.income)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 8, 400 | "id": "82d00a08", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "#creating pipeline for numerical features\n", 405 | "numerical_pipe = Pipeline([\n", 406 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),\n", 407 | " ('scaler', StandardScaler()),\n", 408 | "])\n", 409 | "\n", 410 | "#creating pipeline for categorical features\n", 411 | "categorical_pipe = Pipeline([\n", 412 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n", 413 | " ('one_hot', OneHotEncoder(handle_unknown='ignore'))\n", 414 | "])\n", 415 | "\n", 416 | "#creating column transformer component\n", 417 | "preprocessor = ColumnTransformer([\n", 418 | " ('numerical', numerical_pipe, make_column_selector(dtype_include=['int', 'float'])),\n", 419 | " ('categorical', categorical_pipe, make_column_selector(dtype_include=['object'])),\n", 420 | "])\n", 421 | "\n", 422 | "#creating main pipeline\n", 423 | "pipe = Pipeline([\n", 424 | " ('column_transformer', preprocessor),\n", 425 | " ('model', KNeighborsClassifier())\n", 426 | "])" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 9, 432 | "id": "998c2aa2", 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "0.8322166387493021" 439 | ] 440 | }, 441 | "execution_count": 9, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "#creating X and y variables\n", 448 | "X = df.drop('income', axis=1)\n", 449 | "y = df.income\n", 450 | "\n", 451 | "#spliting data into train and test data\n", 452 | "X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size=0.33, random_state=42)\n", 453 | "\n", 454 | "#fitting pipeline with train data and predicting test data\n", 455 | "pipe.fit(X_train, y_train)\n", 456 | "predictions = pipe.predict(X_test)\n", 457 | "\n", 458 | "#checking pipeline's accuracy\n", 459 | "accuracy_score(y_test, predictions)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 10, 465 | "id": "ff499a65", 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "Pipeline(steps=[('column_transformer',\n", 472 | " ColumnTransformer(transformers=[('numerical',\n", 473 | " Pipeline(steps=[('imputer',\n", 474 | " SimpleImputer()),\n", 475 | " ('scaler',\n", 476 | " StandardScaler())]),\n", 477 | " ),\n", 478 | " ('categorical',\n", 479 | " Pipeline(steps=[('imputer',\n", 480 | " SimpleImputer(strategy='most_frequent')),\n", 481 | " ('one_hot',\n", 482 | " OneHotEncoder(handle_unknown='ignore'))]),\n", 483 | " )])),\n", 484 | " ('model', KNeighborsClassifier())])" 485 | ] 486 | }, 487 | "execution_count": 10, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "pipe" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 51, 499 | "id": "69c78f74", 500 | "metadata": { 501 | "scrolled": false 502 | }, 503 | "outputs": [ 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | "Fitting 3 folds for each of 64 candidates, totalling 192 fits\n", 509 | "CPU times: user 8min 53s, sys: 1min 20s, total: 10min 13s\n", 510 | "Wall time: 10min 13s\n" 511 | ] 512 | }, 513 | { 514 | "data": { 515 | "text/plain": [ 516 | "GridSearchCV(cv=3,\n", 517 | " estimator=Pipeline(steps=[('column_transformer',\n", 518 | " ColumnTransformer(transformers=[('numerical',\n", 519 | " Pipeline(steps=[('imputer',\n", 520 | " SimpleImputer()),\n", 521 | " ('scaler',\n", 522 | " StandardScaler())]),\n", 523 | " ),\n", 524 | " ('categorical',\n", 525 | " 
Pipeline(steps=[('imputer',\n", 526 | " SimpleImputer(strategy='most_frequent')),...\n", 527 | " )])),\n", 528 | " ('model', KNeighborsClassifier())]),\n", 529 | " param_grid={'column_transformer__numerical__imputer__strategy': ['mean',\n", 530 | " 'median'],\n", 531 | " 'column_transformer__numerical__scaler': [StandardScaler(),\n", 532 | " MinMaxScaler()],\n", 533 | " 'model__leaf_size': [30, 40],\n", 534 | " 'model__n_neighbors': [3, 6, 10, 15],\n", 535 | " 'model__weights': ['uniform', 'distance']},\n", 536 | " scoring=make_scorer(accuracy_score), verbose=1)" 537 | ] 538 | }, 539 | "execution_count": 51, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "%%time\n", 546 | "\n", 547 | "#defining the hyperparameter space for searching\n", 548 | "parameters = {\n", 549 | " 'column_transformer__numerical__imputer__strategy': ['mean', 'median'],\n", 550 | " 'column_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler()],\n", 551 | " 'model__n_neighbors': [3, 6, 10, 15],\n", 552 | " 'model__weights': ['uniform', 'distance'],\n", 553 | " 'model__leaf_size': [30, 40]\n", 554 | "}\n", 555 | "\n", 556 | "#defining a scorer and a GridSearchCV instance\n", 557 | "my_scorer = make_scorer(accuracy_score, greater_is_better=True)\n", 558 | "search = GridSearchCV(pipe, parameters, cv=3, scoring=my_scorer, n_jobs=-1, verbose=1)\n", 559 | "\n", 560 | "#search for the best hyperparameter combination within our defined hyperparameter space\n", 561 | "search.fit(X_train, y_train)" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 59, 567 | "id": "2e7988a6", 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [ 573 | "0.8408710217755444" 574 | ] 575 | }, 576 | "execution_count": 59, 577 | "metadata": {}, 578 | "output_type": "execute_result" 579 | } 580 | ], 581 | "source": [ 582 | "#change pipeline parameters\n", 583 | "pipe.set_params(**search.best_params_)\n", 
584 | "\n", 585 | "#making predictions\n", 586 | "predictions = pipe.predict(X_test)\n", 587 | "\n", 588 | "#checking accuracy\n", 589 | "accuracy_score(y_test, predictions)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 60, 595 | "id": "800f1b76", 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "Pipeline(steps=[('column_transformer',\n", 602 | " ColumnTransformer(transformers=[('numerical',\n", 603 | " Pipeline(steps=[('imputer',\n", 604 | " SimpleImputer()),\n", 605 | " ('scaler',\n", 606 | " StandardScaler())]),\n", 607 | " ),\n", 608 | " ('categorical',\n", 609 | " Pipeline(steps=[('imputer',\n", 610 | " SimpleImputer(strategy='most_frequent')),\n", 611 | " ('one_hot',\n", 612 | " OneHotEncoder(handle_unknown='ignore'))]),\n", 613 | " )])),\n", 614 | " ('model', KNeighborsClassifier(n_neighbors=15))])" 615 | ] 616 | }, 617 | "execution_count": 60, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | "pipe" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "id": "d14602a1", 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | } 634 | ], 635 | "metadata": { 636 | "kernelspec": { 637 | "display_name": "ml", 638 | "language": "python", 639 | "name": "ml" 640 | }, 641 | "language_info": { 642 | "codemirror_mode": { 643 | "name": "ipython", 644 | "version": 3 645 | }, 646 | "file_extension": ".py", 647 | "mimetype": "text/x-python", 648 | "name": "python", 649 | "nbconvert_exporter": "python", 650 | "pygments_lexer": "ipython3", 651 | "version": "3.7.6" 652 | } 653 | }, 654 | "nbformat": 4, 655 | "nbformat_minor": 5 656 | } 657 | -------------------------------------------------------------------------------- /pipelines/.ipynb_checkpoints/Unleash the Power of Scikit-learn's Pipelines-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7c49b5f8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "from sklearn.model_selection import train_test_split\n", 14 | "from sklearn.model_selection import GridSearchCV\n", 15 | "from sklearn.neighbors import KNeighborsClassifier\n", 16 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler\n", 17 | "from sklearn.compose import ColumnTransformer\n", 18 | "from sklearn.compose import make_column_selector\n", 19 | "from sklearn.pipeline import Pipeline\n", 20 | "from sklearn.metrics import accuracy_score, make_scorer\n", 21 | "from sklearn.impute import SimpleImputer\n", 22 | "from sklearn.cluster import KMeans" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0dc901f", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "df = pd.read_csv('../datasets/adult.csv')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "id": "cddc6a54", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " 
\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
ageworkclassfnlwgteducationeducation.nummarital.statusoccupationrelationshipracesexcapital.gaincapital.losshours.per.weeknative.countryincome
090?77053HS-grad9Widowed?Not-in-familyWhiteFemale0435640United-States<=50K
182Private132870HS-grad9WidowedExec-managerialNot-in-familyWhiteFemale0435618United-States<=50K
266?186061Some-college10Widowed?UnmarriedBlackFemale0435640United-States<=50K
354Private1403597th-8th4DivorcedMachine-op-inspctUnmarriedWhiteFemale0390040United-States<=50K
441Private264663Some-college10SeparatedProf-specialtyOwn-childWhiteFemale0390040United-States<=50K
................................................
3255622Private310152Some-college10Never-marriedProtective-servNot-in-familyWhiteMale0040United-States<=50K
3255727Private257302Assoc-acdm12Married-civ-spouseTech-supportWifeWhiteFemale0038United-States<=50K
3255840Private154374HS-grad9Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040United-States>50K
3255958Private151910HS-grad9WidowedAdm-clericalUnmarriedWhiteFemale0040United-States<=50K
3256022Private201490HS-grad9Never-marriedAdm-clericalOwn-childWhiteMale0020United-States<=50K
\n", 280 | "

32561 rows × 15 columns

\n", 281 | "
" 282 | ], 283 | "text/plain": [ 284 | " age workclass fnlwgt education education.num marital.status \\\n", 285 | "0 90 ? 77053 HS-grad 9 Widowed \n", 286 | "1 82 Private 132870 HS-grad 9 Widowed \n", 287 | "2 66 ? 186061 Some-college 10 Widowed \n", 288 | "3 54 Private 140359 7th-8th 4 Divorced \n", 289 | "4 41 Private 264663 Some-college 10 Separated \n", 290 | "... ... ... ... ... ... ... \n", 291 | "32556 22 Private 310152 Some-college 10 Never-married \n", 292 | "32557 27 Private 257302 Assoc-acdm 12 Married-civ-spouse \n", 293 | "32558 40 Private 154374 HS-grad 9 Married-civ-spouse \n", 294 | "32559 58 Private 151910 HS-grad 9 Widowed \n", 295 | "32560 22 Private 201490 HS-grad 9 Never-married \n", 296 | "\n", 297 | " occupation relationship race sex capital.gain \\\n", 298 | "0 ? Not-in-family White Female 0 \n", 299 | "1 Exec-managerial Not-in-family White Female 0 \n", 300 | "2 ? Unmarried Black Female 0 \n", 301 | "3 Machine-op-inspct Unmarried White Female 0 \n", 302 | "4 Prof-specialty Own-child White Female 0 \n", 303 | "... ... ... ... ... ... \n", 304 | "32556 Protective-serv Not-in-family White Male 0 \n", 305 | "32557 Tech-support Wife White Female 0 \n", 306 | "32558 Machine-op-inspct Husband White Male 0 \n", 307 | "32559 Adm-clerical Unmarried White Female 0 \n", 308 | "32560 Adm-clerical Own-child White Male 0 \n", 309 | "\n", 310 | " capital.loss hours.per.week native.country income \n", 311 | "0 4356 40 United-States <=50K \n", 312 | "1 4356 18 United-States <=50K \n", 313 | "2 4356 40 United-States <=50K \n", 314 | "3 3900 40 United-States <=50K \n", 315 | "4 3900 40 United-States <=50K \n", 316 | "... ... ... ... ... 
\n", 317 | "32556 0 40 United-States <=50K \n", 318 | "32557 0 38 United-States <=50K \n", 319 | "32558 0 40 United-States >50K \n", 320 | "32559 0 40 United-States <=50K \n", 321 | "32560 0 20 United-States <=50K \n", 322 | "\n", 323 | "[32561 rows x 15 columns]" 324 | ] 325 | }, 326 | "execution_count": 3, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "df" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 5, 338 | "id": "d6809423", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "#convert question mark symbols '?' to NaN\n", 343 | "df.replace('?', np.nan, inplace=True)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 6, 349 | "id": "63af7124", 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "\n", 357 | "RangeIndex: 32561 entries, 0 to 32560\n", 358 | "Data columns (total 15 columns):\n", 359 | " # Column Non-Null Count Dtype \n", 360 | "--- ------ -------------- ----- \n", 361 | " 0 age 32561 non-null int64 \n", 362 | " 1 workclass 30725 non-null object\n", 363 | " 2 fnlwgt 32561 non-null int64 \n", 364 | " 3 education 32561 non-null object\n", 365 | " 4 education.num 32561 non-null int64 \n", 366 | " 5 marital.status 32561 non-null object\n", 367 | " 6 occupation 30718 non-null object\n", 368 | " 7 relationship 32561 non-null object\n", 369 | " 8 race 32561 non-null object\n", 370 | " 9 sex 32561 non-null object\n", 371 | " 10 capital.gain 32561 non-null int64 \n", 372 | " 11 capital.loss 32561 non-null int64 \n", 373 | " 12 hours.per.week 32561 non-null int64 \n", 374 | " 13 native.country 31978 non-null object\n", 375 | " 14 income 32561 non-null object\n", 376 | "dtypes: int64(6), object(9)\n", 377 | "memory usage: 3.7+ MB\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "df.info()" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | 
"execution_count": 7, 388 | "id": "b4b1c3b0", 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "#map the target column from string to number\n", 393 | "le = LabelEncoder()\n", 394 | "df.income = le.fit_transform(df.income)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 8, 400 | "id": "82d00a08", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "#creating pipeline for numerical features\n", 405 | "numerical_pipe = Pipeline([\n", 406 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),\n", 407 | " ('scaler', StandardScaler()),\n", 408 | "])\n", 409 | "\n", 410 | "#creating pipeline for categorical features\n", 411 | "categorical_pipe = Pipeline([\n", 412 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n", 413 | " ('one_hot', OneHotEncoder(handle_unknown='ignore'))\n", 414 | "])\n", 415 | "\n", 416 | "#creating column transformer component\n", 417 | "preprocessor = ColumnTransformer([\n", 418 | " ('numerical', numerical_pipe, make_column_selector(dtype_include=['int', 'float'])),\n", 419 | " ('categorical', categorical_pipe, make_column_selector(dtype_include=['object'])),\n", 420 | "])\n", 421 | "\n", 422 | "#creating main pipeline\n", 423 | "pipe = Pipeline([\n", 424 | " ('column_transformer', preprocessor),\n", 425 | " ('model', KNeighborsClassifier())\n", 426 | "])" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 9, 432 | "id": "998c2aa2", 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "0.8322166387493021" 439 | ] 440 | }, 441 | "execution_count": 9, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "#creating X and y variables\n", 448 | "X = df.drop('income', axis=1)\n", 449 | "y = df.income\n", 450 | "\n", 451 | "#spliting data into train and test data\n", 452 | "X_train, X_test, y_train, y_test = train_test_split(X, y, 
test_size=0.33, random_state=42)\n", 453 | "\n", 454 | "#fitting pipeline with train data and predicting test data\n", 455 | "pipe.fit(X_train, y_train)\n", 456 | "predictions = pipe.predict(X_test)\n", 457 | "\n", 458 | "#checking pipeline's accuracy\n", 459 | "accuracy_score(y_test, predictions)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 10, 465 | "id": "ff499a65", 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "Pipeline(steps=[('column_transformer',\n", 472 | " ColumnTransformer(transformers=[('numerical',\n", 473 | " Pipeline(steps=[('imputer',\n", 474 | " SimpleImputer()),\n", 475 | " ('scaler',\n", 476 | " StandardScaler())]),\n", 477 | " ),\n", 478 | " ('categorical',\n", 479 | " Pipeline(steps=[('imputer',\n", 480 | " SimpleImputer(strategy='most_frequent')),\n", 481 | " ('one_hot',\n", 482 | " OneHotEncoder(handle_unknown='ignore'))]),\n", 483 | " )])),\n", 484 | " ('model', KNeighborsClassifier())])" 485 | ] 486 | }, 487 | "execution_count": 10, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "pipe" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 51, 499 | "id": "69c78f74", 500 | "metadata": { 501 | "scrolled": false 502 | }, 503 | "outputs": [ 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | "Fitting 3 folds for each of 64 candidates, totalling 192 fits\n", 509 | "CPU times: user 8min 53s, sys: 1min 20s, total: 10min 13s\n", 510 | "Wall time: 10min 13s\n" 511 | ] 512 | }, 513 | { 514 | "data": { 515 | "text/plain": [ 516 | "GridSearchCV(cv=3,\n", 517 | " estimator=Pipeline(steps=[('column_transformer',\n", 518 | " ColumnTransformer(transformers=[('numerical',\n", 519 | " Pipeline(steps=[('imputer',\n", 520 | " SimpleImputer()),\n", 521 | " ('scaler',\n", 522 | " StandardScaler())]),\n", 523 | " ),\n", 524 | " ('categorical',\n", 525 | " 
Pipeline(steps=[('imputer',\n", 526 | " SimpleImputer(strategy='most_frequent')),...\n", 527 | " )])),\n", 528 | " ('model', KNeighborsClassifier())]),\n", 529 | " param_grid={'column_transformer__numerical__imputer__strategy': ['mean',\n", 530 | " 'median'],\n", 531 | " 'column_transformer__numerical__scaler': [StandardScaler(),\n", 532 | " MinMaxScaler()],\n", 533 | " 'model__leaf_size': [30, 40],\n", 534 | " 'model__n_neighbors': [3, 6, 10, 15],\n", 535 | " 'model__weights': ['uniform', 'distance']},\n", 536 | " scoring=make_scorer(accuracy_score), verbose=1)" 537 | ] 538 | }, 539 | "execution_count": 51, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "%%time\n", 546 | "\n", 547 | "#defining the hyperparameter space for searching\n", 548 | "parameters = {\n", 549 | " 'column_transformer__numerical__imputer__strategy': ['mean', 'median'],\n", 550 | " 'column_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler()],\n", 551 | " 'model__n_neighbors': [3, 6, 10, 15],\n", 552 | " 'model__weights': ['uniform', 'distance'],\n", 553 | " 'model__leaf_size': [30, 40]\n", 554 | "}\n", 555 | "\n", 556 | "#defining a scorer and a GridSearchCV instance\n", 557 | "my_scorer = make_scorer(accuracy_score, greater_is_better=True)\n", 558 | "search = GridSearchCV(pipe, parameters, cv=3, scoring=my_scorer, n_jobs=-1, verbose=1)\n", 559 | "\n", 560 | "#search for the best hyperparameter combination within our defined hyperparameter space\n", 561 | "search.fit(X_train, y_train)" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 59, 567 | "id": "2e7988a6", 568 | "metadata": {}, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [ 573 | "0.8408710217755444" 574 | ] 575 | }, 576 | "execution_count": 59, 577 | "metadata": {}, 578 | "output_type": "execute_result" 579 | } 580 | ], 581 | "source": [ 582 | "#change pipeline parameters\n", 583 | "pipe.set_params(**search.best_params_)\n", 
584 | "\n", 585 | "#making predictions\n", 586 | "predictions = pipe.predict(X_test)\n", 587 | "\n", 588 | "#checking accuracy\n", 589 | "accuracy_score(y_test, predictions)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 60, 595 | "id": "800f1b76", 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "Pipeline(steps=[('column_transformer',\n", 602 | " ColumnTransformer(transformers=[('numerical',\n", 603 | " Pipeline(steps=[('imputer',\n", 604 | " SimpleImputer()),\n", 605 | " ('scaler',\n", 606 | " StandardScaler())]),\n", 607 | " ),\n", 608 | " ('categorical',\n", 609 | " Pipeline(steps=[('imputer',\n", 610 | " SimpleImputer(strategy='most_frequent')),\n", 611 | " ('one_hot',\n", 612 | " OneHotEncoder(handle_unknown='ignore'))]),\n", 613 | " )])),\n", 614 | " ('model', KNeighborsClassifier(n_neighbors=15))])" 615 | ] 616 | }, 617 | "execution_count": 60, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | "pipe" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "id": "d14602a1", 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | } 634 | ], 635 | "metadata": { 636 | "kernelspec": { 637 | "display_name": "ml", 638 | "language": "python", 639 | "name": "ml" 640 | }, 641 | "language_info": { 642 | "codemirror_mode": { 643 | "name": "ipython", 644 | "version": 3 645 | }, 646 | "file_extension": ".py", 647 | "mimetype": "text/x-python", 648 | "name": "python", 649 | "nbconvert_exporter": "python", 650 | "pygments_lexer": "ipython3", 651 | "version": "3.7.6" 652 | } 653 | }, 654 | "nbformat": 4, 655 | "nbformat_minor": 5 656 | } 657 | --------------------------------------------------------------------------------