├── time series
└── prophet + lightgbm.png
└── pipelines
├── Unleash the Power of Scikit-learn's Pipelines.ipynb
└── .ipynb_checkpoints
└── Unleash the Power of Scikit-learn's Pipelines-checkpoint.ipynb
/time series/prophet + lightgbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unaiLopez/towards-data-science-posts-notebooks/HEAD/time series/prophet + lightgbm.png
--------------------------------------------------------------------------------
/pipelines/Unleash the Power of Scikit-learn's Pipelines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "7c49b5f8",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "\n",
13 | "from sklearn.model_selection import train_test_split\n",
14 | "from sklearn.model_selection import GridSearchCV\n",
15 | "from sklearn.neighbors import KNeighborsClassifier\n",
16 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler\n",
17 | "from sklearn.compose import ColumnTransformer\n",
18 | "from sklearn.compose import make_column_selector\n",
19 | "from sklearn.pipeline import Pipeline\n",
20 | "from sklearn.metrics import accuracy_score, make_scorer\n",
21 | "from sklearn.impute import SimpleImputer\n",
22 | "from sklearn.cluster import KMeans"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0dc901f",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "df = pd.read_csv('../datasets/adult.csv')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "id": "cddc6a54",
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/html": [
44 | "
\n",
45 | "\n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " | \n",
62 | " age | \n",
63 | " workclass | \n",
64 | " fnlwgt | \n",
65 | " education | \n",
66 | " education.num | \n",
67 | " marital.status | \n",
68 | " occupation | \n",
69 | " relationship | \n",
70 | " race | \n",
71 | " sex | \n",
72 | " capital.gain | \n",
73 | " capital.loss | \n",
74 | " hours.per.week | \n",
75 | " native.country | \n",
76 | " income | \n",
77 | "
\n",
78 | " \n",
79 | " \n",
80 | " \n",
81 | " | 0 | \n",
82 | " 90 | \n",
83 | " ? | \n",
84 | " 77053 | \n",
85 | " HS-grad | \n",
86 | " 9 | \n",
87 | " Widowed | \n",
88 | " ? | \n",
89 | " Not-in-family | \n",
90 | " White | \n",
91 | " Female | \n",
92 | " 0 | \n",
93 | " 4356 | \n",
94 | " 40 | \n",
95 | " United-States | \n",
96 | " <=50K | \n",
97 | "
\n",
98 | " \n",
99 | " | 1 | \n",
100 | " 82 | \n",
101 | " Private | \n",
102 | " 132870 | \n",
103 | " HS-grad | \n",
104 | " 9 | \n",
105 | " Widowed | \n",
106 | " Exec-managerial | \n",
107 | " Not-in-family | \n",
108 | " White | \n",
109 | " Female | \n",
110 | " 0 | \n",
111 | " 4356 | \n",
112 | " 18 | \n",
113 | " United-States | \n",
114 | " <=50K | \n",
115 | "
\n",
116 | " \n",
117 | " | 2 | \n",
118 | " 66 | \n",
119 | " ? | \n",
120 | " 186061 | \n",
121 | " Some-college | \n",
122 | " 10 | \n",
123 | " Widowed | \n",
124 | " ? | \n",
125 | " Unmarried | \n",
126 | " Black | \n",
127 | " Female | \n",
128 | " 0 | \n",
129 | " 4356 | \n",
130 | " 40 | \n",
131 | " United-States | \n",
132 | " <=50K | \n",
133 | "
\n",
134 | " \n",
135 | " | 3 | \n",
136 | " 54 | \n",
137 | " Private | \n",
138 | " 140359 | \n",
139 | " 7th-8th | \n",
140 | " 4 | \n",
141 | " Divorced | \n",
142 | " Machine-op-inspct | \n",
143 | " Unmarried | \n",
144 | " White | \n",
145 | " Female | \n",
146 | " 0 | \n",
147 | " 3900 | \n",
148 | " 40 | \n",
149 | " United-States | \n",
150 | " <=50K | \n",
151 | "
\n",
152 | " \n",
153 | " | 4 | \n",
154 | " 41 | \n",
155 | " Private | \n",
156 | " 264663 | \n",
157 | " Some-college | \n",
158 | " 10 | \n",
159 | " Separated | \n",
160 | " Prof-specialty | \n",
161 | " Own-child | \n",
162 | " White | \n",
163 | " Female | \n",
164 | " 0 | \n",
165 | " 3900 | \n",
166 | " 40 | \n",
167 | " United-States | \n",
168 | " <=50K | \n",
169 | "
\n",
170 | " \n",
171 | " | ... | \n",
172 | " ... | \n",
173 | " ... | \n",
174 | " ... | \n",
175 | " ... | \n",
176 | " ... | \n",
177 | " ... | \n",
178 | " ... | \n",
179 | " ... | \n",
180 | " ... | \n",
181 | " ... | \n",
182 | " ... | \n",
183 | " ... | \n",
184 | " ... | \n",
185 | " ... | \n",
186 | " ... | \n",
187 | "
\n",
188 | " \n",
189 | " | 32556 | \n",
190 | " 22 | \n",
191 | " Private | \n",
192 | " 310152 | \n",
193 | " Some-college | \n",
194 | " 10 | \n",
195 | " Never-married | \n",
196 | " Protective-serv | \n",
197 | " Not-in-family | \n",
198 | " White | \n",
199 | " Male | \n",
200 | " 0 | \n",
201 | " 0 | \n",
202 | " 40 | \n",
203 | " United-States | \n",
204 | " <=50K | \n",
205 | "
\n",
206 | " \n",
207 | " | 32557 | \n",
208 | " 27 | \n",
209 | " Private | \n",
210 | " 257302 | \n",
211 | " Assoc-acdm | \n",
212 | " 12 | \n",
213 | " Married-civ-spouse | \n",
214 | " Tech-support | \n",
215 | " Wife | \n",
216 | " White | \n",
217 | " Female | \n",
218 | " 0 | \n",
219 | " 0 | \n",
220 | " 38 | \n",
221 | " United-States | \n",
222 | " <=50K | \n",
223 | "
\n",
224 | " \n",
225 | " | 32558 | \n",
226 | " 40 | \n",
227 | " Private | \n",
228 | " 154374 | \n",
229 | " HS-grad | \n",
230 | " 9 | \n",
231 | " Married-civ-spouse | \n",
232 | " Machine-op-inspct | \n",
233 | " Husband | \n",
234 | " White | \n",
235 | " Male | \n",
236 | " 0 | \n",
237 | " 0 | \n",
238 | " 40 | \n",
239 | " United-States | \n",
240 | " >50K | \n",
241 | "
\n",
242 | " \n",
243 | " | 32559 | \n",
244 | " 58 | \n",
245 | " Private | \n",
246 | " 151910 | \n",
247 | " HS-grad | \n",
248 | " 9 | \n",
249 | " Widowed | \n",
250 | " Adm-clerical | \n",
251 | " Unmarried | \n",
252 | " White | \n",
253 | " Female | \n",
254 | " 0 | \n",
255 | " 0 | \n",
256 | " 40 | \n",
257 | " United-States | \n",
258 | " <=50K | \n",
259 | "
\n",
260 | " \n",
261 | " | 32560 | \n",
262 | " 22 | \n",
263 | " Private | \n",
264 | " 201490 | \n",
265 | " HS-grad | \n",
266 | " 9 | \n",
267 | " Never-married | \n",
268 | " Adm-clerical | \n",
269 | " Own-child | \n",
270 | " White | \n",
271 | " Male | \n",
272 | " 0 | \n",
273 | " 0 | \n",
274 | " 20 | \n",
275 | " United-States | \n",
276 | " <=50K | \n",
277 | "
\n",
278 | " \n",
279 | "
\n",
280 | "
32561 rows × 15 columns
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " age workclass fnlwgt education education.num marital.status \\\n",
285 | "0 90 ? 77053 HS-grad 9 Widowed \n",
286 | "1 82 Private 132870 HS-grad 9 Widowed \n",
287 | "2 66 ? 186061 Some-college 10 Widowed \n",
288 | "3 54 Private 140359 7th-8th 4 Divorced \n",
289 | "4 41 Private 264663 Some-college 10 Separated \n",
290 | "... ... ... ... ... ... ... \n",
291 | "32556 22 Private 310152 Some-college 10 Never-married \n",
292 | "32557 27 Private 257302 Assoc-acdm 12 Married-civ-spouse \n",
293 | "32558 40 Private 154374 HS-grad 9 Married-civ-spouse \n",
294 | "32559 58 Private 151910 HS-grad 9 Widowed \n",
295 | "32560 22 Private 201490 HS-grad 9 Never-married \n",
296 | "\n",
297 | " occupation relationship race sex capital.gain \\\n",
298 | "0 ? Not-in-family White Female 0 \n",
299 | "1 Exec-managerial Not-in-family White Female 0 \n",
300 | "2 ? Unmarried Black Female 0 \n",
301 | "3 Machine-op-inspct Unmarried White Female 0 \n",
302 | "4 Prof-specialty Own-child White Female 0 \n",
303 | "... ... ... ... ... ... \n",
304 | "32556 Protective-serv Not-in-family White Male 0 \n",
305 | "32557 Tech-support Wife White Female 0 \n",
306 | "32558 Machine-op-inspct Husband White Male 0 \n",
307 | "32559 Adm-clerical Unmarried White Female 0 \n",
308 | "32560 Adm-clerical Own-child White Male 0 \n",
309 | "\n",
310 | " capital.loss hours.per.week native.country income \n",
311 | "0 4356 40 United-States <=50K \n",
312 | "1 4356 18 United-States <=50K \n",
313 | "2 4356 40 United-States <=50K \n",
314 | "3 3900 40 United-States <=50K \n",
315 | "4 3900 40 United-States <=50K \n",
316 | "... ... ... ... ... \n",
317 | "32556 0 40 United-States <=50K \n",
318 | "32557 0 38 United-States <=50K \n",
319 | "32558 0 40 United-States >50K \n",
320 | "32559 0 40 United-States <=50K \n",
321 | "32560 0 20 United-States <=50K \n",
322 | "\n",
323 | "[32561 rows x 15 columns]"
324 | ]
325 | },
326 | "execution_count": 3,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "df"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 5,
338 | "id": "d6809423",
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "#convert question mark symbols '?' to NaN\n",
343 | "df.replace('?', np.nan, inplace=True)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 6,
349 | "id": "63af7124",
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "name": "stdout",
354 | "output_type": "stream",
355 | "text": [
356 | "\n",
357 | "RangeIndex: 32561 entries, 0 to 32560\n",
358 | "Data columns (total 15 columns):\n",
359 | " # Column Non-Null Count Dtype \n",
360 | "--- ------ -------------- ----- \n",
361 | " 0 age 32561 non-null int64 \n",
362 | " 1 workclass 30725 non-null object\n",
363 | " 2 fnlwgt 32561 non-null int64 \n",
364 | " 3 education 32561 non-null object\n",
365 | " 4 education.num 32561 non-null int64 \n",
366 | " 5 marital.status 32561 non-null object\n",
367 | " 6 occupation 30718 non-null object\n",
368 | " 7 relationship 32561 non-null object\n",
369 | " 8 race 32561 non-null object\n",
370 | " 9 sex 32561 non-null object\n",
371 | " 10 capital.gain 32561 non-null int64 \n",
372 | " 11 capital.loss 32561 non-null int64 \n",
373 | " 12 hours.per.week 32561 non-null int64 \n",
374 | " 13 native.country 31978 non-null object\n",
375 | " 14 income 32561 non-null object\n",
376 | "dtypes: int64(6), object(9)\n",
377 | "memory usage: 3.7+ MB\n"
378 | ]
379 | }
380 | ],
381 | "source": [
382 | "df.info()"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 7,
388 | "id": "b4b1c3b0",
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "#map the target column from string to number\n",
393 | "le = LabelEncoder()\n",
394 | "df.income = le.fit_transform(df.income)"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 8,
400 | "id": "82d00a08",
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "#creating pipeline for numerical features\n",
405 | "numerical_pipe = Pipeline([\n",
406 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),\n",
407 | " ('scaler', StandardScaler()),\n",
408 | "])\n",
409 | "\n",
410 | "#creating pipeline for categorical features\n",
411 | "categorical_pipe = Pipeline([\n",
412 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n",
413 | " ('one_hot', OneHotEncoder(handle_unknown='ignore'))\n",
414 | "])\n",
415 | "\n",
416 | "#creating column transformer component\n",
417 | "preprocessor = ColumnTransformer([\n",
418 | " ('numerical', numerical_pipe, make_column_selector(dtype_include=['int', 'float'])),\n",
419 | " ('categorical', categorical_pipe, make_column_selector(dtype_include=['object'])),\n",
420 | "])\n",
421 | "\n",
422 | "#creating main pipeline\n",
423 | "pipe = Pipeline([\n",
424 | " ('column_transformer', preprocessor),\n",
425 | " ('model', KNeighborsClassifier())\n",
426 | "])"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 9,
432 | "id": "998c2aa2",
433 | "metadata": {},
434 | "outputs": [
435 | {
436 | "data": {
437 | "text/plain": [
438 | "0.8322166387493021"
439 | ]
440 | },
441 | "execution_count": 9,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "#creating X and y variables\n",
448 | "X = df.drop('income', axis=1)\n",
449 | "y = df.income\n",
450 | "\n",
451 | "#splitting data into train and test data\n",
452 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)\n",
453 | "\n",
454 | "#fitting pipeline with train data and predicting test data\n",
455 | "pipe.fit(X_train, y_train)\n",
456 | "predictions = pipe.predict(X_test)\n",
457 | "\n",
458 | "#checking pipeline's accuracy\n",
459 | "accuracy_score(y_test, predictions)"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 10,
465 | "id": "ff499a65",
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "data": {
470 | "text/plain": [
471 | "Pipeline(steps=[('column_transformer',\n",
472 | " ColumnTransformer(transformers=[('numerical',\n",
473 | " Pipeline(steps=[('imputer',\n",
474 | " SimpleImputer()),\n",
475 | " ('scaler',\n",
476 | " StandardScaler())]),\n",
477 | " ),\n",
478 | " ('categorical',\n",
479 | " Pipeline(steps=[('imputer',\n",
480 | " SimpleImputer(strategy='most_frequent')),\n",
481 | " ('one_hot',\n",
482 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
483 | " )])),\n",
484 | " ('model', KNeighborsClassifier())])"
485 | ]
486 | },
487 | "execution_count": 10,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "pipe"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 51,
499 | "id": "69c78f74",
500 | "metadata": {
501 | "scrolled": false
502 | },
503 | "outputs": [
504 | {
505 | "name": "stdout",
506 | "output_type": "stream",
507 | "text": [
508 | "Fitting 3 folds for each of 64 candidates, totalling 192 fits\n",
509 | "CPU times: user 8min 53s, sys: 1min 20s, total: 10min 13s\n",
510 | "Wall time: 10min 13s\n"
511 | ]
512 | },
513 | {
514 | "data": {
515 | "text/plain": [
516 | "GridSearchCV(cv=3,\n",
517 | " estimator=Pipeline(steps=[('column_transformer',\n",
518 | " ColumnTransformer(transformers=[('numerical',\n",
519 | " Pipeline(steps=[('imputer',\n",
520 | " SimpleImputer()),\n",
521 | " ('scaler',\n",
522 | " StandardScaler())]),\n",
523 | " ),\n",
524 | " ('categorical',\n",
525 | " Pipeline(steps=[('imputer',\n",
526 | " SimpleImputer(strategy='most_frequent')),...\n",
527 | " )])),\n",
528 | " ('model', KNeighborsClassifier())]),\n",
529 | " param_grid={'column_transformer__numerical__imputer__strategy': ['mean',\n",
530 | " 'median'],\n",
531 | " 'column_transformer__numerical__scaler': [StandardScaler(),\n",
532 | " MinMaxScaler()],\n",
533 | " 'model__leaf_size': [30, 40],\n",
534 | " 'model__n_neighbors': [3, 6, 10, 15],\n",
535 | " 'model__weights': ['uniform', 'distance']},\n",
536 | " scoring=make_scorer(accuracy_score), verbose=1)"
537 | ]
538 | },
539 | "execution_count": 51,
540 | "metadata": {},
541 | "output_type": "execute_result"
542 | }
543 | ],
544 | "source": [
545 | "%%time\n",
546 | "\n",
547 | "#defining the hyperparameter space for searching\n",
548 | "parameters = {\n",
549 | " 'column_transformer__numerical__imputer__strategy': ['mean', 'median'],\n",
550 | " 'column_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler()],\n",
551 | " 'model__n_neighbors': [3, 6, 10, 15],\n",
552 | " 'model__weights': ['uniform', 'distance'],\n",
553 | " 'model__leaf_size': [30, 40]\n",
554 | "}\n",
555 | "\n",
556 | "#defining a scorer and a GridSearchCV instance\n",
557 | "my_scorer = make_scorer(accuracy_score, greater_is_better=True)\n",
558 | "search = GridSearchCV(pipe, parameters, cv=3, scoring=my_scorer, n_jobs=-1, verbose=1)\n",
559 | "\n",
560 | "#search for the best hyperparameter combination within our defined hyperparameter space\n",
561 | "search.fit(X_train, y_train)"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 59,
567 | "id": "2e7988a6",
568 | "metadata": {},
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "0.8408710217755444"
574 | ]
575 | },
576 | "execution_count": 59,
577 | "metadata": {},
578 | "output_type": "execute_result"
579 | }
580 | ],
581 | "source": [
582 | "#apply the best hyperparameters and refit on the train data (set_params alone does not refit the pipeline)\n",
583 | "pipe.set_params(**search.best_params_).fit(X_train, y_train)\n",
584 | "\n",
585 | "#making predictions\n",
586 | "predictions = pipe.predict(X_test)\n",
587 | "\n",
588 | "#checking accuracy\n",
589 | "accuracy_score(y_test, predictions)"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 60,
595 | "id": "800f1b76",
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "data": {
600 | "text/plain": [
601 | "Pipeline(steps=[('column_transformer',\n",
602 | " ColumnTransformer(transformers=[('numerical',\n",
603 | " Pipeline(steps=[('imputer',\n",
604 | " SimpleImputer()),\n",
605 | " ('scaler',\n",
606 | " StandardScaler())]),\n",
607 | " ),\n",
608 | " ('categorical',\n",
609 | " Pipeline(steps=[('imputer',\n",
610 | " SimpleImputer(strategy='most_frequent')),\n",
611 | " ('one_hot',\n",
612 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
613 | " )])),\n",
614 | " ('model', KNeighborsClassifier(n_neighbors=15))])"
615 | ]
616 | },
617 | "execution_count": 60,
618 | "metadata": {},
619 | "output_type": "execute_result"
620 | }
621 | ],
622 | "source": [
623 | "pipe"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "id": "d14602a1",
630 | "metadata": {},
631 | "outputs": [],
632 | "source": []
633 | }
634 | ],
635 | "metadata": {
636 | "kernelspec": {
637 | "display_name": "ml",
638 | "language": "python",
639 | "name": "ml"
640 | },
641 | "language_info": {
642 | "codemirror_mode": {
643 | "name": "ipython",
644 | "version": 3
645 | },
646 | "file_extension": ".py",
647 | "mimetype": "text/x-python",
648 | "name": "python",
649 | "nbconvert_exporter": "python",
650 | "pygments_lexer": "ipython3",
651 | "version": "3.7.6"
652 | }
653 | },
654 | "nbformat": 4,
655 | "nbformat_minor": 5
656 | }
657 |
--------------------------------------------------------------------------------
/pipelines/.ipynb_checkpoints/Unleash the Power of Scikit-learn's Pipelines-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "7c49b5f8",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "\n",
13 | "from sklearn.model_selection import train_test_split\n",
14 | "from sklearn.model_selection import GridSearchCV\n",
15 | "from sklearn.neighbors import KNeighborsClassifier\n",
16 | "from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler\n",
17 | "from sklearn.compose import ColumnTransformer\n",
18 | "from sklearn.compose import make_column_selector\n",
19 | "from sklearn.pipeline import Pipeline\n",
20 | "from sklearn.metrics import accuracy_score, make_scorer\n",
21 | "from sklearn.impute import SimpleImputer\n",
22 | "from sklearn.cluster import KMeans"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0dc901f",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "df = pd.read_csv('../datasets/adult.csv')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "id": "cddc6a54",
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/html": [
44 | "\n",
45 | "\n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " | \n",
62 | " age | \n",
63 | " workclass | \n",
64 | " fnlwgt | \n",
65 | " education | \n",
66 | " education.num | \n",
67 | " marital.status | \n",
68 | " occupation | \n",
69 | " relationship | \n",
70 | " race | \n",
71 | " sex | \n",
72 | " capital.gain | \n",
73 | " capital.loss | \n",
74 | " hours.per.week | \n",
75 | " native.country | \n",
76 | " income | \n",
77 | "
\n",
78 | " \n",
79 | " \n",
80 | " \n",
81 | " | 0 | \n",
82 | " 90 | \n",
83 | " ? | \n",
84 | " 77053 | \n",
85 | " HS-grad | \n",
86 | " 9 | \n",
87 | " Widowed | \n",
88 | " ? | \n",
89 | " Not-in-family | \n",
90 | " White | \n",
91 | " Female | \n",
92 | " 0 | \n",
93 | " 4356 | \n",
94 | " 40 | \n",
95 | " United-States | \n",
96 | " <=50K | \n",
97 | "
\n",
98 | " \n",
99 | " | 1 | \n",
100 | " 82 | \n",
101 | " Private | \n",
102 | " 132870 | \n",
103 | " HS-grad | \n",
104 | " 9 | \n",
105 | " Widowed | \n",
106 | " Exec-managerial | \n",
107 | " Not-in-family | \n",
108 | " White | \n",
109 | " Female | \n",
110 | " 0 | \n",
111 | " 4356 | \n",
112 | " 18 | \n",
113 | " United-States | \n",
114 | " <=50K | \n",
115 | "
\n",
116 | " \n",
117 | " | 2 | \n",
118 | " 66 | \n",
119 | " ? | \n",
120 | " 186061 | \n",
121 | " Some-college | \n",
122 | " 10 | \n",
123 | " Widowed | \n",
124 | " ? | \n",
125 | " Unmarried | \n",
126 | " Black | \n",
127 | " Female | \n",
128 | " 0 | \n",
129 | " 4356 | \n",
130 | " 40 | \n",
131 | " United-States | \n",
132 | " <=50K | \n",
133 | "
\n",
134 | " \n",
135 | " | 3 | \n",
136 | " 54 | \n",
137 | " Private | \n",
138 | " 140359 | \n",
139 | " 7th-8th | \n",
140 | " 4 | \n",
141 | " Divorced | \n",
142 | " Machine-op-inspct | \n",
143 | " Unmarried | \n",
144 | " White | \n",
145 | " Female | \n",
146 | " 0 | \n",
147 | " 3900 | \n",
148 | " 40 | \n",
149 | " United-States | \n",
150 | " <=50K | \n",
151 | "
\n",
152 | " \n",
153 | " | 4 | \n",
154 | " 41 | \n",
155 | " Private | \n",
156 | " 264663 | \n",
157 | " Some-college | \n",
158 | " 10 | \n",
159 | " Separated | \n",
160 | " Prof-specialty | \n",
161 | " Own-child | \n",
162 | " White | \n",
163 | " Female | \n",
164 | " 0 | \n",
165 | " 3900 | \n",
166 | " 40 | \n",
167 | " United-States | \n",
168 | " <=50K | \n",
169 | "
\n",
170 | " \n",
171 | " | ... | \n",
172 | " ... | \n",
173 | " ... | \n",
174 | " ... | \n",
175 | " ... | \n",
176 | " ... | \n",
177 | " ... | \n",
178 | " ... | \n",
179 | " ... | \n",
180 | " ... | \n",
181 | " ... | \n",
182 | " ... | \n",
183 | " ... | \n",
184 | " ... | \n",
185 | " ... | \n",
186 | " ... | \n",
187 | "
\n",
188 | " \n",
189 | " | 32556 | \n",
190 | " 22 | \n",
191 | " Private | \n",
192 | " 310152 | \n",
193 | " Some-college | \n",
194 | " 10 | \n",
195 | " Never-married | \n",
196 | " Protective-serv | \n",
197 | " Not-in-family | \n",
198 | " White | \n",
199 | " Male | \n",
200 | " 0 | \n",
201 | " 0 | \n",
202 | " 40 | \n",
203 | " United-States | \n",
204 | " <=50K | \n",
205 | "
\n",
206 | " \n",
207 | " | 32557 | \n",
208 | " 27 | \n",
209 | " Private | \n",
210 | " 257302 | \n",
211 | " Assoc-acdm | \n",
212 | " 12 | \n",
213 | " Married-civ-spouse | \n",
214 | " Tech-support | \n",
215 | " Wife | \n",
216 | " White | \n",
217 | " Female | \n",
218 | " 0 | \n",
219 | " 0 | \n",
220 | " 38 | \n",
221 | " United-States | \n",
222 | " <=50K | \n",
223 | "
\n",
224 | " \n",
225 | " | 32558 | \n",
226 | " 40 | \n",
227 | " Private | \n",
228 | " 154374 | \n",
229 | " HS-grad | \n",
230 | " 9 | \n",
231 | " Married-civ-spouse | \n",
232 | " Machine-op-inspct | \n",
233 | " Husband | \n",
234 | " White | \n",
235 | " Male | \n",
236 | " 0 | \n",
237 | " 0 | \n",
238 | " 40 | \n",
239 | " United-States | \n",
240 | " >50K | \n",
241 | "
\n",
242 | " \n",
243 | " | 32559 | \n",
244 | " 58 | \n",
245 | " Private | \n",
246 | " 151910 | \n",
247 | " HS-grad | \n",
248 | " 9 | \n",
249 | " Widowed | \n",
250 | " Adm-clerical | \n",
251 | " Unmarried | \n",
252 | " White | \n",
253 | " Female | \n",
254 | " 0 | \n",
255 | " 0 | \n",
256 | " 40 | \n",
257 | " United-States | \n",
258 | " <=50K | \n",
259 | "
\n",
260 | " \n",
261 | " | 32560 | \n",
262 | " 22 | \n",
263 | " Private | \n",
264 | " 201490 | \n",
265 | " HS-grad | \n",
266 | " 9 | \n",
267 | " Never-married | \n",
268 | " Adm-clerical | \n",
269 | " Own-child | \n",
270 | " White | \n",
271 | " Male | \n",
272 | " 0 | \n",
273 | " 0 | \n",
274 | " 20 | \n",
275 | " United-States | \n",
276 | " <=50K | \n",
277 | "
\n",
278 | " \n",
279 | "
\n",
280 | "
32561 rows × 15 columns
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " age workclass fnlwgt education education.num marital.status \\\n",
285 | "0 90 ? 77053 HS-grad 9 Widowed \n",
286 | "1 82 Private 132870 HS-grad 9 Widowed \n",
287 | "2 66 ? 186061 Some-college 10 Widowed \n",
288 | "3 54 Private 140359 7th-8th 4 Divorced \n",
289 | "4 41 Private 264663 Some-college 10 Separated \n",
290 | "... ... ... ... ... ... ... \n",
291 | "32556 22 Private 310152 Some-college 10 Never-married \n",
292 | "32557 27 Private 257302 Assoc-acdm 12 Married-civ-spouse \n",
293 | "32558 40 Private 154374 HS-grad 9 Married-civ-spouse \n",
294 | "32559 58 Private 151910 HS-grad 9 Widowed \n",
295 | "32560 22 Private 201490 HS-grad 9 Never-married \n",
296 | "\n",
297 | " occupation relationship race sex capital.gain \\\n",
298 | "0 ? Not-in-family White Female 0 \n",
299 | "1 Exec-managerial Not-in-family White Female 0 \n",
300 | "2 ? Unmarried Black Female 0 \n",
301 | "3 Machine-op-inspct Unmarried White Female 0 \n",
302 | "4 Prof-specialty Own-child White Female 0 \n",
303 | "... ... ... ... ... ... \n",
304 | "32556 Protective-serv Not-in-family White Male 0 \n",
305 | "32557 Tech-support Wife White Female 0 \n",
306 | "32558 Machine-op-inspct Husband White Male 0 \n",
307 | "32559 Adm-clerical Unmarried White Female 0 \n",
308 | "32560 Adm-clerical Own-child White Male 0 \n",
309 | "\n",
310 | " capital.loss hours.per.week native.country income \n",
311 | "0 4356 40 United-States <=50K \n",
312 | "1 4356 18 United-States <=50K \n",
313 | "2 4356 40 United-States <=50K \n",
314 | "3 3900 40 United-States <=50K \n",
315 | "4 3900 40 United-States <=50K \n",
316 | "... ... ... ... ... \n",
317 | "32556 0 40 United-States <=50K \n",
318 | "32557 0 38 United-States <=50K \n",
319 | "32558 0 40 United-States >50K \n",
320 | "32559 0 40 United-States <=50K \n",
321 | "32560 0 20 United-States <=50K \n",
322 | "\n",
323 | "[32561 rows x 15 columns]"
324 | ]
325 | },
326 | "execution_count": 3,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "df"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 5,
338 | "id": "d6809423",
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "#convert question mark symbols '?' to NaN\n",
343 | "df.replace('?', np.nan, inplace=True)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 6,
349 | "id": "63af7124",
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "name": "stdout",
354 | "output_type": "stream",
355 | "text": [
356 | "\n",
357 | "RangeIndex: 32561 entries, 0 to 32560\n",
358 | "Data columns (total 15 columns):\n",
359 | " # Column Non-Null Count Dtype \n",
360 | "--- ------ -------------- ----- \n",
361 | " 0 age 32561 non-null int64 \n",
362 | " 1 workclass 30725 non-null object\n",
363 | " 2 fnlwgt 32561 non-null int64 \n",
364 | " 3 education 32561 non-null object\n",
365 | " 4 education.num 32561 non-null int64 \n",
366 | " 5 marital.status 32561 non-null object\n",
367 | " 6 occupation 30718 non-null object\n",
368 | " 7 relationship 32561 non-null object\n",
369 | " 8 race 32561 non-null object\n",
370 | " 9 sex 32561 non-null object\n",
371 | " 10 capital.gain 32561 non-null int64 \n",
372 | " 11 capital.loss 32561 non-null int64 \n",
373 | " 12 hours.per.week 32561 non-null int64 \n",
374 | " 13 native.country 31978 non-null object\n",
375 | " 14 income 32561 non-null object\n",
376 | "dtypes: int64(6), object(9)\n",
377 | "memory usage: 3.7+ MB\n"
378 | ]
379 | }
380 | ],
381 | "source": [
382 | "df.info()"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 7,
388 | "id": "b4b1c3b0",
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "#map the target column from string to number\n",
393 | "le = LabelEncoder()\n",
394 | "df.income = le.fit_transform(df.income)"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 8,
400 | "id": "82d00a08",
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "#creating pipeline for numerical features\n",
405 | "numerical_pipe = Pipeline([\n",
406 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),\n",
407 | " ('scaler', StandardScaler()),\n",
408 | "])\n",
409 | "\n",
410 | "#creating pipeline for categorical features\n",
411 | "categorical_pipe = Pipeline([\n",
412 | " ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n",
413 | " ('one_hot', OneHotEncoder(handle_unknown='ignore'))\n",
414 | "])\n",
415 | "\n",
416 | "#creating column transformer component\n",
417 | "preprocessor = ColumnTransformer([\n",
418 | " ('numerical', numerical_pipe, make_column_selector(dtype_include=['int', 'float'])),\n",
419 | " ('categorical', categorical_pipe, make_column_selector(dtype_include=['object'])),\n",
420 | "])\n",
421 | "\n",
422 | "#creating main pipeline\n",
423 | "pipe = Pipeline([\n",
424 | " ('column_transformer', preprocessor),\n",
425 | " ('model', KNeighborsClassifier())\n",
426 | "])"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 9,
432 | "id": "998c2aa2",
433 | "metadata": {},
434 | "outputs": [
435 | {
436 | "data": {
437 | "text/plain": [
438 | "0.8322166387493021"
439 | ]
440 | },
441 | "execution_count": 9,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "#creating X and y variables\n",
448 | "X = df.drop('income', axis=1)\n",
449 | "y = df.income\n",
450 | "\n",
451 | "#splitting data into train and test data\n",
452 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)\n",
453 | "\n",
454 | "#fitting pipeline with train data and predicting test data\n",
455 | "pipe.fit(X_train, y_train)\n",
456 | "predictions = pipe.predict(X_test)\n",
457 | "\n",
458 | "#checking pipeline's accuracy\n",
459 | "accuracy_score(y_test, predictions)"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 10,
465 | "id": "ff499a65",
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "data": {
470 | "text/plain": [
471 | "Pipeline(steps=[('column_transformer',\n",
472 | " ColumnTransformer(transformers=[('numerical',\n",
473 | " Pipeline(steps=[('imputer',\n",
474 | " SimpleImputer()),\n",
475 | " ('scaler',\n",
476 | " StandardScaler())]),\n",
477 | " ),\n",
478 | " ('categorical',\n",
479 | " Pipeline(steps=[('imputer',\n",
480 | " SimpleImputer(strategy='most_frequent')),\n",
481 | " ('one_hot',\n",
482 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
483 | " )])),\n",
484 | " ('model', KNeighborsClassifier())])"
485 | ]
486 | },
487 | "execution_count": 10,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "pipe"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 51,
499 | "id": "69c78f74",
500 | "metadata": {
501 | "scrolled": false
502 | },
503 | "outputs": [
504 | {
505 | "name": "stdout",
506 | "output_type": "stream",
507 | "text": [
508 | "Fitting 3 folds for each of 64 candidates, totalling 192 fits\n",
509 | "CPU times: user 8min 53s, sys: 1min 20s, total: 10min 13s\n",
510 | "Wall time: 10min 13s\n"
511 | ]
512 | },
513 | {
514 | "data": {
515 | "text/plain": [
516 | "GridSearchCV(cv=3,\n",
517 | " estimator=Pipeline(steps=[('column_transformer',\n",
518 | " ColumnTransformer(transformers=[('numerical',\n",
519 | " Pipeline(steps=[('imputer',\n",
520 | " SimpleImputer()),\n",
521 | " ('scaler',\n",
522 | " StandardScaler())]),\n",
523 | " ),\n",
524 | " ('categorical',\n",
525 | " Pipeline(steps=[('imputer',\n",
526 | " SimpleImputer(strategy='most_frequent')),...\n",
527 | " )])),\n",
528 | " ('model', KNeighborsClassifier())]),\n",
529 | " param_grid={'column_transformer__numerical__imputer__strategy': ['mean',\n",
530 | " 'median'],\n",
531 | " 'column_transformer__numerical__scaler': [StandardScaler(),\n",
532 | " MinMaxScaler()],\n",
533 | " 'model__leaf_size': [30, 40],\n",
534 | " 'model__n_neighbors': [3, 6, 10, 15],\n",
535 | " 'model__weights': ['uniform', 'distance']},\n",
536 | " scoring=make_scorer(accuracy_score), verbose=1)"
537 | ]
538 | },
539 | "execution_count": 51,
540 | "metadata": {},
541 | "output_type": "execute_result"
542 | }
543 | ],
544 | "source": [
545 | "%%time\n",
546 | "\n",
547 | "#defining the hyperparameter space for searching\n",
548 | "parameters = {\n",
549 | " 'column_transformer__numerical__imputer__strategy': ['mean', 'median'],\n",
550 | " 'column_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler()],\n",
551 | " 'model__n_neighbors': [3, 6, 10, 15],\n",
552 | " 'model__weights': ['uniform', 'distance'],\n",
553 | " 'model__leaf_size': [30, 40]\n",
554 | "}\n",
555 | "\n",
556 | "#defining a scorer and a GridSearchCV instance\n",
557 | "my_scorer = make_scorer(accuracy_score, greater_is_better=True)\n",
558 | "search = GridSearchCV(pipe, parameters, cv=3, scoring=my_scorer, n_jobs=-1, verbose=1)\n",
559 | "\n",
560 | "#search for the best hyperparameter combination within our defined hyperparameter space\n",
561 | "search.fit(X_train, y_train)"
562 | ]
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": 59,
567 | "id": "2e7988a6",
568 | "metadata": {},
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "0.8408710217755444"
574 | ]
575 | },
576 | "execution_count": 59,
577 | "metadata": {},
578 | "output_type": "execute_result"
579 | }
580 | ],
581 | "source": [
582 | "#apply the best hyperparameters and refit on the train data (set_params alone does not refit the pipeline)\n",
583 | "pipe.set_params(**search.best_params_).fit(X_train, y_train)\n",
584 | "\n",
585 | "#making predictions\n",
586 | "predictions = pipe.predict(X_test)\n",
587 | "\n",
588 | "#checking accuracy\n",
589 | "accuracy_score(y_test, predictions)"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 60,
595 | "id": "800f1b76",
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "data": {
600 | "text/plain": [
601 | "Pipeline(steps=[('column_transformer',\n",
602 | " ColumnTransformer(transformers=[('numerical',\n",
603 | " Pipeline(steps=[('imputer',\n",
604 | " SimpleImputer()),\n",
605 | " ('scaler',\n",
606 | " StandardScaler())]),\n",
607 | " ),\n",
608 | " ('categorical',\n",
609 | " Pipeline(steps=[('imputer',\n",
610 | " SimpleImputer(strategy='most_frequent')),\n",
611 | " ('one_hot',\n",
612 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
613 | " )])),\n",
614 | " ('model', KNeighborsClassifier(n_neighbors=15))])"
615 | ]
616 | },
617 | "execution_count": 60,
618 | "metadata": {},
619 | "output_type": "execute_result"
620 | }
621 | ],
622 | "source": [
623 | "pipe"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": null,
629 | "id": "d14602a1",
630 | "metadata": {},
631 | "outputs": [],
632 | "source": []
633 | }
634 | ],
635 | "metadata": {
636 | "kernelspec": {
637 | "display_name": "ml",
638 | "language": "python",
639 | "name": "ml"
640 | },
641 | "language_info": {
642 | "codemirror_mode": {
643 | "name": "ipython",
644 | "version": 3
645 | },
646 | "file_extension": ".py",
647 | "mimetype": "text/x-python",
648 | "name": "python",
649 | "nbconvert_exporter": "python",
650 | "pygments_lexer": "ipython3",
651 | "version": "3.7.6"
652 | }
653 | },
654 | "nbformat": 4,
655 | "nbformat_minor": 5
656 | }
657 |
--------------------------------------------------------------------------------