├── IQR method for handling outliers.ipynb
├── Iterative Imputer demo.ipynb
├── README.md
├── Winsorizing outliers.ipynb
├── column transformer part 1.ipynb
├── column transformer part 2.ipynb
├── confusion matrix metrics + ROC & PR curves.ipynb
├── cross validation.ipynb
├── data
│   ├── churn modelling.csv
│   └── income_evaluation.csv
├── exponential and log transformer.ipynb
├── grid search.ipynb
├── knn imputer.ipynb
├── label encoder.ipynb
├── maxabs scaler.ipynb
├── min max scaler.ipynb
├── missing indicator.ipynb
├── normalizer.ipynb
├── one hot encoder.ipynb
├── ordinal encoder.ipynb
├── pipeline.ipynb
├── power transformer.ipynb
├── ppts
│   ├── CM metrics 1-4.pptx
│   ├── CM metrics 5-6.pptx
│   ├── Feature scaling.pptx
│   ├── Grid Search.pptx
│   ├── KNN Imputer Algorithm.pptx
│   ├── README.md
│   ├── bias variance.pptx
│   ├── confusion matrix.pptx
│   ├── cross validation.pptx
│   ├── mcc.pptx
│   ├── mice.pptx
│   ├── outlier.pptx
│   └── roc pr auc.pptx
├── quantile transformer.ipynb
├── robust scaler.ipynb
├── simple imputer.ipynb
├── standard scaler.ipynb
├── train_test_split.ipynb
├── why NEVER use pd.get_dummies.ipynb
└── z score for handling outliers.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # machineLearning
2 | This repository contains the code notebooks and datasets used in my Machine Learning tutorial videos on YouTube. The full playlist is available here: https://www.youtube.com/playlist?list=PLlg4M31xJeYa7XcJZWypot8l7R-0E65Ls
3 |
--------------------------------------------------------------------------------
/knn imputer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "from sklearn.model_selection import train_test_split"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "
\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " age | \n",
41 | " workclass | \n",
42 | " fnlwgt | \n",
43 | " education | \n",
44 | " education-num | \n",
45 | " marital-status | \n",
46 | " occupation | \n",
47 | " relationship | \n",
48 | " race | \n",
49 | " sex | \n",
50 | " capital-gain | \n",
51 | " capital-loss | \n",
52 | " hours-per-week | \n",
53 | " native-country | \n",
54 | " income | \n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " \n",
59 | " 0 | \n",
60 | " 39 | \n",
61 | " State-gov | \n",
62 | " 77516 | \n",
63 | " Bachelors | \n",
64 | " 13 | \n",
65 | " Never-married | \n",
66 | " Adm-clerical | \n",
67 | " Not-in-family | \n",
68 | " White | \n",
69 | " Male | \n",
70 | " 2174 | \n",
71 | " 0 | \n",
72 | " 40 | \n",
73 | " United-States | \n",
74 | " <=50K | \n",
75 | "
\n",
76 | " \n",
77 | " 1 | \n",
78 | " 50 | \n",
79 | " Self-emp-not-inc | \n",
80 | " 83311 | \n",
81 | " Bachelors | \n",
82 | " 13 | \n",
83 | " Married-civ-spouse | \n",
84 | " Exec-managerial | \n",
85 | " Husband | \n",
86 | " White | \n",
87 | " Male | \n",
88 | " 0 | \n",
89 | " 0 | \n",
90 | " 13 | \n",
91 | " United-States | \n",
92 | " <=50K | \n",
93 | "
\n",
94 | " \n",
95 | " 2 | \n",
96 | " 38 | \n",
97 | " Private | \n",
98 | " 215646 | \n",
99 | " HS-grad | \n",
100 | " 9 | \n",
101 | " Divorced | \n",
102 | " Handlers-cleaners | \n",
103 | " Not-in-family | \n",
104 | " White | \n",
105 | " Male | \n",
106 | " 0 | \n",
107 | " 0 | \n",
108 | " 40 | \n",
109 | " United-States | \n",
110 | " <=50K | \n",
111 | "
\n",
112 | " \n",
113 | " 3 | \n",
114 | " 53 | \n",
115 | " Private | \n",
116 | " 234721 | \n",
117 | " 11th | \n",
118 | " 7 | \n",
119 | " Married-civ-spouse | \n",
120 | " Handlers-cleaners | \n",
121 | " Husband | \n",
122 | " Black | \n",
123 | " Male | \n",
124 | " 0 | \n",
125 | " 0 | \n",
126 | " 40 | \n",
127 | " United-States | \n",
128 | " <=50K | \n",
129 | "
\n",
130 | " \n",
131 | " 4 | \n",
132 | " 28 | \n",
133 | " Private | \n",
134 | " 338409 | \n",
135 | " Bachelors | \n",
136 | " 13 | \n",
137 | " Married-civ-spouse | \n",
138 | " Prof-specialty | \n",
139 | " Wife | \n",
140 | " Black | \n",
141 | " Female | \n",
142 | " 0 | \n",
143 | " 0 | \n",
144 | " 40 | \n",
145 | " Cuba | \n",
146 | " <=50K | \n",
147 | "
\n",
148 | " \n",
149 | "
\n",
150 | "
"
151 | ],
152 | "text/plain": [
153 | " age workclass fnlwgt education education-num \\\n",
154 | "0 39 State-gov 77516 Bachelors 13 \n",
155 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
156 | "2 38 Private 215646 HS-grad 9 \n",
157 | "3 53 Private 234721 11th 7 \n",
158 | "4 28 Private 338409 Bachelors 13 \n",
159 | "\n",
160 | " marital-status occupation relationship race sex \\\n",
161 | "0 Never-married Adm-clerical Not-in-family White Male \n",
162 | "1 Married-civ-spouse Exec-managerial Husband White Male \n",
163 | "2 Divorced Handlers-cleaners Not-in-family White Male \n",
164 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
165 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
166 | "\n",
167 | " capital-gain capital-loss hours-per-week native-country income \n",
168 | "0 2174 0 40 United-States <=50K \n",
169 | "1 0 0 13 United-States <=50K \n",
170 | "2 0 0 40 United-States <=50K \n",
171 | "3 0 0 40 United-States <=50K \n",
172 | "4 0 0 40 Cuba <=50K "
173 | ]
174 | },
175 | "execution_count": 2,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')\n",
182 | "df.head()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 9,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "data": {
192 | "text/plain": [
193 | "array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',\n",
194 | " ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',\n",
195 | " ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',\n",
196 | " ' Preschool', ' 12th'], dtype=object)"
197 | ]
198 | },
199 | "execution_count": 9,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "df[' education'].unique()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 3,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/plain": [
216 | "age 0\n",
217 | " workclass 1836\n",
218 | " fnlwgt 0\n",
219 | " education 0\n",
220 | " education-num 0\n",
221 | " marital-status 0\n",
222 | " occupation 1843\n",
223 | " relationship 0\n",
224 | " race 0\n",
225 | " sex 0\n",
226 | " capital-gain 0\n",
227 | " capital-loss 0\n",
228 | " hours-per-week 0\n",
229 | " native-country 583\n",
230 | " income 0\n",
231 | "dtype: int64"
232 | ]
233 | },
234 | "execution_count": 3,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "df.isna().sum()"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 4,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "# hours per week\n",
250 | "np.random.seed(seed=0)\n",
251 | "h = np.random.choice(a=df.index, replace=False, size=20)\n",
252 | "df.loc[h, ' hours-per-week'] = np.nan"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 5,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# age\n",
262 | "np.random.seed(seed=10)\n",
263 | "a = np.random.choice(a=df.index, replace=False, size=28)\n",
264 | "df.loc[a, 'age'] = np.nan"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 6,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "age 28\n",
276 | " workclass 1836\n",
277 | " fnlwgt 0\n",
278 | " education 0\n",
279 | " education-num 0\n",
280 | " marital-status 0\n",
281 | " occupation 1843\n",
282 | " relationship 0\n",
283 | " race 0\n",
284 | " sex 0\n",
285 | " capital-gain 0\n",
286 | " capital-loss 0\n",
287 | " hours-per-week 20\n",
288 | " native-country 583\n",
289 | " income 0\n",
290 | "dtype: int64"
291 | ]
292 | },
293 | "execution_count": 6,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "df.isna().sum()"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 7,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1), df[' income'],\n",
309 | " test_size=0.2, random_state=5)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 10,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "from sklearn.impute import KNNImputer"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 21,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "knn = KNNImputer(n_neighbors=5, add_indicator=True)"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 14,
333 | "metadata": {},
334 | "outputs": [
335 | {
336 | "data": {
337 | "text/plain": [
338 | "dtype('float64')"
339 | ]
340 | },
341 | "execution_count": 14,
342 | "metadata": {},
343 | "output_type": "execute_result"
344 | }
345 | ],
346 | "source": [
347 | "X_train['age'].dtypes"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 15,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',\n",
359 | " ' marital-status', ' occupation', ' relationship', ' race', ' sex',\n",
360 | " ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country'],\n",
361 | " dtype='object')"
362 | ]
363 | },
364 | "execution_count": 15,
365 | "metadata": {},
366 | "output_type": "execute_result"
367 | }
368 | ],
369 | "source": [
370 | "X_train.columns"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 17,
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "num = [col for col in X_train.columns if X_train[col].dtypes != 'O']"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 19,
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "data": {
389 | "text/html": [
390 | "\n",
391 | "\n",
404 | "
\n",
405 | " \n",
406 | " \n",
407 | " | \n",
408 | " age | \n",
409 | " fnlwgt | \n",
410 | " education-num | \n",
411 | " capital-gain | \n",
412 | " capital-loss | \n",
413 | " hours-per-week | \n",
414 | "
\n",
415 | " \n",
416 | " \n",
417 | " \n",
418 | " 21425 | \n",
419 | " 55.0 | \n",
420 | " 238216 | \n",
421 | " 9 | \n",
422 | " 0 | \n",
423 | " 0 | \n",
424 | " 40.0 | \n",
425 | "
\n",
426 | " \n",
427 | " 28707 | \n",
428 | " 24.0 | \n",
429 | " 306460 | \n",
430 | " 9 | \n",
431 | " 0 | \n",
432 | " 0 | \n",
433 | " 40.0 | \n",
434 | "
\n",
435 | " \n",
436 | " 4455 | \n",
437 | " 48.0 | \n",
438 | " 213140 | \n",
439 | " 4 | \n",
440 | " 0 | \n",
441 | " 0 | \n",
442 | " 40.0 | \n",
443 | "
\n",
444 | " \n",
445 | " 2231 | \n",
446 | " 36.0 | \n",
447 | " 127306 | \n",
448 | " 13 | \n",
449 | " 0 | \n",
450 | " 0 | \n",
451 | " 40.0 | \n",
452 | "
\n",
453 | " \n",
454 | " 18864 | \n",
455 | " 53.0 | \n",
456 | " 103586 | \n",
457 | " 13 | \n",
458 | " 0 | \n",
459 | " 0 | \n",
460 | " 55.0 | \n",
461 | "
\n",
462 | " \n",
463 | "
\n",
464 | "
"
465 | ],
466 | "text/plain": [
467 | " age fnlwgt education-num capital-gain capital-loss \\\n",
468 | "21425 55.0 238216 9 0 0 \n",
469 | "28707 24.0 306460 9 0 0 \n",
470 | "4455 48.0 213140 4 0 0 \n",
471 | "2231 36.0 127306 13 0 0 \n",
472 | "18864 53.0 103586 13 0 0 \n",
473 | "\n",
474 | " hours-per-week \n",
475 | "21425 40.0 \n",
476 | "28707 40.0 \n",
477 | "4455 40.0 \n",
478 | "2231 40.0 \n",
479 | "18864 55.0 "
480 | ]
481 | },
482 | "execution_count": 19,
483 | "metadata": {},
484 | "output_type": "execute_result"
485 | }
486 | ],
487 | "source": [
488 | "X_train[num].head()"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 22,
494 | "metadata": {},
495 | "outputs": [
496 | {
497 | "data": {
498 | "text/plain": [
499 | "KNNImputer(add_indicator=True)"
500 | ]
501 | },
502 | "execution_count": 22,
503 | "metadata": {},
504 | "output_type": "execute_result"
505 | }
506 | ],
507 | "source": [
508 | "knn.fit(X_train[num])"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 23,
514 | "metadata": {},
515 | "outputs": [
516 | {
517 | "data": {
518 | "text/plain": [
519 | "array([[5.50000e+01, 2.38216e+05, 9.00000e+00, ..., 4.00000e+01,\n",
520 | " 0.00000e+00, 0.00000e+00],\n",
521 | " [2.40000e+01, 3.06460e+05, 9.00000e+00, ..., 4.00000e+01,\n",
522 | " 0.00000e+00, 0.00000e+00],\n",
523 | " [4.80000e+01, 2.13140e+05, 4.00000e+00, ..., 4.00000e+01,\n",
524 | " 0.00000e+00, 0.00000e+00],\n",
525 | " ...,\n",
526 | " [8.50000e+01, 1.66027e+05, 9.00000e+00, ..., 5.00000e+01,\n",
527 | " 0.00000e+00, 0.00000e+00],\n",
528 | " [3.60000e+01, 4.69056e+05, 9.00000e+00, ..., 2.50000e+01,\n",
529 | " 0.00000e+00, 0.00000e+00],\n",
530 | " [2.60000e+01, 1.98163e+05, 1.40000e+01, ..., 4.00000e+01,\n",
531 | " 0.00000e+00, 0.00000e+00]])"
532 | ]
533 | },
534 | "execution_count": 23,
535 | "metadata": {},
536 | "output_type": "execute_result"
537 | }
538 | ],
539 | "source": [
540 | "knn.transform(X_train[num])"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 27,
546 | "metadata": {
547 | "scrolled": true
548 | },
549 | "outputs": [
550 | {
551 | "data": {
552 | "text/html": [
553 | "\n",
554 | "\n",
567 | "
\n",
568 | " \n",
569 | " \n",
570 | " | \n",
571 | " 0 | \n",
572 | " 1 | \n",
573 | " 2 | \n",
574 | " 3 | \n",
575 | " 4 | \n",
576 | " 5 | \n",
577 | " 6 | \n",
578 | " 7 | \n",
579 | "
\n",
580 | " \n",
581 | " \n",
582 | " \n",
583 | " 0 | \n",
584 | " 55.0 | \n",
585 | " 238216.0 | \n",
586 | " 9.0 | \n",
587 | " 0.0 | \n",
588 | " 0.0 | \n",
589 | " 40.0 | \n",
590 | " 0.0 | \n",
591 | " 0.0 | \n",
592 | "
\n",
593 | " \n",
594 | " 1 | \n",
595 | " 24.0 | \n",
596 | " 306460.0 | \n",
597 | " 9.0 | \n",
598 | " 0.0 | \n",
599 | " 0.0 | \n",
600 | " 40.0 | \n",
601 | " 0.0 | \n",
602 | " 0.0 | \n",
603 | "
\n",
604 | " \n",
605 | " 2 | \n",
606 | " 48.0 | \n",
607 | " 213140.0 | \n",
608 | " 4.0 | \n",
609 | " 0.0 | \n",
610 | " 0.0 | \n",
611 | " 40.0 | \n",
612 | " 0.0 | \n",
613 | " 0.0 | \n",
614 | "
\n",
615 | " \n",
616 | " 3 | \n",
617 | " 36.0 | \n",
618 | " 127306.0 | \n",
619 | " 13.0 | \n",
620 | " 0.0 | \n",
621 | " 0.0 | \n",
622 | " 40.0 | \n",
623 | " 0.0 | \n",
624 | " 0.0 | \n",
625 | "
\n",
626 | " \n",
627 | " 4 | \n",
628 | " 53.0 | \n",
629 | " 103586.0 | \n",
630 | " 13.0 | \n",
631 | " 0.0 | \n",
632 | " 0.0 | \n",
633 | " 55.0 | \n",
634 | " 0.0 | \n",
635 | " 0.0 | \n",
636 | "
\n",
637 | " \n",
638 | "
\n",
639 | "
"
640 | ],
641 | "text/plain": [
642 | " 0 1 2 3 4 5 6 7\n",
643 | "0 55.0 238216.0 9.0 0.0 0.0 40.0 0.0 0.0\n",
644 | "1 24.0 306460.0 9.0 0.0 0.0 40.0 0.0 0.0\n",
645 | "2 48.0 213140.0 4.0 0.0 0.0 40.0 0.0 0.0\n",
646 | "3 36.0 127306.0 13.0 0.0 0.0 40.0 0.0 0.0\n",
647 | "4 53.0 103586.0 13.0 0.0 0.0 55.0 0.0 0.0"
648 | ]
649 | },
650 | "execution_count": 27,
651 | "metadata": {},
652 | "output_type": "execute_result"
653 | }
654 | ],
655 | "source": [
656 | "pd.DataFrame(knn.transform(X_train[num])).head()"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 30,
662 | "metadata": {},
663 | "outputs": [
664 | {
665 | "data": {
666 | "text/plain": [
667 | "age 5\n",
668 | " fnlwgt 0\n",
669 | " education-num 0\n",
670 | " capital-gain 0\n",
671 | " capital-loss 0\n",
672 | " hours-per-week 1\n",
673 | "dtype: int64"
674 | ]
675 | },
676 | "execution_count": 30,
677 | "metadata": {},
678 | "output_type": "execute_result"
679 | }
680 | ],
681 | "source": [
682 | "X_test[num].isna().sum()"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 31,
688 | "metadata": {},
689 | "outputs": [
690 | {
691 | "data": {
692 | "text/plain": [
693 | "array([[3.20000e+01, 2.60954e+05, 7.00000e+00, ..., 3.00000e+01,\n",
694 | " 0.00000e+00, 0.00000e+00],\n",
695 | " [3.10000e+01, 2.36391e+05, 1.00000e+01, ..., 4.00000e+01,\n",
696 | " 0.00000e+00, 0.00000e+00],\n",
697 | " [5.90000e+01, 1.75689e+05, 1.00000e+01, ..., 1.40000e+01,\n",
698 | " 0.00000e+00, 0.00000e+00],\n",
699 | " ...,\n",
700 | " [2.60000e+01, 1.77482e+05, 1.20000e+01, ..., 4.50000e+01,\n",
701 | " 0.00000e+00, 0.00000e+00],\n",
702 | " [4.70000e+01, 2.58498e+05, 1.00000e+01, ..., 5.20000e+01,\n",
703 | " 0.00000e+00, 0.00000e+00],\n",
704 | " [4.50000e+01, 1.60962e+05, 1.00000e+01, ..., 3.50000e+01,\n",
705 | " 0.00000e+00, 0.00000e+00]])"
706 | ]
707 | },
708 | "execution_count": 31,
709 | "metadata": {},
710 | "output_type": "execute_result"
711 | }
712 | ],
713 | "source": [
714 | "knn.transform(X_test[num])"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": 33,
720 | "metadata": {},
721 | "outputs": [
722 | {
723 | "data": {
724 | "text/plain": [
725 | "0"
726 | ]
727 | },
728 | "execution_count": 33,
729 | "metadata": {},
730 | "output_type": "execute_result"
731 | }
732 | ],
733 | "source": [
734 | "pd.DataFrame(knn.transform(X_test[num])).isna().sum().sum()"
735 | ]
736 | }
737 | ],
738 | "metadata": {
739 | "kernelspec": {
740 | "display_name": "Python 3",
741 | "language": "python",
742 | "name": "python3"
743 | },
744 | "language_info": {
745 | "codemirror_mode": {
746 | "name": "ipython",
747 | "version": 3
748 | },
749 | "file_extension": ".py",
750 | "mimetype": "text/x-python",
751 | "name": "python",
752 | "nbconvert_exporter": "python",
753 | "pygments_lexer": "ipython3",
754 | "version": "3.7.3"
755 | }
756 | },
757 | "nbformat": 4,
758 | "nbformat_minor": 2
759 | }
760 |
--------------------------------------------------------------------------------
/label encoder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/html": [
21 | "\n",
22 | "\n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " | \n",
39 | " age | \n",
40 | " workclass | \n",
41 | " fnlwgt | \n",
42 | " education | \n",
43 | " education-num | \n",
44 | " marital-status | \n",
45 | " occupation | \n",
46 | " relationship | \n",
47 | " race | \n",
48 | " sex | \n",
49 | " capital-gain | \n",
50 | " capital-loss | \n",
51 | " hours-per-week | \n",
52 | " native-country | \n",
53 | " income | \n",
54 | "
\n",
55 | " \n",
56 | " \n",
57 | " \n",
58 | " 0 | \n",
59 | " 39 | \n",
60 | " State-gov | \n",
61 | " 77516 | \n",
62 | " Bachelors | \n",
63 | " 13 | \n",
64 | " Never-married | \n",
65 | " Adm-clerical | \n",
66 | " Not-in-family | \n",
67 | " White | \n",
68 | " Male | \n",
69 | " 2174 | \n",
70 | " 0 | \n",
71 | " 40 | \n",
72 | " United-States | \n",
73 | " <=50K | \n",
74 | "
\n",
75 | " \n",
76 | " 1 | \n",
77 | " 50 | \n",
78 | " Self-emp-not-inc | \n",
79 | " 83311 | \n",
80 | " Bachelors | \n",
81 | " 13 | \n",
82 | " Married-civ-spouse | \n",
83 | " Exec-managerial | \n",
84 | " Husband | \n",
85 | " White | \n",
86 | " Male | \n",
87 | " 0 | \n",
88 | " 0 | \n",
89 | " 13 | \n",
90 | " United-States | \n",
91 | " <=50K | \n",
92 | "
\n",
93 | " \n",
94 | " 2 | \n",
95 | " 38 | \n",
96 | " Private | \n",
97 | " 215646 | \n",
98 | " HS-grad | \n",
99 | " 9 | \n",
100 | " Divorced | \n",
101 | " Handlers-cleaners | \n",
102 | " Not-in-family | \n",
103 | " White | \n",
104 | " Male | \n",
105 | " 0 | \n",
106 | " 0 | \n",
107 | " 40 | \n",
108 | " United-States | \n",
109 | " <=50K | \n",
110 | "
\n",
111 | " \n",
112 | " 3 | \n",
113 | " 53 | \n",
114 | " Private | \n",
115 | " 234721 | \n",
116 | " 11th | \n",
117 | " 7 | \n",
118 | " Married-civ-spouse | \n",
119 | " Handlers-cleaners | \n",
120 | " Husband | \n",
121 | " Black | \n",
122 | " Male | \n",
123 | " 0 | \n",
124 | " 0 | \n",
125 | " 40 | \n",
126 | " United-States | \n",
127 | " <=50K | \n",
128 | "
\n",
129 | " \n",
130 | " 4 | \n",
131 | " 28 | \n",
132 | " Private | \n",
133 | " 338409 | \n",
134 | " Bachelors | \n",
135 | " 13 | \n",
136 | " Married-civ-spouse | \n",
137 | " Prof-specialty | \n",
138 | " Wife | \n",
139 | " Black | \n",
140 | " Female | \n",
141 | " 0 | \n",
142 | " 0 | \n",
143 | " 40 | \n",
144 | " Cuba | \n",
145 | " <=50K | \n",
146 | "
\n",
147 | " \n",
148 | "
\n",
149 | "
"
150 | ],
151 | "text/plain": [
152 | " age workclass fnlwgt education education-num \\\n",
153 | "0 39 State-gov 77516 Bachelors 13 \n",
154 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
155 | "2 38 Private 215646 HS-grad 9 \n",
156 | "3 53 Private 234721 11th 7 \n",
157 | "4 28 Private 338409 Bachelors 13 \n",
158 | "\n",
159 | " marital-status occupation relationship race sex \\\n",
160 | "0 Never-married Adm-clerical Not-in-family White Male \n",
161 | "1 Married-civ-spouse Exec-managerial Husband White Male \n",
162 | "2 Divorced Handlers-cleaners Not-in-family White Male \n",
163 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
164 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
165 | "\n",
166 | " capital-gain capital-loss hours-per-week native-country income \n",
167 | "0 2174 0 40 United-States <=50K \n",
168 | "1 0 0 13 United-States <=50K \n",
169 | "2 0 0 40 United-States <=50K \n",
170 | "3 0 0 40 United-States <=50K \n",
171 | "4 0 0 40 Cuba <=50K "
172 | ]
173 | },
174 | "execution_count": 2,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "df = pd.read_csv('data/income_evaluation.csv')\n",
181 | "df.head()"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 28,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/plain": [
192 | "Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',\n",
193 | " ' marital-status', ' occupation', ' relationship', ' race', ' sex',\n",
194 | " ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',\n",
195 | " ' income'],\n",
196 | " dtype='object')"
197 | ]
198 | },
199 | "execution_count": 28,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "df.columns"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 25,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/plain": [
216 | " <=50K 24720\n",
217 | " >50K 7841\n",
218 | "Name: income, dtype: int64"
219 | ]
220 | },
221 | "execution_count": 25,
222 | "metadata": {},
223 | "output_type": "execute_result"
224 | }
225 | ],
226 | "source": [
227 | "df[' income'].value_counts()"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 26,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "from sklearn.model_selection import train_test_split"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 29,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1), df[' income'],\n",
246 | " test_size=0.2, random_state=0)"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 30,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "from sklearn.preprocessing import LabelEncoder"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 31,
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "le = LabelEncoder()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 32,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/plain": [
275 | "LabelEncoder()"
276 | ]
277 | },
278 | "execution_count": 32,
279 | "metadata": {},
280 | "output_type": "execute_result"
281 | }
282 | ],
283 | "source": [
284 | "le.fit(y_train)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 33,
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "data": {
294 | "text/plain": [
295 | "array([' <=50K', ' >50K'], dtype=object)"
296 | ]
297 | },
298 | "execution_count": 33,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "le.classes_"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 37,
310 | "metadata": {
311 | "scrolled": true
312 | },
313 | "outputs": [
314 | {
315 | "data": {
316 | "text/plain": [
317 | "15282 <=50K\n",
318 | "24870 <=50K\n",
319 | "18822 <=50K\n",
320 | "26404 <=50K\n",
321 | "7842 <=50K\n",
322 | "4890 <=50K\n",
323 | "3243 <=50K\n",
324 | "17470 <=50K\n",
325 | "14211 <=50K\n",
326 | "22453 <=50K\n",
327 | "631 <=50K\n",
328 | "29051 <=50K\n",
329 | "21478 <=50K\n",
330 | "26565 <=50K\n",
331 | "25140 <=50K\n",
332 | "15497 >50K\n",
333 | "14689 <=50K\n",
334 | "18726 <=50K\n",
335 | "28105 <=50K\n",
336 | "6965 <=50K\n",
337 | "4343 >50K\n",
338 | "24308 <=50K\n",
339 | "11380 <=50K\n",
340 | "26087 <=50K\n",
341 | "5679 <=50K\n",
342 | "13019 <=50K\n",
343 | "24049 >50K\n",
344 | "32119 >50K\n",
345 | "25586 >50K\n",
346 | "26959 <=50K\n",
347 | " ... \n",
348 | "6216 >50K\n",
349 | "27469 <=50K\n",
350 | "16921 <=50K\n",
351 | "26277 >50K\n",
352 | "2897 <=50K\n",
353 | "24152 <=50K\n",
354 | "18606 <=50K\n",
355 | "10327 >50K\n",
356 | "18983 <=50K\n",
357 | "32230 >50K\n",
358 | "17089 <=50K\n",
359 | "14650 >50K\n",
360 | "19852 <=50K\n",
361 | "6744 <=50K\n",
362 | "15832 >50K\n",
363 | "15430 <=50K\n",
364 | "14935 <=50K\n",
365 | "14116 <=50K\n",
366 | "22258 <=50K\n",
367 | "20757 <=50K\n",
368 | "24275 <=50K\n",
369 | "9225 <=50K\n",
370 | "32103 <=50K\n",
371 | "30403 <=50K\n",
372 | "21243 <=50K\n",
373 | "13123 >50K\n",
374 | "19648 <=50K\n",
375 | "9845 <=50K\n",
376 | "10799 >50K\n",
377 | "2732 <=50K\n",
378 | "Name: income, Length: 26048, dtype: object"
379 | ]
380 | },
381 | "execution_count": 37,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "y_train"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 35,
393 | "metadata": {
394 | "scrolled": true
395 | },
396 | "outputs": [
397 | {
398 | "data": {
399 | "text/plain": [
400 | "0 0\n",
401 | "1 0\n",
402 | "2 0\n",
403 | "3 0\n",
404 | "4 0\n",
405 | "5 0\n",
406 | "6 0\n",
407 | "7 0\n",
408 | "8 0\n",
409 | "9 0\n",
410 | "10 0\n",
411 | "11 0\n",
412 | "12 0\n",
413 | "13 0\n",
414 | "14 0\n",
415 | "15 1\n",
416 | "16 0\n",
417 | "17 0\n",
418 | "18 0\n",
419 | "19 0\n",
420 | "20 1\n",
421 | "21 0\n",
422 | "22 0\n",
423 | "23 0\n",
424 | "24 0\n",
425 | "25 0\n",
426 | "26 1\n",
427 | "27 1\n",
428 | "28 1\n",
429 | "29 0\n",
430 | " ..\n",
431 | "26018 1\n",
432 | "26019 0\n",
433 | "26020 0\n",
434 | "26021 1\n",
435 | "26022 0\n",
436 | "26023 0\n",
437 | "26024 0\n",
438 | "26025 1\n",
439 | "26026 0\n",
440 | "26027 1\n",
441 | "26028 0\n",
442 | "26029 1\n",
443 | "26030 0\n",
444 | "26031 0\n",
445 | "26032 1\n",
446 | "26033 0\n",
447 | "26034 0\n",
448 | "26035 0\n",
449 | "26036 0\n",
450 | "26037 0\n",
451 | "26038 0\n",
452 | "26039 0\n",
453 | "26040 0\n",
454 | "26041 0\n",
455 | "26042 0\n",
456 | "26043 1\n",
457 | "26044 0\n",
458 | "26045 0\n",
459 | "26046 1\n",
460 | "26047 0\n",
461 | "Length: 26048, dtype: int32"
462 | ]
463 | },
464 | "execution_count": 35,
465 | "metadata": {},
466 | "output_type": "execute_result"
467 | }
468 | ],
469 | "source": [
470 | "pd.Series(le.transform(y_train))"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 38,
476 | "metadata": {},
477 | "outputs": [
478 | {
479 | "data": {
480 | "text/plain": [
481 | "array([0, 0, 0, ..., 1, 0, 1])"
482 | ]
483 | },
484 | "execution_count": 38,
485 | "metadata": {},
486 | "output_type": "execute_result"
487 | }
488 | ],
489 | "source": [
490 | "le.transform(y_test)"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 41,
496 | "metadata": {
497 | "scrolled": true
498 | },
499 | "outputs": [
500 | {
501 | "data": {
502 | "text/plain": [
503 | "15282 United-States\n",
504 | "24870 United-States\n",
505 | "18822 United-States\n",
506 | "26404 United-States\n",
507 | "7842 United-States\n",
508 | "4890 United-States\n",
509 | "3243 Mexico\n",
510 | "17470 United-States\n",
511 | "14211 United-States\n",
512 | "22453 United-States\n",
513 | "631 United-States\n",
514 | "29051 United-States\n",
515 | "21478 United-States\n",
516 | "26565 United-States\n",
517 | "25140 United-States\n",
518 | "15497 United-States\n",
519 | "14689 United-States\n",
520 | "18726 Mexico\n",
521 | "28105 United-States\n",
522 | "6965 United-States\n",
523 | "4343 United-States\n",
524 | "24308 United-States\n",
525 | "11380 United-States\n",
526 | "26087 Ireland\n",
527 | "5679 United-States\n",
528 | "13019 United-States\n",
529 | "24049 United-States\n",
530 | "32119 United-States\n",
531 | "25586 United-States\n",
532 | "26959 United-States\n",
533 | " ... \n",
534 | "6216 United-States\n",
535 | "27469 United-States\n",
536 | "16921 United-States\n",
537 | "26277 England\n",
538 | "2897 United-States\n",
539 | "24152 United-States\n",
540 | "18606 Nicaragua\n",
541 | "10327 United-States\n",
542 | "18983 United-States\n",
543 | "32230 United-States\n",
544 | "17089 United-States\n",
545 | "14650 United-States\n",
546 | "19852 United-States\n",
547 | "6744 United-States\n",
548 | "15832 United-States\n",
549 | "15430 United-States\n",
550 | "14935 United-States\n",
551 | "14116 United-States\n",
552 | "22258 United-States\n",
553 | "20757 United-States\n",
554 | "24275 United-States\n",
555 | "9225 England\n",
556 | "32103 United-States\n",
557 | "30403 United-States\n",
558 | "21243 United-States\n",
559 | "13123 United-States\n",
560 | "19648 United-States\n",
561 | "9845 United-States\n",
562 | "10799 United-States\n",
563 | "2732 United-States\n",
564 | "Name: native-country, Length: 26048, dtype: object"
565 | ]
566 | },
567 | "execution_count": 41,
568 | "metadata": {},
569 | "output_type": "execute_result"
570 | }
571 | ],
572 | "source": [
573 | "X_train[' native-country']"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 40,
579 | "metadata": {},
580 | "outputs": [
581 | {
582 | "data": {
583 | "text/plain": [
584 | "array([39, 39, 39, ..., 39, 39, 39])"
585 | ]
586 | },
587 | "execution_count": 40,
588 | "metadata": {},
589 | "output_type": "execute_result"
590 | }
591 | ],
592 | "source": [
593 | "le1 = LabelEncoder()\n",
594 | "le1.fit_transform(X_train[' native-country'])"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 42,
600 | "metadata": {},
601 | "outputs": [
602 | {
603 | "data": {
604 | "text/plain": [
605 | "0 39\n",
606 | "1 39\n",
607 | "2 39\n",
608 | "3 39\n",
609 | "4 39\n",
610 | "5 39\n",
611 | "6 26\n",
612 | "7 39\n",
613 | "8 39\n",
614 | "9 39\n",
615 | "10 39\n",
616 | "11 39\n",
617 | "12 39\n",
618 | "13 39\n",
619 | "14 39\n",
620 | "15 39\n",
621 | "16 39\n",
622 | "17 26\n",
623 | "18 39\n",
624 | "19 39\n",
625 | "20 39\n",
626 | "21 39\n",
627 | "22 39\n",
628 | "23 21\n",
629 | "24 39\n",
630 | "25 39\n",
631 | "26 39\n",
632 | "27 39\n",
633 | "28 39\n",
634 | "29 39\n",
635 | " ..\n",
636 | "26018 39\n",
637 | "26019 39\n",
638 | "26020 39\n",
639 | "26021 9\n",
640 | "26022 39\n",
641 | "26023 39\n",
642 | "26024 27\n",
643 | "26025 39\n",
644 | "26026 39\n",
645 | "26027 39\n",
646 | "26028 39\n",
647 | "26029 39\n",
648 | "26030 39\n",
649 | "26031 39\n",
650 | "26032 39\n",
651 | "26033 39\n",
652 | "26034 39\n",
653 | "26035 39\n",
654 | "26036 39\n",
655 | "26037 39\n",
656 | "26038 39\n",
657 | "26039 9\n",
658 | "26040 39\n",
659 | "26041 39\n",
660 | "26042 39\n",
661 | "26043 39\n",
662 | "26044 39\n",
663 | "26045 39\n",
664 | "26046 39\n",
665 | "26047 39\n",
666 | "Length: 26048, dtype: int32"
667 | ]
668 | },
669 | "execution_count": 42,
670 | "metadata": {},
671 | "output_type": "execute_result"
672 | }
673 | ],
674 | "source": [
675 | "pd.Series(le1.fit_transform(X_train[' native-country']))"
676 | ]
677 | }
678 | ],
679 | "metadata": {
680 | "kernelspec": {
681 | "display_name": "Python 3",
682 | "language": "python",
683 | "name": "python3"
684 | },
685 | "language_info": {
686 | "codemirror_mode": {
687 | "name": "ipython",
688 | "version": 3
689 | },
690 | "file_extension": ".py",
691 | "mimetype": "text/x-python",
692 | "name": "python",
693 | "nbconvert_exporter": "python",
694 | "pygments_lexer": "ipython3",
695 | "version": "3.7.3"
696 | }
697 | },
698 | "nbformat": 4,
699 | "nbformat_minor": 2
700 | }
701 |
--------------------------------------------------------------------------------
/missing indicator.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "from sklearn.model_selection import train_test_split"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " age | \n",
41 | " workclass | \n",
42 | " fnlwgt | \n",
43 | " education | \n",
44 | " education-num | \n",
45 | " marital-status | \n",
46 | " occupation | \n",
47 | " relationship | \n",
48 | " race | \n",
49 | " sex | \n",
50 | " capital-gain | \n",
51 | " capital-loss | \n",
52 | " hours-per-week | \n",
53 | " native-country | \n",
54 | " income | \n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " \n",
59 | " 0 | \n",
60 | " 39 | \n",
61 | " State-gov | \n",
62 | " 77516 | \n",
63 | " Bachelors | \n",
64 | " 13 | \n",
65 | " Never-married | \n",
66 | " Adm-clerical | \n",
67 | " Not-in-family | \n",
68 | " White | \n",
69 | " Male | \n",
70 | " 2174 | \n",
71 | " 0 | \n",
72 | " 40 | \n",
73 | " United-States | \n",
74 | " <=50K | \n",
75 | "
\n",
76 | " \n",
77 | " 1 | \n",
78 | " 50 | \n",
79 | " Self-emp-not-inc | \n",
80 | " 83311 | \n",
81 | " Bachelors | \n",
82 | " 13 | \n",
83 | " Married-civ-spouse | \n",
84 | " Exec-managerial | \n",
85 | " Husband | \n",
86 | " White | \n",
87 | " Male | \n",
88 | " 0 | \n",
89 | " 0 | \n",
90 | " 13 | \n",
91 | " United-States | \n",
92 | " <=50K | \n",
93 | "
\n",
94 | " \n",
95 | " 2 | \n",
96 | " 38 | \n",
97 | " Private | \n",
98 | " 215646 | \n",
99 | " HS-grad | \n",
100 | " 9 | \n",
101 | " Divorced | \n",
102 | " Handlers-cleaners | \n",
103 | " Not-in-family | \n",
104 | " White | \n",
105 | " Male | \n",
106 | " 0 | \n",
107 | " 0 | \n",
108 | " 40 | \n",
109 | " United-States | \n",
110 | " <=50K | \n",
111 | "
\n",
112 | " \n",
113 | " 3 | \n",
114 | " 53 | \n",
115 | " Private | \n",
116 | " 234721 | \n",
117 | " 11th | \n",
118 | " 7 | \n",
119 | " Married-civ-spouse | \n",
120 | " Handlers-cleaners | \n",
121 | " Husband | \n",
122 | " Black | \n",
123 | " Male | \n",
124 | " 0 | \n",
125 | " 0 | \n",
126 | " 40 | \n",
127 | " United-States | \n",
128 | " <=50K | \n",
129 | "
\n",
130 | " \n",
131 | " 4 | \n",
132 | " 28 | \n",
133 | " Private | \n",
134 | " 338409 | \n",
135 | " Bachelors | \n",
136 | " 13 | \n",
137 | " Married-civ-spouse | \n",
138 | " Prof-specialty | \n",
139 | " Wife | \n",
140 | " Black | \n",
141 | " Female | \n",
142 | " 0 | \n",
143 | " 0 | \n",
144 | " 40 | \n",
145 | " Cuba | \n",
146 | " <=50K | \n",
147 | "
\n",
148 | " \n",
149 | "
\n",
150 | "
"
151 | ],
152 | "text/plain": [
153 | " age workclass fnlwgt education education-num \\\n",
154 | "0 39 State-gov 77516 Bachelors 13 \n",
155 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
156 | "2 38 Private 215646 HS-grad 9 \n",
157 | "3 53 Private 234721 11th 7 \n",
158 | "4 28 Private 338409 Bachelors 13 \n",
159 | "\n",
160 | " marital-status occupation relationship race sex \\\n",
161 | "0 Never-married Adm-clerical Not-in-family White Male \n",
162 | "1 Married-civ-spouse Exec-managerial Husband White Male \n",
163 | "2 Divorced Handlers-cleaners Not-in-family White Male \n",
164 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
165 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
166 | "\n",
167 | " capital-gain capital-loss hours-per-week native-country income \n",
168 | "0 2174 0 40 United-States <=50K \n",
169 | "1 0 0 13 United-States <=50K \n",
170 | "2 0 0 40 United-States <=50K \n",
171 | "3 0 0 40 United-States <=50K \n",
172 | "4 0 0 40 Cuba <=50K "
173 | ]
174 | },
175 | "execution_count": 2,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')\n",
182 | "df.head()"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 3,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "data": {
192 | "text/plain": [
193 | "age 0\n",
194 | " workclass 1836\n",
195 | " fnlwgt 0\n",
196 | " education 0\n",
197 | " education-num 0\n",
198 | " marital-status 0\n",
199 | " occupation 1843\n",
200 | " relationship 0\n",
201 | " race 0\n",
202 | " sex 0\n",
203 | " capital-gain 0\n",
204 | " capital-loss 0\n",
205 | " hours-per-week 0\n",
206 | " native-country 583\n",
207 | " income 0\n",
208 | "dtype: int64"
209 | ]
210 | },
211 | "execution_count": 3,
212 | "metadata": {},
213 | "output_type": "execute_result"
214 | }
215 | ],
216 | "source": [
217 | "df.isna().sum()"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 4,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1), df[' income'],\n",
227 | " test_size=0.2, random_state=5)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 5,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "from sklearn.impute import MissingIndicator"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 6,
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "mi = MissingIndicator()"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 7,
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "data": {
255 | "text/plain": [
256 | "MissingIndicator()"
257 | ]
258 | },
259 | "execution_count": 7,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "mi.fit(X_train)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 10,
271 | "metadata": {},
272 | "outputs": [
273 | {
274 | "data": {
275 | "text/html": [
276 | "\n",
277 | "\n",
290 | "
\n",
291 | " \n",
292 | " \n",
293 | " | \n",
294 | " 0 | \n",
295 | " 1 | \n",
296 | " 2 | \n",
297 | "
\n",
298 | " \n",
299 | " \n",
300 | " \n",
301 | " 0 | \n",
302 | " False | \n",
303 | " False | \n",
304 | " False | \n",
305 | "
\n",
306 | " \n",
307 | " 1 | \n",
308 | " False | \n",
309 | " False | \n",
310 | " False | \n",
311 | "
\n",
312 | " \n",
313 | " 2 | \n",
314 | " False | \n",
315 | " False | \n",
316 | " False | \n",
317 | "
\n",
318 | " \n",
319 | " 3 | \n",
320 | " False | \n",
321 | " False | \n",
322 | " False | \n",
323 | "
\n",
324 | " \n",
325 | " 4 | \n",
326 | " False | \n",
327 | " False | \n",
328 | " False | \n",
329 | "
\n",
330 | " \n",
331 | "
\n",
332 | "
"
333 | ],
334 | "text/plain": [
335 | " 0 1 2\n",
336 | "0 False False False\n",
337 | "1 False False False\n",
338 | "2 False False False\n",
339 | "3 False False False\n",
340 | "4 False False False"
341 | ]
342 | },
343 | "execution_count": 10,
344 | "metadata": {},
345 | "output_type": "execute_result"
346 | }
347 | ],
348 | "source": [
349 | "pd.DataFrame(mi.transform(X_train)).head()"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 16,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "age 0\n",
361 | " workclass 1468\n",
362 | " fnlwgt 0\n",
363 | " education 0\n",
364 | " education-num 0\n",
365 | " marital-status 0\n",
366 | " occupation 1475\n",
367 | " relationship 0\n",
368 | " race 0\n",
369 | " sex 0\n",
370 | " capital-gain 0\n",
371 | " capital-loss 0\n",
372 | " hours-per-week 0\n",
373 | " native-country 474\n",
374 | "dtype: int64"
375 | ]
376 | },
377 | "execution_count": 16,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "X_train.isna().sum()"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 11,
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "mi1 = MissingIndicator(features='all')"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 15,
398 | "metadata": {},
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/html": [
403 | "\n",
404 | "\n",
417 | "
\n",
418 | " \n",
419 | " \n",
420 | " | \n",
421 | " 0 | \n",
422 | " 1 | \n",
423 | " 2 | \n",
424 | " 3 | \n",
425 | " 4 | \n",
426 | " 5 | \n",
427 | " 6 | \n",
428 | " 7 | \n",
429 | " 8 | \n",
430 | " 9 | \n",
431 | " 10 | \n",
432 | " 11 | \n",
433 | " 12 | \n",
434 | " 13 | \n",
435 | "
\n",
436 | " \n",
437 | " \n",
438 | " \n",
439 | " 0 | \n",
440 | " False | \n",
441 | " False | \n",
442 | " False | \n",
443 | " False | \n",
444 | " False | \n",
445 | " False | \n",
446 | " False | \n",
447 | " False | \n",
448 | " False | \n",
449 | " False | \n",
450 | " False | \n",
451 | " False | \n",
452 | " False | \n",
453 | " False | \n",
454 | "
\n",
455 | " \n",
456 | " 1 | \n",
457 | " False | \n",
458 | " False | \n",
459 | " False | \n",
460 | " False | \n",
461 | " False | \n",
462 | " False | \n",
463 | " False | \n",
464 | " False | \n",
465 | " False | \n",
466 | " False | \n",
467 | " False | \n",
468 | " False | \n",
469 | " False | \n",
470 | " False | \n",
471 | "
\n",
472 | " \n",
473 | " 2 | \n",
474 | " False | \n",
475 | " False | \n",
476 | " False | \n",
477 | " False | \n",
478 | " False | \n",
479 | " False | \n",
480 | " False | \n",
481 | " False | \n",
482 | " False | \n",
483 | " False | \n",
484 | " False | \n",
485 | " False | \n",
486 | " False | \n",
487 | " False | \n",
488 | "
\n",
489 | " \n",
490 | " 3 | \n",
491 | " False | \n",
492 | " False | \n",
493 | " False | \n",
494 | " False | \n",
495 | " False | \n",
496 | " False | \n",
497 | " False | \n",
498 | " False | \n",
499 | " False | \n",
500 | " False | \n",
501 | " False | \n",
502 | " False | \n",
503 | " False | \n",
504 | " False | \n",
505 | "
\n",
506 | " \n",
507 | " 4 | \n",
508 | " False | \n",
509 | " False | \n",
510 | " False | \n",
511 | " False | \n",
512 | " False | \n",
513 | " False | \n",
514 | " False | \n",
515 | " False | \n",
516 | " False | \n",
517 | " False | \n",
518 | " False | \n",
519 | " False | \n",
520 | " False | \n",
521 | " False | \n",
522 | "
\n",
523 | " \n",
524 | "
\n",
525 | "
"
526 | ],
527 | "text/plain": [
528 | " 0 1 2 3 4 5 6 7 8 9 \\\n",
529 | "0 False False False False False False False False False False \n",
530 | "1 False False False False False False False False False False \n",
531 | "2 False False False False False False False False False False \n",
532 | "3 False False False False False False False False False False \n",
533 | "4 False False False False False False False False False False \n",
534 | "\n",
535 | " 10 11 12 13 \n",
536 | "0 False False False False \n",
537 | "1 False False False False \n",
538 | "2 False False False False \n",
539 | "3 False False False False \n",
540 | "4 False False False False "
541 | ]
542 | },
543 | "execution_count": 15,
544 | "metadata": {},
545 | "output_type": "execute_result"
546 | }
547 | ],
548 | "source": [
549 | "pd.DataFrame(mi1.fit_transform(X_train)).head()"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 18,
555 | "metadata": {},
556 | "outputs": [
557 | {
558 | "data": {
559 | "text/plain": [
560 | "array([ 1, 6, 13], dtype=int64)"
561 | ]
562 | },
563 | "execution_count": 18,
564 | "metadata": {},
565 | "output_type": "execute_result"
566 | }
567 | ],
568 | "source": [
569 | "mi.features_"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 19,
575 | "metadata": {},
576 | "outputs": [
577 | {
578 | "data": {
579 | "text/plain": [
580 | "Index([' workclass', ' occupation', ' native-country'], dtype='object')"
581 | ]
582 | },
583 | "execution_count": 19,
584 | "metadata": {},
585 | "output_type": "execute_result"
586 | }
587 | ],
588 | "source": [
589 | "X_train.columns[mi.features_]"
590 | ]
591 | }
592 | ],
593 | "metadata": {
594 | "kernelspec": {
595 | "display_name": "Python 3",
596 | "language": "python",
597 | "name": "python3"
598 | },
599 | "language_info": {
600 | "codemirror_mode": {
601 | "name": "ipython",
602 | "version": 3
603 | },
604 | "file_extension": ".py",
605 | "mimetype": "text/x-python",
606 | "name": "python",
607 | "nbconvert_exporter": "python",
608 | "pygments_lexer": "ipython3",
609 | "version": "3.7.3"
610 | }
611 | },
612 | "nbformat": 4,
613 | "nbformat_minor": 2
614 | }
615 |
--------------------------------------------------------------------------------
/ordinal encoder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "from sklearn.model_selection import train_test_split"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/html": [
21 | "\n",
22 | "\n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " | \n",
39 | " age | \n",
40 | " workclass | \n",
41 | " fnlwgt | \n",
42 | " education | \n",
43 | " education-num | \n",
44 | " marital-status | \n",
45 | " occupation | \n",
46 | " relationship | \n",
47 | " race | \n",
48 | " sex | \n",
49 | " capital-gain | \n",
50 | " capital-loss | \n",
51 | " hours-per-week | \n",
52 | " native-country | \n",
53 | " income | \n",
54 | "
\n",
55 | " \n",
56 | " \n",
57 | " \n",
58 | " 0 | \n",
59 | " 39 | \n",
60 | " State-gov | \n",
61 | " 77516 | \n",
62 | " Bachelors | \n",
63 | " 13 | \n",
64 | " Never-married | \n",
65 | " Adm-clerical | \n",
66 | " Not-in-family | \n",
67 | " White | \n",
68 | " Male | \n",
69 | " 2174 | \n",
70 | " 0 | \n",
71 | " 40 | \n",
72 | " United-States | \n",
73 | " <=50K | \n",
74 | "
\n",
75 | " \n",
76 | " 1 | \n",
77 | " 50 | \n",
78 | " Self-emp-not-inc | \n",
79 | " 83311 | \n",
80 | " Bachelors | \n",
81 | " 13 | \n",
82 | " Married-civ-spouse | \n",
83 | " Exec-managerial | \n",
84 | " Husband | \n",
85 | " White | \n",
86 | " Male | \n",
87 | " 0 | \n",
88 | " 0 | \n",
89 | " 13 | \n",
90 | " United-States | \n",
91 | " <=50K | \n",
92 | "
\n",
93 | " \n",
94 | " 2 | \n",
95 | " 38 | \n",
96 | " Private | \n",
97 | " 215646 | \n",
98 | " HS-grad | \n",
99 | " 9 | \n",
100 | " Divorced | \n",
101 | " Handlers-cleaners | \n",
102 | " Not-in-family | \n",
103 | " White | \n",
104 | " Male | \n",
105 | " 0 | \n",
106 | " 0 | \n",
107 | " 40 | \n",
108 | " United-States | \n",
109 | " <=50K | \n",
110 | "
\n",
111 | " \n",
112 | " 3 | \n",
113 | " 53 | \n",
114 | " Private | \n",
115 | " 234721 | \n",
116 | " 11th | \n",
117 | " 7 | \n",
118 | " Married-civ-spouse | \n",
119 | " Handlers-cleaners | \n",
120 | " Husband | \n",
121 | " Black | \n",
122 | " Male | \n",
123 | " 0 | \n",
124 | " 0 | \n",
125 | " 40 | \n",
126 | " United-States | \n",
127 | " <=50K | \n",
128 | "
\n",
129 | " \n",
130 | " 4 | \n",
131 | " 28 | \n",
132 | " Private | \n",
133 | " 338409 | \n",
134 | " Bachelors | \n",
135 | " 13 | \n",
136 | " Married-civ-spouse | \n",
137 | " Prof-specialty | \n",
138 | " Wife | \n",
139 | " Black | \n",
140 | " Female | \n",
141 | " 0 | \n",
142 | " 0 | \n",
143 | " 40 | \n",
144 | " Cuba | \n",
145 | " <=50K | \n",
146 | "
\n",
147 | " \n",
148 | "
\n",
149 | "
"
150 | ],
151 | "text/plain": [
152 | " age workclass fnlwgt education education-num \\\n",
153 | "0 39 State-gov 77516 Bachelors 13 \n",
154 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
155 | "2 38 Private 215646 HS-grad 9 \n",
156 | "3 53 Private 234721 11th 7 \n",
157 | "4 28 Private 338409 Bachelors 13 \n",
158 | "\n",
159 | " marital-status occupation relationship race sex \\\n",
160 | "0 Never-married Adm-clerical Not-in-family White Male \n",
161 | "1 Married-civ-spouse Exec-managerial Husband White Male \n",
162 | "2 Divorced Handlers-cleaners Not-in-family White Male \n",
163 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
164 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
165 | "\n",
166 | " capital-gain capital-loss hours-per-week native-country income \n",
167 | "0 2174 0 40 United-States <=50K \n",
168 | "1 0 0 13 United-States <=50K \n",
169 | "2 0 0 40 United-States <=50K \n",
170 | "3 0 0 40 United-States <=50K \n",
171 | "4 0 0 40 Cuba <=50K "
172 | ]
173 | },
174 | "execution_count": 2,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "df = pd.read_csv('data/income_evaluation.csv')\n",
181 | "df.head()"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 4,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "X_train, X_test, y_train, y_test = train_test_split(df.drop([' income'], axis=1), df[' income'],\n",
191 | " test_size=0.2, random_state=0)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 6,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/plain": [
202 | " HS-grad 8450\n",
203 | " Some-college 5832\n",
204 | " Bachelors 4242\n",
205 | " Masters 1414\n",
206 | " Assoc-voc 1110\n",
207 | " 11th 920\n",
208 | " Assoc-acdm 817\n",
209 | " 10th 752\n",
210 | " 7th-8th 526\n",
211 | " Prof-school 459\n",
212 | " 9th 419\n",
213 | " 12th 360\n",
214 | " Doctorate 306\n",
215 | " 5th-6th 259\n",
216 | " 1st-4th 139\n",
217 | " Preschool 43\n",
218 | "Name: education, dtype: int64"
219 | ]
220 | },
221 | "execution_count": 6,
222 | "metadata": {},
223 | "output_type": "execute_result"
224 | }
225 | ],
226 | "source": [
227 | "X_train[' education'].value_counts()"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 7,
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "array([' 11th', ' HS-grad', ' Bachelors', ' Assoc-voc', ' Some-college',\n",
239 | " ' 9th', ' 10th', ' 12th', ' Doctorate', ' Prof-school', ' Masters',\n",
240 | " ' Assoc-acdm', ' 7th-8th', ' 5th-6th', ' Preschool', ' 1st-4th'],\n",
241 | " dtype=object)"
242 | ]
243 | },
244 | "execution_count": 7,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "X_train[' education'].unique()"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 18,
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "data": {
260 | "text/plain": [
261 | "array([' Male', ' Female'], dtype=object)"
262 | ]
263 | },
264 | "execution_count": 18,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "X_train[' sex'].unique()"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 26,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "gender = [' Male', ' Female']"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 9,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "edu = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th',\n",
289 | " ' 11th', ' 12th', ' HS-grad', ' Prof-school', ' Some-college',\n",
290 | " ' Assoc-acdm', ' Assoc-voc',' Bachelors', ' Masters', ' Doctorate' ]"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 10,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "from sklearn.preprocessing import OrdinalEncoder"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 27,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "ordi = OrdinalEncoder(categories=[edu, gender])"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 28,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "data": {
318 | "text/plain": [
319 | "OrdinalEncoder(categories=[[' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th',\n",
320 | " ' 9th', ' 10th', ' 11th', ' 12th', ' HS-grad',\n",
321 | " ' Prof-school', ' Some-college', ' Assoc-acdm',\n",
322 | " ' Assoc-voc', ' Bachelors', ' Masters',\n",
323 | " ' Doctorate'],\n",
324 | " [' Male', ' Female']])"
325 | ]
326 | },
327 | "execution_count": 28,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "ordi.fit(X_train[[' education', ' sex']])"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 29,
339 | "metadata": {
340 | "scrolled": true
341 | },
342 | "outputs": [
343 | {
344 | "data": {
345 | "text/html": [
346 | "\n",
347 | "\n",
360 | "
\n",
361 | " \n",
362 | " \n",
363 | " | \n",
364 | " education | \n",
365 | " sex | \n",
366 | "
\n",
367 | " \n",
368 | " \n",
369 | " \n",
370 | " 15282 | \n",
371 | " 11th | \n",
372 | " Male | \n",
373 | "
\n",
374 | " \n",
375 | " 24870 | \n",
376 | " HS-grad | \n",
377 | " Female | \n",
378 | "
\n",
379 | " \n",
380 | " 18822 | \n",
381 | " Bachelors | \n",
382 | " Female | \n",
383 | "
\n",
384 | " \n",
385 | " 26404 | \n",
386 | " HS-grad | \n",
387 | " Female | \n",
388 | "
\n",
389 | " \n",
390 | " 7842 | \n",
391 | " Assoc-voc | \n",
392 | " Male | \n",
393 | "
\n",
394 | " \n",
395 | " 4890 | \n",
396 | " Some-college | \n",
397 | " Male | \n",
398 | "
\n",
399 | " \n",
400 | " 3243 | \n",
401 | " 9th | \n",
402 | " Male | \n",
403 | "
\n",
404 | " \n",
405 | " 17470 | \n",
406 | " Some-college | \n",
407 | " Female | \n",
408 | "
\n",
409 | " \n",
410 | " 14211 | \n",
411 | " Assoc-voc | \n",
412 | " Female | \n",
413 | "
\n",
414 | " \n",
415 | " 22453 | \n",
416 | " Some-college | \n",
417 | " Female | \n",
418 | "
\n",
419 | " \n",
420 | " 631 | \n",
421 | " 10th | \n",
422 | " Female | \n",
423 | "
\n",
424 | " \n",
425 | " 29051 | \n",
426 | " Some-college | \n",
427 | " Male | \n",
428 | "
\n",
429 | " \n",
430 | " 21478 | \n",
431 | " 12th | \n",
432 | " Male | \n",
433 | "
\n",
434 | " \n",
435 | " 26565 | \n",
436 | " HS-grad | \n",
437 | " Male | \n",
438 | "
\n",
439 | " \n",
440 | " 25140 | \n",
441 | " HS-grad | \n",
442 | " Male | \n",
443 | "
\n",
444 | " \n",
445 | " 15497 | \n",
446 | " Doctorate | \n",
447 | " Male | \n",
448 | "
\n",
449 | " \n",
450 | " 14689 | \n",
451 | " Assoc-voc | \n",
452 | " Female | \n",
453 | "
\n",
454 | " \n",
455 | " 18726 | \n",
456 | " Some-college | \n",
457 | " Female | \n",
458 | "
\n",
459 | " \n",
460 | " 28105 | \n",
461 | " Some-college | \n",
462 | " Male | \n",
463 | "
\n",
464 | " \n",
465 | " 6965 | \n",
466 | " Prof-school | \n",
467 | " Male | \n",
468 | "
\n",
469 | " \n",
470 | " 4343 | \n",
471 | " Masters | \n",
472 | " Male | \n",
473 | "
\n",
474 | " \n",
475 | " 24308 | \n",
476 | " Some-college | \n",
477 | " Female | \n",
478 | "
\n",
479 | " \n",
480 | " 11380 | \n",
481 | " Some-college | \n",
482 | " Female | \n",
483 | "
\n",
484 | " \n",
485 | " 26087 | \n",
486 | " HS-grad | \n",
487 | " Male | \n",
488 | "
\n",
489 | " \n",
490 | " 5679 | \n",
491 | " Bachelors | \n",
492 | " Male | \n",
493 | "
\n",
494 | " \n",
495 | " 13019 | \n",
496 | " HS-grad | \n",
497 | " Female | \n",
498 | "
\n",
499 | " \n",
500 | " 24049 | \n",
501 | " Masters | \n",
502 | " Male | \n",
503 | "
\n",
504 | " \n",
505 | " 32119 | \n",
506 | " Some-college | \n",
507 | " Male | \n",
508 | "
\n",
509 | " \n",
510 | " 25586 | \n",
511 | " Some-college | \n",
512 | " Male | \n",
513 | "
\n",
514 | " \n",
515 | " 26959 | \n",
516 | " Masters | \n",
517 | " Male | \n",
518 | "
\n",
519 | " \n",
520 | " ... | \n",
521 | " ... | \n",
522 | " ... | \n",
523 | "
\n",
524 | " \n",
525 | " 6216 | \n",
526 | " Bachelors | \n",
527 | " Male | \n",
528 | "
\n",
529 | " \n",
530 | " 27469 | \n",
531 | " Some-college | \n",
532 | " Female | \n",
533 | "
\n",
534 | " \n",
535 | " 16921 | \n",
536 | " 9th | \n",
537 | " Male | \n",
538 | "
\n",
539 | " \n",
540 | " 26277 | \n",
541 | " HS-grad | \n",
542 | " Male | \n",
543 | "
\n",
544 | " \n",
545 | " 2897 | \n",
546 | " 10th | \n",
547 | " Female | \n",
548 | "
\n",
549 | " \n",
550 | " 24152 | \n",
551 | " 10th | \n",
552 | " Male | \n",
553 | "
\n",
554 | " \n",
555 | " 18606 | \n",
556 | " Some-college | \n",
557 | " Male | \n",
558 | "
\n",
559 | " \n",
560 | " 10327 | \n",
561 | " Prof-school | \n",
562 | " Male | \n",
563 | "
\n",
564 | " \n",
565 | " 18983 | \n",
566 | " Bachelors | \n",
567 | " Male | \n",
568 | "
\n",
569 | " \n",
570 | " 32230 | \n",
571 | " Prof-school | \n",
572 | " Male | \n",
573 | "
\n",
574 | " \n",
575 | " 17089 | \n",
576 | " HS-grad | \n",
577 | " Female | \n",
578 | "
\n",
579 | " \n",
580 | " 14650 | \n",
581 | " Masters | \n",
582 | " Male | \n",
583 | "
\n",
584 | " \n",
585 | " 19852 | \n",
586 | " HS-grad | \n",
587 | " Male | \n",
588 | "
\n",
589 | " \n",
590 | " 6744 | \n",
591 | " HS-grad | \n",
592 | " Male | \n",
593 | "
\n",
594 | " \n",
595 | " 15832 | \n",
596 | " Doctorate | \n",
597 | " Male | \n",
598 | "
\n",
599 | " \n",
600 | " 15430 | \n",
601 | " Some-college | \n",
602 | " Female | \n",
603 | "
\n",
604 | " \n",
605 | " 14935 | \n",
606 | " 5th-6th | \n",
607 | " Male | \n",
608 | "
\n",
609 | " \n",
610 | " 14116 | \n",
611 | " HS-grad | \n",
612 | " Female | \n",
613 | "
\n",
614 | " \n",
615 | " 22258 | \n",
616 | " 10th | \n",
617 | " Female | \n",
618 | "
\n",
619 | " \n",
620 | " 20757 | \n",
621 | " 10th | \n",
622 | " Male | \n",
623 | "
\n",
624 | " \n",
625 | " 24275 | \n",
626 | " HS-grad | \n",
627 | " Male | \n",
628 | "
\n",
629 | " \n",
630 | " 9225 | \n",
631 | " Assoc-acdm | \n",
632 | " Male | \n",
633 | "
\n",
634 | " \n",
635 | " 32103 | \n",
636 | " 7th-8th | \n",
637 | " Female | \n",
638 | "
\n",
639 | " \n",
640 | " 30403 | \n",
641 | " HS-grad | \n",
642 | " Female | \n",
643 | "
\n",
644 | " \n",
645 | " 21243 | \n",
646 | " HS-grad | \n",
647 | " Male | \n",
648 | "
\n",
649 | " \n",
650 | " 13123 | \n",
651 | " Masters | \n",
652 | " Male | \n",
653 | "
\n",
654 | " \n",
655 | " 19648 | \n",
656 | " 10th | \n",
657 | " Male | \n",
658 | "
\n",
659 | " \n",
660 | " 9845 | \n",
661 | " Some-college | \n",
662 | " Female | \n",
663 | "
\n",
664 | " \n",
665 | " 10799 | \n",
666 | " Doctorate | \n",
667 | " Male | \n",
668 | "
\n",
669 | " \n",
670 | " 2732 | \n",
671 | " Some-college | \n",
672 | " Male | \n",
673 | "
\n",
674 | " \n",
675 | "
\n",
676 | "
26048 rows × 2 columns
\n",
677 | "
"
678 | ],
679 | "text/plain": [
680 | " education sex\n",
681 | "15282 11th Male\n",
682 | "24870 HS-grad Female\n",
683 | "18822 Bachelors Female\n",
684 | "26404 HS-grad Female\n",
685 | "7842 Assoc-voc Male\n",
686 | "4890 Some-college Male\n",
687 | "3243 9th Male\n",
688 | "17470 Some-college Female\n",
689 | "14211 Assoc-voc Female\n",
690 | "22453 Some-college Female\n",
691 | "631 10th Female\n",
692 | "29051 Some-college Male\n",
693 | "21478 12th Male\n",
694 | "26565 HS-grad Male\n",
695 | "25140 HS-grad Male\n",
696 | "15497 Doctorate Male\n",
697 | "14689 Assoc-voc Female\n",
698 | "18726 Some-college Female\n",
699 | "28105 Some-college Male\n",
700 | "6965 Prof-school Male\n",
701 | "4343 Masters Male\n",
702 | "24308 Some-college Female\n",
703 | "11380 Some-college Female\n",
704 | "26087 HS-grad Male\n",
705 | "5679 Bachelors Male\n",
706 | "13019 HS-grad Female\n",
707 | "24049 Masters Male\n",
708 | "32119 Some-college Male\n",
709 | "25586 Some-college Male\n",
710 | "26959 Masters Male\n",
711 | "... ... ...\n",
712 | "6216 Bachelors Male\n",
713 | "27469 Some-college Female\n",
714 | "16921 9th Male\n",
715 | "26277 HS-grad Male\n",
716 | "2897 10th Female\n",
717 | "24152 10th Male\n",
718 | "18606 Some-college Male\n",
719 | "10327 Prof-school Male\n",
720 | "18983 Bachelors Male\n",
721 | "32230 Prof-school Male\n",
722 | "17089 HS-grad Female\n",
723 | "14650 Masters Male\n",
724 | "19852 HS-grad Male\n",
725 | "6744 HS-grad Male\n",
726 | "15832 Doctorate Male\n",
727 | "15430 Some-college Female\n",
728 | "14935 5th-6th Male\n",
729 | "14116 HS-grad Female\n",
730 | "22258 10th Female\n",
731 | "20757 10th Male\n",
732 | "24275 HS-grad Male\n",
733 | "9225 Assoc-acdm Male\n",
734 | "32103 7th-8th Female\n",
735 | "30403 HS-grad Female\n",
736 | "21243 HS-grad Male\n",
737 | "13123 Masters Male\n",
738 | "19648 10th Male\n",
739 | "9845 Some-college Female\n",
740 | "10799 Doctorate Male\n",
741 | "2732 Some-college Male\n",
742 | "\n",
743 | "[26048 rows x 2 columns]"
744 | ]
745 | },
746 | "execution_count": 29,
747 | "metadata": {},
748 | "output_type": "execute_result"
749 | }
750 | ],
751 | "source": [
752 | "X_train[[' education', ' sex']]"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 30,
758 | "metadata": {
759 | "scrolled": true
760 | },
761 | "outputs": [
762 | {
763 | "data": {
764 | "text/html": [
765 | "\n",
766 | "\n",
779 | "
\n",
780 | " \n",
781 | " \n",
782 | " | \n",
783 | " 0 | \n",
784 | " 1 | \n",
785 | "
\n",
786 | " \n",
787 | " \n",
788 | " \n",
789 | " 0 | \n",
790 | " 6.0 | \n",
791 | " 0.0 | \n",
792 | "
\n",
793 | " \n",
794 | " 1 | \n",
795 | " 8.0 | \n",
796 | " 1.0 | \n",
797 | "
\n",
798 | " \n",
799 | " 2 | \n",
800 | " 13.0 | \n",
801 | " 1.0 | \n",
802 | "
\n",
803 | " \n",
804 | " 3 | \n",
805 | " 8.0 | \n",
806 | " 1.0 | \n",
807 | "
\n",
808 | " \n",
809 | " 4 | \n",
810 | " 12.0 | \n",
811 | " 0.0 | \n",
812 | "
\n",
813 | " \n",
814 | " 5 | \n",
815 | " 10.0 | \n",
816 | " 0.0 | \n",
817 | "
\n",
818 | " \n",
819 | " 6 | \n",
820 | " 4.0 | \n",
821 | " 0.0 | \n",
822 | "
\n",
823 | " \n",
824 | " 7 | \n",
825 | " 10.0 | \n",
826 | " 1.0 | \n",
827 | "
\n",
828 | " \n",
829 | " 8 | \n",
830 | " 12.0 | \n",
831 | " 1.0 | \n",
832 | "
\n",
833 | " \n",
834 | " 9 | \n",
835 | " 10.0 | \n",
836 | " 1.0 | \n",
837 | "
\n",
838 | " \n",
839 | " 10 | \n",
840 | " 5.0 | \n",
841 | " 1.0 | \n",
842 | "
\n",
843 | " \n",
844 | " 11 | \n",
845 | " 10.0 | \n",
846 | " 0.0 | \n",
847 | "
\n",
848 | " \n",
849 | " 12 | \n",
850 | " 7.0 | \n",
851 | " 0.0 | \n",
852 | "
\n",
853 | " \n",
854 | " 13 | \n",
855 | " 8.0 | \n",
856 | " 0.0 | \n",
857 | "
\n",
858 | " \n",
859 | " 14 | \n",
860 | " 8.0 | \n",
861 | " 0.0 | \n",
862 | "
\n",
863 | " \n",
864 | " 15 | \n",
865 | " 15.0 | \n",
866 | " 0.0 | \n",
867 | "
\n",
868 | " \n",
869 | " 16 | \n",
870 | " 12.0 | \n",
871 | " 1.0 | \n",
872 | "
\n",
873 | " \n",
874 | " 17 | \n",
875 | " 10.0 | \n",
876 | " 1.0 | \n",
877 | "
\n",
878 | " \n",
879 | " 18 | \n",
880 | " 10.0 | \n",
881 | " 0.0 | \n",
882 | "
\n",
883 | " \n",
884 | " 19 | \n",
885 | " 9.0 | \n",
886 | " 0.0 | \n",
887 | "
\n",
888 | " \n",
889 | " 20 | \n",
890 | " 14.0 | \n",
891 | " 0.0 | \n",
892 | "
\n",
893 | " \n",
894 | " 21 | \n",
895 | " 10.0 | \n",
896 | " 1.0 | \n",
897 | "
\n",
898 | " \n",
899 | " 22 | \n",
900 | " 10.0 | \n",
901 | " 1.0 | \n",
902 | "
\n",
903 | " \n",
904 | " 23 | \n",
905 | " 8.0 | \n",
906 | " 0.0 | \n",
907 | "
\n",
908 | " \n",
909 | " 24 | \n",
910 | " 13.0 | \n",
911 | " 0.0 | \n",
912 | "
\n",
913 | " \n",
914 | " 25 | \n",
915 | " 8.0 | \n",
916 | " 1.0 | \n",
917 | "
\n",
918 | " \n",
919 | " 26 | \n",
920 | " 14.0 | \n",
921 | " 0.0 | \n",
922 | "
\n",
923 | " \n",
924 | " 27 | \n",
925 | " 10.0 | \n",
926 | " 0.0 | \n",
927 | "
\n",
928 | " \n",
929 | " 28 | \n",
930 | " 10.0 | \n",
931 | " 0.0 | \n",
932 | "
\n",
933 | " \n",
934 | " 29 | \n",
935 | " 14.0 | \n",
936 | " 0.0 | \n",
937 | "
\n",
938 | " \n",
939 | " ... | \n",
940 | " ... | \n",
941 | " ... | \n",
942 | "
\n",
943 | " \n",
944 | " 26018 | \n",
945 | " 13.0 | \n",
946 | " 0.0 | \n",
947 | "
\n",
948 | " \n",
949 | " 26019 | \n",
950 | " 10.0 | \n",
951 | " 1.0 | \n",
952 | "
\n",
953 | " \n",
954 | " 26020 | \n",
955 | " 4.0 | \n",
956 | " 0.0 | \n",
957 | "
\n",
958 | " \n",
959 | " 26021 | \n",
960 | " 8.0 | \n",
961 | " 0.0 | \n",
962 | "
\n",
963 | " \n",
964 | " 26022 | \n",
965 | " 5.0 | \n",
966 | " 1.0 | \n",
967 | "
\n",
968 | " \n",
969 | " 26023 | \n",
970 | " 5.0 | \n",
971 | " 0.0 | \n",
972 | "
\n",
973 | " \n",
974 | " 26024 | \n",
975 | " 10.0 | \n",
976 | " 0.0 | \n",
977 | "
\n",
978 | " \n",
979 | " 26025 | \n",
980 | " 9.0 | \n",
981 | " 0.0 | \n",
982 | "
\n",
983 | " \n",
984 | " 26026 | \n",
985 | " 13.0 | \n",
986 | " 0.0 | \n",
987 | "
\n",
988 | " \n",
989 | " 26027 | \n",
990 | " 9.0 | \n",
991 | " 0.0 | \n",
992 | "
\n",
993 | " \n",
994 | " 26028 | \n",
995 | " 8.0 | \n",
996 | " 1.0 | \n",
997 | "
\n",
998 | " \n",
999 | " 26029 | \n",
1000 | " 14.0 | \n",
1001 | " 0.0 | \n",
1002 | "
\n",
1003 | " \n",
1004 | " 26030 | \n",
1005 | " 8.0 | \n",
1006 | " 0.0 | \n",
1007 | "
\n",
1008 | " \n",
1009 | " 26031 | \n",
1010 | " 8.0 | \n",
1011 | " 0.0 | \n",
1012 | "
\n",
1013 | " \n",
1014 | " 26032 | \n",
1015 | " 15.0 | \n",
1016 | " 0.0 | \n",
1017 | "
\n",
1018 | " \n",
1019 | " 26033 | \n",
1020 | " 10.0 | \n",
1021 | " 1.0 | \n",
1022 | "
\n",
1023 | " \n",
1024 | " 26034 | \n",
1025 | " 2.0 | \n",
1026 | " 0.0 | \n",
1027 | "
\n",
1028 | " \n",
1029 | " 26035 | \n",
1030 | " 8.0 | \n",
1031 | " 1.0 | \n",
1032 | "
\n",
1033 | " \n",
1034 | " 26036 | \n",
1035 | " 5.0 | \n",
1036 | " 1.0 | \n",
1037 | "
\n",
1038 | " \n",
1039 | " 26037 | \n",
1040 | " 5.0 | \n",
1041 | " 0.0 | \n",
1042 | "
\n",
1043 | " \n",
1044 | " 26038 | \n",
1045 | " 8.0 | \n",
1046 | " 0.0 | \n",
1047 | "
\n",
1048 | " \n",
1049 | " 26039 | \n",
1050 | " 11.0 | \n",
1051 | " 0.0 | \n",
1052 | "
\n",
1053 | " \n",
1054 | " 26040 | \n",
1055 | " 3.0 | \n",
1056 | " 1.0 | \n",
1057 | "
\n",
1058 | " \n",
1059 | " 26041 | \n",
1060 | " 8.0 | \n",
1061 | " 1.0 | \n",
1062 | "
\n",
1063 | " \n",
1064 | " 26042 | \n",
1065 | " 8.0 | \n",
1066 | " 0.0 | \n",
1067 | "
\n",
1068 | " \n",
1069 | " 26043 | \n",
1070 | " 14.0 | \n",
1071 | " 0.0 | \n",
1072 | "
\n",
1073 | " \n",
1074 | " 26044 | \n",
1075 | " 5.0 | \n",
1076 | " 0.0 | \n",
1077 | "
\n",
1078 | " \n",
1079 | " 26045 | \n",
1080 | " 10.0 | \n",
1081 | " 1.0 | \n",
1082 | "
\n",
1083 | " \n",
1084 | " 26046 | \n",
1085 | " 15.0 | \n",
1086 | " 0.0 | \n",
1087 | "
\n",
1088 | " \n",
1089 | " 26047 | \n",
1090 | " 10.0 | \n",
1091 | " 0.0 | \n",
1092 | "
\n",
1093 | " \n",
1094 | "
\n",
1095 | "
26048 rows × 2 columns
\n",
1096 | "
"
1097 | ],
1098 | "text/plain": [
1099 | " 0 1\n",
1100 | "0 6.0 0.0\n",
1101 | "1 8.0 1.0\n",
1102 | "2 13.0 1.0\n",
1103 | "3 8.0 1.0\n",
1104 | "4 12.0 0.0\n",
1105 | "5 10.0 0.0\n",
1106 | "6 4.0 0.0\n",
1107 | "7 10.0 1.0\n",
1108 | "8 12.0 1.0\n",
1109 | "9 10.0 1.0\n",
1110 | "10 5.0 1.0\n",
1111 | "11 10.0 0.0\n",
1112 | "12 7.0 0.0\n",
1113 | "13 8.0 0.0\n",
1114 | "14 8.0 0.0\n",
1115 | "15 15.0 0.0\n",
1116 | "16 12.0 1.0\n",
1117 | "17 10.0 1.0\n",
1118 | "18 10.0 0.0\n",
1119 | "19 9.0 0.0\n",
1120 | "20 14.0 0.0\n",
1121 | "21 10.0 1.0\n",
1122 | "22 10.0 1.0\n",
1123 | "23 8.0 0.0\n",
1124 | "24 13.0 0.0\n",
1125 | "25 8.0 1.0\n",
1126 | "26 14.0 0.0\n",
1127 | "27 10.0 0.0\n",
1128 | "28 10.0 0.0\n",
1129 | "29 14.0 0.0\n",
1130 | "... ... ...\n",
1131 | "26018 13.0 0.0\n",
1132 | "26019 10.0 1.0\n",
1133 | "26020 4.0 0.0\n",
1134 | "26021 8.0 0.0\n",
1135 | "26022 5.0 1.0\n",
1136 | "26023 5.0 0.0\n",
1137 | "26024 10.0 0.0\n",
1138 | "26025 9.0 0.0\n",
1139 | "26026 13.0 0.0\n",
1140 | "26027 9.0 0.0\n",
1141 | "26028 8.0 1.0\n",
1142 | "26029 14.0 0.0\n",
1143 | "26030 8.0 0.0\n",
1144 | "26031 8.0 0.0\n",
1145 | "26032 15.0 0.0\n",
1146 | "26033 10.0 1.0\n",
1147 | "26034 2.0 0.0\n",
1148 | "26035 8.0 1.0\n",
1149 | "26036 5.0 1.0\n",
1150 | "26037 5.0 0.0\n",
1151 | "26038 8.0 0.0\n",
1152 | "26039 11.0 0.0\n",
1153 | "26040 3.0 1.0\n",
1154 | "26041 8.0 1.0\n",
1155 | "26042 8.0 0.0\n",
1156 | "26043 14.0 0.0\n",
1157 | "26044 5.0 0.0\n",
1158 | "26045 10.0 1.0\n",
1159 | "26046 15.0 0.0\n",
1160 | "26047 10.0 0.0\n",
1161 | "\n",
1162 | "[26048 rows x 2 columns]"
1163 | ]
1164 | },
1165 | "execution_count": 30,
1166 | "metadata": {},
1167 | "output_type": "execute_result"
1168 | }
1169 | ],
1170 | "source": [
1171 | "pd.DataFrame(ordi.transform(X_train[[' education', ' sex']]))"
1172 | ]
1173 | },
1174 | {
1175 | "cell_type": "code",
1176 | "execution_count": 31,
1177 | "metadata": {},
1178 | "outputs": [
1179 | {
1180 | "data": {
1181 | "text/plain": [
1182 | "array([[10., 1.],\n",
1183 | " [13., 1.],\n",
1184 | " [11., 0.],\n",
1185 | " ...,\n",
1186 | " [13., 0.],\n",
1187 | " [ 8., 0.],\n",
1188 | " [ 8., 0.]])"
1189 | ]
1190 | },
1191 | "execution_count": 31,
1192 | "metadata": {},
1193 | "output_type": "execute_result"
1194 | }
1195 | ],
1196 | "source": [
1197 | "ordi.transform(X_test[[' education', ' sex']])"
1198 | ]
1199 | }
1200 | ],
1201 | "metadata": {
1202 | "kernelspec": {
1203 | "display_name": "Python 3",
1204 | "language": "python",
1205 | "name": "python3"
1206 | },
1207 | "language_info": {
1208 | "codemirror_mode": {
1209 | "name": "ipython",
1210 | "version": 3
1211 | },
1212 | "file_extension": ".py",
1213 | "mimetype": "text/x-python",
1214 | "name": "python",
1215 | "nbconvert_exporter": "python",
1216 | "pygments_lexer": "ipython3",
1217 | "version": "3.7.3"
1218 | }
1219 | },
1220 | "nbformat": 4,
1221 | "nbformat_minor": 2
1222 | }
1223 |
--------------------------------------------------------------------------------
/pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 27,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "\n",
12 | "from sklearn.model_selection import train_test_split\n",
13 | "from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, MinMaxScaler\n",
14 | "from sklearn.pipeline import Pipeline\n",
15 | "from sklearn.compose import ColumnTransformer\n",
16 | "from sklearn.tree import DecisionTreeClassifier"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/html": [
27 | "\n",
28 | "\n",
41 | "
\n",
42 | " \n",
43 | " \n",
44 | " | \n",
45 | " age | \n",
46 | " workclass | \n",
47 | " fnlwgt | \n",
48 | " education | \n",
49 | " education-num | \n",
50 | " marital-status | \n",
51 | " occupation | \n",
52 | " relationship | \n",
53 | " race | \n",
54 | " sex | \n",
55 | " capital-gain | \n",
56 | " capital-loss | \n",
57 | " hours-per-week | \n",
58 | " native-country | \n",
59 | " income | \n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " \n",
64 | " 0 | \n",
65 | " 39 | \n",
66 | " State-gov | \n",
67 | " 77516 | \n",
68 | " Bachelors | \n",
69 | " 13 | \n",
70 | " Never-married | \n",
71 | " Adm-clerical | \n",
72 | " Not-in-family | \n",
73 | " White | \n",
74 | " Male | \n",
75 | " 2174 | \n",
76 | " 0 | \n",
77 | " 40 | \n",
78 | " United-States | \n",
79 | " <=50K | \n",
80 | "
\n",
81 | " \n",
82 | " 1 | \n",
83 | " 50 | \n",
84 | " Self-emp-not-inc | \n",
85 | " 83311 | \n",
86 | " Bachelors | \n",
87 | " 13 | \n",
88 | " Married-civ-spouse | \n",
89 | " Exec-managerial | \n",
90 | " Husband | \n",
91 | " White | \n",
92 | " Male | \n",
93 | " 0 | \n",
94 | " 0 | \n",
95 | " 13 | \n",
96 | " United-States | \n",
97 | " <=50K | \n",
98 | "
\n",
99 | " \n",
100 | " 2 | \n",
101 | " 38 | \n",
102 | " Private | \n",
103 | " 215646 | \n",
104 | " HS-grad | \n",
105 | " 9 | \n",
106 | " Divorced | \n",
107 | " Handlers-cleaners | \n",
108 | " Not-in-family | \n",
109 | " White | \n",
110 | " Male | \n",
111 | " 0 | \n",
112 | " 0 | \n",
113 | " 40 | \n",
114 | " United-States | \n",
115 | " <=50K | \n",
116 | "
\n",
117 | " \n",
118 | " 3 | \n",
119 | " 53 | \n",
120 | " Private | \n",
121 | " 234721 | \n",
122 | " 11th | \n",
123 | " 7 | \n",
124 | " Married-civ-spouse | \n",
125 | " Handlers-cleaners | \n",
126 | " Husband | \n",
127 | " Black | \n",
128 | " Male | \n",
129 | " 0 | \n",
130 | " 0 | \n",
131 | " 40 | \n",
132 | " United-States | \n",
133 | " <=50K | \n",
134 | "
\n",
135 | " \n",
136 | " 4 | \n",
137 | " 28 | \n",
138 | " Private | \n",
139 | " 338409 | \n",
140 | " Bachelors | \n",
141 | " 13 | \n",
142 | " Married-civ-spouse | \n",
143 | " Prof-specialty | \n",
144 | " Wife | \n",
145 | " Black | \n",
146 | " Female | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 40 | \n",
150 | " Cuba | \n",
151 | " <=50K | \n",
152 | "
\n",
153 | " \n",
154 | "
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " age workclass fnlwgt education education-num \\\n",
159 | "0 39 State-gov 77516 Bachelors 13 \n",
160 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
161 | "2 38 Private 215646 HS-grad 9 \n",
162 | "3 53 Private 234721 11th 7 \n",
163 | "4 28 Private 338409 Bachelors 13 \n",
164 | "\n",
165 | " marital-status occupation relationship race sex \\\n",
166 | "0 Never-married Adm-clerical Not-in-family White Male \n",
167 | "1 Married-civ-spouse Exec-managerial Husband White Male \n",
168 | "2 Divorced Handlers-cleaners Not-in-family White Male \n",
169 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
170 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
171 | "\n",
172 | " capital-gain capital-loss hours-per-week native-country income \n",
173 | "0 2174 0 40 United-States <=50K \n",
174 | "1 0 0 13 United-States <=50K \n",
175 | "2 0 0 40 United-States <=50K \n",
176 | "3 0 0 40 United-States <=50K \n",
177 | "4 0 0 40 Cuba <=50K "
178 | ]
179 | },
180 | "execution_count": 2,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')\n",
187 | "df.head()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 3,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "data": {
197 | "text/plain": [
198 | "age 0\n",
199 | " workclass 1836\n",
200 | " fnlwgt 0\n",
201 | " education 0\n",
202 | " education-num 0\n",
203 | " marital-status 0\n",
204 | " occupation 1843\n",
205 | " relationship 0\n",
206 | " race 0\n",
207 | " sex 0\n",
208 | " capital-gain 0\n",
209 | " capital-loss 0\n",
210 | " hours-per-week 0\n",
211 | " native-country 583\n",
212 | " income 0\n",
213 | "dtype: int64"
214 | ]
215 | },
216 | "execution_count": 3,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "df.isna().sum()"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 4,
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "data": {
232 | "text/plain": [
233 | "(32561, 15)"
234 | ]
235 | },
236 | "execution_count": 4,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "df.shape"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 5,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/plain": [
253 | "(30162, 15)"
254 | ]
255 | },
256 | "execution_count": 5,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "df.dropna(inplace=True)\n",
263 | "df.shape"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 6,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "data": {
273 | "text/plain": [
274 | "age 0\n",
275 | " workclass 0\n",
276 | " fnlwgt 0\n",
277 | " education 0\n",
278 | " education-num 0\n",
279 | " marital-status 0\n",
280 | " occupation 0\n",
281 | " relationship 0\n",
282 | " race 0\n",
283 | " sex 0\n",
284 | " capital-gain 0\n",
285 | " capital-loss 0\n",
286 | " hours-per-week 0\n",
287 | " native-country 0\n",
288 | " income 0\n",
289 | "dtype: int64"
290 | ]
291 | },
292 | "execution_count": 6,
293 | "metadata": {},
294 | "output_type": "execute_result"
295 | }
296 | ],
297 | "source": [
298 | "df.isna().sum()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 7,
304 | "metadata": {},
305 | "outputs": [
306 | {
307 | "data": {
308 | "text/plain": [
309 | "Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',\n",
310 | " ' marital-status', ' occupation', ' relationship', ' race', ' sex',\n",
311 | " ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',\n",
312 | " ' income'],\n",
313 | " dtype='object')"
314 | ]
315 | },
316 | "execution_count": 7,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | }
320 | ],
321 | "source": [
322 | "df.columns"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 8,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/plain": [
333 | "Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',\n",
334 | " 'marital-status', 'occupation', 'relationship', 'race', 'sex',\n",
335 | " 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',\n",
336 | " 'income'],\n",
337 | " dtype='object')"
338 | ]
339 | },
340 | "execution_count": 8,
341 | "metadata": {},
342 | "output_type": "execute_result"
343 | }
344 | ],
345 | "source": [
346 | "df.columns = df.columns.str.strip()\n",
347 | "df.columns"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 9,
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df.income,\n",
357 | " test_size=0.2, random_state=0)"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": 19,
363 | "metadata": {},
364 | "outputs": [
365 | {
366 | "data": {
367 | "text/plain": [
368 | "['age',\n",
369 | " 'fnlwgt',\n",
370 | " 'education-num',\n",
371 | " 'capital-gain',\n",
372 | " 'capital-loss',\n",
373 | " 'hours-per-week']"
374 | ]
375 | },
376 | "execution_count": 19,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "num_cols = [col for col in X_train.columns if X_train[col].dtypes!='O']\n",
383 | "num_cols"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 20,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "data": {
393 | "text/plain": [
394 | "['workclass',\n",
395 | " 'education',\n",
396 | " 'marital-status',\n",
397 | " 'occupation',\n",
398 | " 'relationship',\n",
399 | " 'race',\n",
400 | " 'sex',\n",
401 | " 'native-country']"
402 | ]
403 | },
404 | "execution_count": 20,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "cat_cols = [col for col in X_train.columns if X_train[col].dtypes=='O']\n",
411 | "cat_cols"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 22,
417 | "metadata": {
418 | "scrolled": true
419 | },
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/html": [
424 | "\n",
425 | "\n",
438 | "
\n",
439 | " \n",
440 | " \n",
441 | " | \n",
442 | " education | \n",
443 | " education-num | \n",
444 | "
\n",
445 | " \n",
446 | " \n",
447 | " \n",
448 | " 0 | \n",
449 | " Bachelors | \n",
450 | " 13 | \n",
451 | "
\n",
452 | " \n",
453 | " 1 | \n",
454 | " Bachelors | \n",
455 | " 13 | \n",
456 | "
\n",
457 | " \n",
458 | " 2 | \n",
459 | " HS-grad | \n",
460 | " 9 | \n",
461 | "
\n",
462 | " \n",
463 | " 3 | \n",
464 | " 11th | \n",
465 | " 7 | \n",
466 | "
\n",
467 | " \n",
468 | " 4 | \n",
469 | " Bachelors | \n",
470 | " 13 | \n",
471 | "
\n",
472 | " \n",
473 | "
\n",
474 | "
"
475 | ],
476 | "text/plain": [
477 | " education education-num\n",
478 | "0 Bachelors 13\n",
479 | "1 Bachelors 13\n",
480 | "2 HS-grad 9\n",
481 | "3 11th 7\n",
482 | "4 Bachelors 13"
483 | ]
484 | },
485 | "execution_count": 22,
486 | "metadata": {},
487 | "output_type": "execute_result"
488 | }
489 | ],
490 | "source": [
491 | "df[['education', 'education-num']].head()"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 23,
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "ct = ColumnTransformer([\n",
501 | " ('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),\n",
502 | " ('step2', StandardScaler(), ['capital-gain', 'capital-loss', 'education-num']),\n",
503 | " ('step3', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass', \n",
504 | " 'marital-status', 'occupation',\n",
505 | " 'relationship', 'race', \n",
506 | " 'sex', 'native-country'])\n",
507 | "], remainder='drop')"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": []
516 | },
517 | {
518 | "cell_type": "markdown",
519 | "metadata": {},
520 | "source": [
521 | "# pipeline use case 1 - with a predictive model (the 'estimator') as the final step"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 52,
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "p = Pipeline([\n",
531 | " ('coltf_step', ct),\n",
532 | " ('model', DecisionTreeClassifier()),\n",
533 | "])"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 53,
539 | "metadata": {
540 | "scrolled": true
541 | },
542 | "outputs": [
543 | {
544 | "data": {
545 | "text/plain": [
546 | "Pipeline(steps=[('coltf_step',\n",
547 | " ColumnTransformer(transformers=[('step1', RobustScaler(),\n",
548 | " ['age', 'fnlwgt',\n",
549 | " 'hours-per-week']),\n",
550 | " ('step2', StandardScaler(),\n",
551 | " ['capital-gain',\n",
552 | " 'capital-loss',\n",
553 | " 'education-num']),\n",
554 | " ('step3',\n",
555 | " OneHotEncoder(handle_unknown='ignore',\n",
556 | " sparse=False),\n",
557 | " ['workclass',\n",
558 | " 'marital-status',\n",
559 | " 'occupation', 'relationship',\n",
560 | " 'race', 'sex',\n",
561 | " 'native-country'])])),\n",
562 | " ('model', DecisionTreeClassifier())])"
563 | ]
564 | },
565 | "execution_count": 53,
566 | "metadata": {},
567 | "output_type": "execute_result"
568 | }
569 | ],
570 | "source": [
571 | "p.fit(X_train, y_train)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 54,
577 | "metadata": {},
578 | "outputs": [
579 | {
580 | "data": {
581 | "text/plain": [
582 | "array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' <=50K', ' <=50K'],\n",
583 | " dtype=object)"
584 | ]
585 | },
586 | "execution_count": 54,
587 | "metadata": {},
588 | "output_type": "execute_result"
589 | }
590 | ],
591 | "source": [
592 | "p.predict(X_test)"
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 55,
598 | "metadata": {},
599 | "outputs": [
600 | {
601 | "data": {
602 | "text/plain": [
603 | "0.8078899386706447"
604 | ]
605 | },
606 | "execution_count": 55,
607 | "metadata": {},
608 | "output_type": "execute_result"
609 | }
610 | ],
611 | "source": [
612 | "p.score(X_test, y_test)"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": 56,
618 | "metadata": {},
619 | "outputs": [
620 | {
621 | "data": {
622 | "text/plain": [
623 | "{'coltf_step': ColumnTransformer(transformers=[('step1', RobustScaler(),\n",
624 | " ['age', 'fnlwgt', 'hours-per-week']),\n",
625 | " ('step2', StandardScaler(),\n",
626 | " ['capital-gain', 'capital-loss',\n",
627 | " 'education-num']),\n",
628 | " ('step3',\n",
629 | " OneHotEncoder(handle_unknown='ignore',\n",
630 | " sparse=False),\n",
631 | " ['workclass', 'marital-status', 'occupation',\n",
632 | " 'relationship', 'race', 'sex',\n",
633 | " 'native-country'])]),\n",
634 | " 'model': DecisionTreeClassifier()}"
635 | ]
636 | },
637 | "execution_count": 56,
638 | "metadata": {},
639 | "output_type": "execute_result"
640 | }
641 | ],
642 | "source": [
643 | "p.named_steps"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 57,
649 | "metadata": {},
650 | "outputs": [
651 | {
652 | "data": {
653 | "text/plain": [
654 | "[('step1', RobustScaler(), ['age', 'fnlwgt', 'hours-per-week']),\n",
655 | " ('step2',\n",
656 | " StandardScaler(),\n",
657 | " ['capital-gain', 'capital-loss', 'education-num']),\n",
658 | " ('step3',\n",
659 | " OneHotEncoder(handle_unknown='ignore', sparse=False),\n",
660 | " ['workclass',\n",
661 | " 'marital-status',\n",
662 | " 'occupation',\n",
663 | " 'relationship',\n",
664 | " 'race',\n",
665 | " 'sex',\n",
666 | " 'native-country']),\n",
667 | " ('remainder', 'drop', [3])]"
668 | ]
669 | },
670 | "execution_count": 57,
671 | "metadata": {},
672 | "output_type": "execute_result"
673 | }
674 | ],
675 | "source": [
676 | "p.named_steps['coltf_step'].transformers_"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": 58,
682 | "metadata": {
683 | "scrolled": true
684 | },
685 | "outputs": [
686 | {
687 | "data": {
688 | "text/plain": [
689 | "array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Private',\n",
690 | " 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc', 'x0_ State-gov',\n",
691 | " 'x0_ Without-pay', 'x1_ Divorced', 'x1_ Married-AF-spouse',\n",
692 | " 'x1_ Married-civ-spouse', 'x1_ Married-spouse-absent',\n",
693 | " 'x1_ Never-married', 'x1_ Separated', 'x1_ Widowed',\n",
694 | " 'x2_ Adm-clerical', 'x2_ Armed-Forces', 'x2_ Craft-repair',\n",
695 | " 'x2_ Exec-managerial', 'x2_ Farming-fishing',\n",
696 | " 'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',\n",
697 | " 'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',\n",
698 | " 'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',\n",
699 | " 'x2_ Transport-moving', 'x3_ Husband', 'x3_ Not-in-family',\n",
700 | " 'x3_ Other-relative', 'x3_ Own-child', 'x3_ Unmarried', 'x3_ Wife',\n",
701 | " 'x4_ Amer-Indian-Eskimo', 'x4_ Asian-Pac-Islander', 'x4_ Black',\n",
702 | " 'x4_ Other', 'x4_ White', 'x5_ Female', 'x5_ Male', 'x6_ Cambodia',\n",
703 | " 'x6_ Canada', 'x6_ China', 'x6_ Columbia', 'x6_ Cuba',\n",
704 | " 'x6_ Dominican-Republic', 'x6_ Ecuador', 'x6_ El-Salvador',\n",
705 | " 'x6_ England', 'x6_ France', 'x6_ Germany', 'x6_ Greece',\n",
706 | " 'x6_ Guatemala', 'x6_ Haiti', 'x6_ Holand-Netherlands',\n",
707 | " 'x6_ Honduras', 'x6_ Hong', 'x6_ Hungary', 'x6_ India', 'x6_ Iran',\n",
708 | " 'x6_ Ireland', 'x6_ Italy', 'x6_ Jamaica', 'x6_ Japan', 'x6_ Laos',\n",
709 | " 'x6_ Mexico', 'x6_ Nicaragua', 'x6_ Outlying-US(Guam-USVI-etc)',\n",
710 | " 'x6_ Peru', 'x6_ Philippines', 'x6_ Poland', 'x6_ Portugal',\n",
711 | " 'x6_ Puerto-Rico', 'x6_ Scotland', 'x6_ South', 'x6_ Taiwan',\n",
712 | " 'x6_ Thailand', 'x6_ Trinadad&Tobago', 'x6_ United-States',\n",
713 | " 'x6_ Vietnam', 'x6_ Yugoslavia'], dtype=object)"
714 | ]
715 | },
716 | "execution_count": 58,
717 | "metadata": {},
718 | "output_type": "execute_result"
719 | }
720 | ],
721 | "source": [
722 | "p.named_steps['coltf_step'].transformers_[2][1].get_feature_names()"
723 | ]
724 | },
725 | {
726 | "cell_type": "code",
727 | "execution_count": null,
728 | "metadata": {},
729 | "outputs": [],
730 | "source": []
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": 59,
735 | "metadata": {},
736 | "outputs": [
737 | {
738 | "data": {
739 | "text/plain": [
740 | "array([3.70000e+01, 1.78319e+05, 4.00000e+01])"
741 | ]
742 | },
743 | "execution_count": 59,
744 | "metadata": {},
745 | "output_type": "execute_result"
746 | }
747 | ],
748 | "source": [
749 | "p.named_steps['coltf_step'].transformers_[0][1].center_"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": null,
755 | "metadata": {},
756 | "outputs": [],
757 | "source": []
758 | },
759 | {
760 | "cell_type": "markdown",
761 | "metadata": {},
762 | "source": [
763 | "# pipeline use case 2 - with a transformer (no predictive model) as the final step"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 48,
769 | "metadata": {},
770 | "outputs": [],
771 | "source": [
772 | "p1 = Pipeline([\n",
773 | " ('coltf_step', ct),\n",
774 | " ('minmax', MinMaxScaler())\n",
775 | "# ('model', DecisionTreeClassifier()),\n",
776 | "])"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 50,
782 | "metadata": {},
783 | "outputs": [
784 | {
785 | "data": {
786 | "text/plain": [
787 | "Pipeline(steps=[('coltf_step',\n",
788 | " ColumnTransformer(transformers=[('step1', RobustScaler(),\n",
789 | " ['age', 'fnlwgt',\n",
790 | " 'hours-per-week']),\n",
791 | " ('step2', StandardScaler(),\n",
792 | " ['capital-gain',\n",
793 | " 'capital-loss',\n",
794 | " 'education-num']),\n",
795 | " ('step3',\n",
796 | " OneHotEncoder(handle_unknown='ignore',\n",
797 | " sparse=False),\n",
798 | " ['workclass',\n",
799 | " 'marital-status',\n",
800 | " 'occupation', 'relationship',\n",
801 | " 'race', 'sex',\n",
802 | " 'native-country'])])),\n",
803 | " ('minmax', MinMaxScaler())])"
804 | ]
805 | },
806 | "execution_count": 50,
807 | "metadata": {},
808 | "output_type": "execute_result"
809 | }
810 | ],
811 | "source": [
812 | "p1.fit(X_train)"
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": 51,
818 | "metadata": {},
819 | "outputs": [
820 | {
821 | "data": {
822 | "text/plain": [
823 | "array([[0.36986301, 0.04628617, 0.39795918, ..., 1. , 0. ,\n",
824 | " 0. ],\n",
825 | " [0.05479452, 0.1987476 , 0.19387755, ..., 1. , 0. ,\n",
826 | " 0. ],\n",
827 | " [0.26027397, 0.11716417, 0.39795918, ..., 1. , 0. ,\n",
828 | " 0. ],\n",
829 | " ...,\n",
830 | " [0.38356164, 0.17738365, 0.60204082, ..., 1. , 0. ,\n",
831 | " 0. ],\n",
832 | " [0.32876712, 0.23260631, 0.65306122, ..., 1. , 0. ,\n",
833 | " 0. ],\n",
834 | " [0.10958904, 0.09400613, 0.39795918, ..., 0. , 0. ,\n",
835 | " 0. ]])"
836 | ]
837 | },
838 | "execution_count": 51,
839 | "metadata": {},
840 | "output_type": "execute_result"
841 | }
842 | ],
843 | "source": [
844 | "p1.transform(X_test)"
845 | ]
846 | }
847 | ],
848 | "metadata": {
849 | "kernelspec": {
850 | "display_name": "Python 3",
851 | "language": "python",
852 | "name": "python3"
853 | },
854 | "language_info": {
855 | "codemirror_mode": {
856 | "name": "ipython",
857 | "version": 3
858 | },
859 | "file_extension": ".py",
860 | "mimetype": "text/x-python",
861 | "name": "python",
862 | "nbconvert_exporter": "python",
863 | "pygments_lexer": "ipython3",
864 | "version": "3.7.3"
865 | }
866 | },
867 | "nbformat": 4,
868 | "nbformat_minor": 2
869 | }
870 |
--------------------------------------------------------------------------------
/ppts/CM metrics 1-4.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/CM metrics 1-4.pptx
--------------------------------------------------------------------------------
/ppts/CM metrics 5-6.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/CM metrics 5-6.pptx
--------------------------------------------------------------------------------
/ppts/Feature scaling.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/Feature scaling.pptx
--------------------------------------------------------------------------------
/ppts/Grid Search.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/Grid Search.pptx
--------------------------------------------------------------------------------
/ppts/KNN Imputer Algorithm.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/KNN Imputer Algorithm.pptx
--------------------------------------------------------------------------------
/ppts/README.md:
--------------------------------------------------------------------------------
1 | This is where all the PowerPoint presentations I use in my Machine Learning tutorials on YouTube reside.
2 |
3 | You can access my Machine Learning playlist videos [here](https://www.youtube.com/playlist?list=PLlg4M31xJeYa7XcJZWypot8l7R-0E65Ls).
4 |
5 | Thank you for your interest :)
6 |
--------------------------------------------------------------------------------
/ppts/bias variance.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/bias variance.pptx
--------------------------------------------------------------------------------
/ppts/confusion matrix.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/confusion matrix.pptx
--------------------------------------------------------------------------------
/ppts/cross validation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/cross validation.pptx
--------------------------------------------------------------------------------
/ppts/mcc.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/mcc.pptx
--------------------------------------------------------------------------------
/ppts/mice.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/mice.pptx
--------------------------------------------------------------------------------
/ppts/outlier.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/outlier.pptx
--------------------------------------------------------------------------------
/ppts/roc pr auc.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rachittoshniwal/machineLearning/ad9beced031bf3bd55ce6de57f78b39821897ac8/ppts/roc pr auc.pptx
--------------------------------------------------------------------------------
/simple imputer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "from sklearn.impute import SimpleImputer\n",
12 | "from sklearn.model_selection import train_test_split"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " | \n",
41 | " age | \n",
42 | " workclass | \n",
43 | " fnlwgt | \n",
44 | " education | \n",
45 | " education-num | \n",
46 | " marital-status | \n",
47 | " occupation | \n",
48 | " relationship | \n",
49 | " race | \n",
50 | " sex | \n",
51 | " capital-gain | \n",
52 | " capital-loss | \n",
53 | " hours-per-week | \n",
54 | " native-country | \n",
55 | " income | \n",
56 | "
\n",
57 | " \n",
58 | " \n",
59 | " \n",
60 | " 0 | \n",
61 | " 39 | \n",
62 | " State-gov | \n",
63 | " 77516 | \n",
64 | " Bachelors | \n",
65 | " 13 | \n",
66 | " Never-married | \n",
67 | " Adm-clerical | \n",
68 | " Not-in-family | \n",
69 | " White | \n",
70 | " Male | \n",
71 | " 2174 | \n",
72 | " 0 | \n",
73 | " 40 | \n",
74 | " United-States | \n",
75 | " <=50K | \n",
76 | "
\n",
77 | " \n",
78 | " 1 | \n",
79 | " 50 | \n",
80 | " Self-emp-not-inc | \n",
81 | " 83311 | \n",
82 | " Bachelors | \n",
83 | " 13 | \n",
84 | " Married-civ-spouse | \n",
85 | " Exec-managerial | \n",
86 | " Husband | \n",
87 | " White | \n",
88 | " Male | \n",
89 | " 0 | \n",
90 | " 0 | \n",
91 | " 13 | \n",
92 | " United-States | \n",
93 | " <=50K | \n",
94 | "
\n",
95 | " \n",
96 | " 2 | \n",
97 | " 38 | \n",
98 | " Private | \n",
99 | " 215646 | \n",
100 | " HS-grad | \n",
101 | " 9 | \n",
102 | " Divorced | \n",
103 | " Handlers-cleaners | \n",
104 | " Not-in-family | \n",
105 | " White | \n",
106 | " Male | \n",
107 | " 0 | \n",
108 | " 0 | \n",
109 | " 40 | \n",
110 | " United-States | \n",
111 | " <=50K | \n",
112 | "
\n",
113 | " \n",
114 | " 3 | \n",
115 | " 53 | \n",
116 | " Private | \n",
117 | " 234721 | \n",
118 | " 11th | \n",
119 | " 7 | \n",
120 | " Married-civ-spouse | \n",
121 | " Handlers-cleaners | \n",
122 | " Husband | \n",
123 | " Black | \n",
124 | " Male | \n",
125 | " 0 | \n",
126 | " 0 | \n",
127 | " 40 | \n",
128 | " United-States | \n",
129 | " <=50K | \n",
130 | "
\n",
131 | " \n",
132 | " 4 | \n",
133 | " 28 | \n",
134 | " Private | \n",
135 | " 338409 | \n",
136 | " Bachelors | \n",
137 | " 13 | \n",
138 | " Married-civ-spouse | \n",
139 | " Prof-specialty | \n",
140 | " Wife | \n",
141 | " Black | \n",
142 | " Female | \n",
143 | " 0 | \n",
144 | " 0 | \n",
145 | " 40 | \n",
146 | " Cuba | \n",
147 | " <=50K | \n",
148 | "
\n",
149 | " \n",
150 | "
\n",
151 | "
"
152 | ],
153 | "text/plain": [
154 | " age workclass fnlwgt education education-num \\\n",
155 | "0 39 State-gov 77516 Bachelors 13 \n",
156 | "1 50 Self-emp-not-inc 83311 Bachelors 13 \n",
157 | "2 38 Private 215646 HS-grad 9 \n",
158 | "3 53 Private 234721 11th 7 \n",
159 | "4 28 Private 338409 Bachelors 13 \n",
160 | "\n",
161 | " marital-status occupation relationship race sex \\\n",
162 | "0 Never-married Adm-clerical Not-in-family White Male \n",
163 | "1 Married-civ-spouse Exec-managerial Husband White Male \n",
164 | "2 Divorced Handlers-cleaners Not-in-family White Male \n",
165 | "3 Married-civ-spouse Handlers-cleaners Husband Black Male \n",
166 | "4 Married-civ-spouse Prof-specialty Wife Black Female \n",
167 | "\n",
168 | " capital-gain capital-loss hours-per-week native-country income \n",
169 | "0 2174 0 40 United-States <=50K \n",
170 | "1 0 0 13 United-States <=50K \n",
171 | "2 0 0 40 United-States <=50K \n",
172 | "3 0 0 40 United-States <=50K \n",
173 | "4 0 0 40 Cuba <=50K "
174 | ]
175 | },
176 | "execution_count": 2,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "df = pd.read_csv('data/income_evaluation.csv', na_values = ' ?')\n",
183 | "df.head()"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 4,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "age 0\n",
195 | " workclass 1836\n",
196 | " fnlwgt 0\n",
197 | " education 0\n",
198 | " education-num 0\n",
199 | " marital-status 0\n",
200 | " occupation 1843\n",
201 | " relationship 0\n",
202 | " race 0\n",
203 | " sex 0\n",
204 | " capital-gain 0\n",
205 | " capital-loss 0\n",
206 | " hours-per-week 0\n",
207 | " native-country 583\n",
208 | " income 0\n",
209 | "dtype: int64"
210 | ]
211 | },
212 | "execution_count": 4,
213 | "metadata": {},
214 | "output_type": "execute_result"
215 | }
216 | ],
217 | "source": [
218 | "df.isna().sum()"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": []
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 4,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "# simulate missing data: set hours-per-week to NaN for 20 randomly chosen rows (sampled without replacement)\n",
235 | "np.random.seed(seed=0)\n",
236 | "h = np.random.choice(a=df.index, replace=False, size=20)\n",
237 | "df.loc[h, ' hours-per-week'] = np.nan"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 5,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "# simulate missing data: set age to NaN for 28 randomly chosen rows (sampled without replacement)\n",
247 | "np.random.seed(seed=10)\n",
248 | "a = np.random.choice(a=df.index, replace=False, size=28)\n",
249 | "df.loc[a, 'age'] = np.nan"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 6,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1),\n",
259 | " df[' income'], test_size=0.2,\n",
260 | " random_state=30)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 7,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "si_age = SimpleImputer(strategy='mean', add_indicator=True)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 12,
275 | "metadata": {
276 | "scrolled": true
277 | },
278 | "outputs": [],
279 | "source": [
280 | "a = pd.DataFrame(si_age.fit_transform(X_train[['age']]))"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 11,
286 | "metadata": {},
287 | "outputs": [
288 | {
289 | "data": {
290 | "text/plain": [
291 | "array([38.54201729])"
292 | ]
293 | },
294 | "execution_count": 11,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": [
300 | "si_age.statistics_"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 16,
306 | "metadata": {
307 | "scrolled": true
308 | },
309 | "outputs": [
310 | {
311 | "data": {
312 | "text/html": [
313 | "\n",
314 | "\n",
327 | "
\n",
328 | " \n",
329 | " \n",
330 | " | \n",
331 | " 0 | \n",
332 | " 1 | \n",
333 | "
\n",
334 | " \n",
335 | " \n",
336 | " \n",
337 | " 2969 | \n",
338 | " 38.542017 | \n",
339 | " 1.0 | \n",
340 | "
\n",
341 | " \n",
342 | " 3219 | \n",
343 | " 38.542017 | \n",
344 | " 1.0 | \n",
345 | "
\n",
346 | " \n",
347 | " 3522 | \n",
348 | " 38.542017 | \n",
349 | " 1.0 | \n",
350 | "
\n",
351 | " \n",
352 | " 4925 | \n",
353 | " 38.542017 | \n",
354 | " 1.0 | \n",
355 | "
\n",
356 | " \n",
357 | " 5543 | \n",
358 | " 38.542017 | \n",
359 | " 1.0 | \n",
360 | "
\n",
361 | " \n",
362 | " 5754 | \n",
363 | " 38.542017 | \n",
364 | " 1.0 | \n",
365 | "
\n",
366 | " \n",
367 | " 6305 | \n",
368 | " 38.542017 | \n",
369 | " 1.0 | \n",
370 | "
\n",
371 | " \n",
372 | " 7237 | \n",
373 | " 38.542017 | \n",
374 | " 1.0 | \n",
375 | "
\n",
376 | " \n",
377 | " 8587 | \n",
378 | " 38.542017 | \n",
379 | " 1.0 | \n",
380 | "
\n",
381 | " \n",
382 | " 11314 | \n",
383 | " 38.542017 | \n",
384 | " 1.0 | \n",
385 | "
\n",
386 | " \n",
387 | " 12112 | \n",
388 | " 38.542017 | \n",
389 | " 1.0 | \n",
390 | "
\n",
391 | " \n",
392 | " 12274 | \n",
393 | " 38.542017 | \n",
394 | " 1.0 | \n",
395 | "
\n",
396 | " \n",
397 | " 12591 | \n",
398 | " 38.542017 | \n",
399 | " 1.0 | \n",
400 | "
\n",
401 | " \n",
402 | " 12746 | \n",
403 | " 38.542017 | \n",
404 | " 1.0 | \n",
405 | "
\n",
406 | " \n",
407 | " 16927 | \n",
408 | " 38.542017 | \n",
409 | " 1.0 | \n",
410 | "
\n",
411 | " \n",
412 | " 17594 | \n",
413 | " 38.542017 | \n",
414 | " 1.0 | \n",
415 | "
\n",
416 | " \n",
417 | " 18909 | \n",
418 | " 38.542017 | \n",
419 | " 1.0 | \n",
420 | "
\n",
421 | " \n",
422 | " 20271 | \n",
423 | " 38.542017 | \n",
424 | " 1.0 | \n",
425 | "
\n",
426 | " \n",
427 | " 20366 | \n",
428 | " 38.542017 | \n",
429 | " 1.0 | \n",
430 | "
\n",
431 | " \n",
432 | " 20414 | \n",
433 | " 38.542017 | \n",
434 | " 1.0 | \n",
435 | "
\n",
436 | " \n",
437 | " 21598 | \n",
438 | " 38.542017 | \n",
439 | " 1.0 | \n",
440 | "
\n",
441 | " \n",
442 | " 22807 | \n",
443 | " 38.542017 | \n",
444 | " 1.0 | \n",
445 | "
\n",
446 | " \n",
447 | " 25240 | \n",
448 | " 38.542017 | \n",
449 | " 1.0 | \n",
450 | "
\n",
451 | " \n",
452 | "
\n",
453 | "
"
454 | ],
455 | "text/plain": [
456 | " 0 1\n",
457 | "2969 38.542017 1.0\n",
458 | "3219 38.542017 1.0\n",
459 | "3522 38.542017 1.0\n",
460 | "4925 38.542017 1.0\n",
461 | "5543 38.542017 1.0\n",
462 | "5754 38.542017 1.0\n",
463 | "6305 38.542017 1.0\n",
464 | "7237 38.542017 1.0\n",
465 | "8587 38.542017 1.0\n",
466 | "11314 38.542017 1.0\n",
467 | "12112 38.542017 1.0\n",
468 | "12274 38.542017 1.0\n",
469 | "12591 38.542017 1.0\n",
470 | "12746 38.542017 1.0\n",
471 | "16927 38.542017 1.0\n",
472 | "17594 38.542017 1.0\n",
473 | "18909 38.542017 1.0\n",
474 | "20271 38.542017 1.0\n",
475 | "20366 38.542017 1.0\n",
476 | "20414 38.542017 1.0\n",
477 | "21598 38.542017 1.0\n",
478 | "22807 38.542017 1.0\n",
479 | "25240 38.542017 1.0"
480 | ]
481 | },
482 | "execution_count": 16,
483 | "metadata": {},
484 | "output_type": "execute_result"
485 | }
486 | ],
487 | "source": [
488 | "a[a[1] == 1]"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 21,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "si_occ = SimpleImputer(strategy='constant', add_indicator=True, fill_value='not available')"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 22,
503 | "metadata": {},
504 | "outputs": [
505 | {
506 | "data": {
507 | "text/plain": [
508 | "array([[' Exec-managerial', False],\n",
509 | " [' Transport-moving', False],\n",
510 | " [' Transport-moving', False],\n",
511 | " ...,\n",
512 | " [' Other-service', False],\n",
513 | " [' Sales', False],\n",
514 | " [' Tech-support', False]], dtype=object)"
515 | ]
516 | },
517 | "execution_count": 22,
518 | "metadata": {},
519 | "output_type": "execute_result"
520 | }
521 | ],
522 | "source": [
523 | "si_occ.fit_transform(X_train[[' occupation']])"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": 23,
529 | "metadata": {
530 | "scrolled": true
531 | },
532 | "outputs": [
533 | {
534 | "data": {
535 | "text/html": [
536 | "\n",
537 | "\n",
550 | "
\n",
551 | " \n",
552 | " \n",
553 | " | \n",
554 | " 0 | \n",
555 | " 1 | \n",
556 | "
\n",
557 | " \n",
558 | " \n",
559 | " \n",
560 | " 0 | \n",
561 | " Exec-managerial | \n",
562 | " False | \n",
563 | "
\n",
564 | " \n",
565 | " 1 | \n",
566 | " Transport-moving | \n",
567 | " False | \n",
568 | "
\n",
569 | " \n",
570 | " 2 | \n",
571 | " Transport-moving | \n",
572 | " False | \n",
573 | "
\n",
574 | " \n",
575 | " 3 | \n",
576 | " Craft-repair | \n",
577 | " False | \n",
578 | "
\n",
579 | " \n",
580 | " 4 | \n",
581 | " Adm-clerical | \n",
582 | " False | \n",
583 | "
\n",
584 | " \n",
585 | " 5 | \n",
586 | " Sales | \n",
587 | " False | \n",
588 | "
\n",
589 | " \n",
590 | " 6 | \n",
591 | " Machine-op-inspct | \n",
592 | " False | \n",
593 | "
\n",
594 | " \n",
595 | " 7 | \n",
596 | " Farming-fishing | \n",
597 | " False | \n",
598 | "
\n",
599 | " \n",
600 | " 8 | \n",
601 | " Adm-clerical | \n",
602 | " False | \n",
603 | "
\n",
604 | " \n",
605 | " 9 | \n",
606 | " Machine-op-inspct | \n",
607 | " False | \n",
608 | "
\n",
609 | " \n",
610 | " 10 | \n",
611 | " Adm-clerical | \n",
612 | " False | \n",
613 | "
\n",
614 | " \n",
615 | " 11 | \n",
616 | " Other-service | \n",
617 | " False | \n",
618 | "
\n",
619 | " \n",
620 | " 12 | \n",
621 | " Sales | \n",
622 | " False | \n",
623 | "
\n",
624 | " \n",
625 | " 13 | \n",
626 | " Exec-managerial | \n",
627 | " False | \n",
628 | "
\n",
629 | " \n",
630 | " 14 | \n",
631 | " Craft-repair | \n",
632 | " False | \n",
633 | "
\n",
634 | " \n",
635 | " 15 | \n",
636 | " Adm-clerical | \n",
637 | " False | \n",
638 | "
\n",
639 | " \n",
640 | " 16 | \n",
641 | " Other-service | \n",
642 | " False | \n",
643 | "
\n",
644 | " \n",
645 | " 17 | \n",
646 | " Tech-support | \n",
647 | " False | \n",
648 | "
\n",
649 | " \n",
650 | " 18 | \n",
651 | " Sales | \n",
652 | " False | \n",
653 | "
\n",
654 | " \n",
655 | " 19 | \n",
656 | " Farming-fishing | \n",
657 | " False | \n",
658 | "
\n",
659 | " \n",
660 | " 20 | \n",
661 | " Protective-serv | \n",
662 | " False | \n",
663 | "
\n",
664 | " \n",
665 | " 21 | \n",
666 | " Craft-repair | \n",
667 | " False | \n",
668 | "
\n",
669 | " \n",
670 | " 22 | \n",
671 | " Prof-specialty | \n",
672 | " False | \n",
673 | "
\n",
674 | " \n",
675 | " 23 | \n",
676 | " Machine-op-inspct | \n",
677 | " False | \n",
678 | "
\n",
679 | " \n",
680 | " 24 | \n",
681 | " Exec-managerial | \n",
682 | " False | \n",
683 | "
\n",
684 | " \n",
685 | " 25 | \n",
686 | " Craft-repair | \n",
687 | " False | \n",
688 | "
\n",
689 | " \n",
690 | " 26 | \n",
691 | " not available | \n",
692 | " True | \n",
693 | "
\n",
694 | " \n",
695 | " 27 | \n",
696 | " Handlers-cleaners | \n",
697 | " False | \n",
698 | "
\n",
699 | " \n",
700 | " 28 | \n",
701 | " Other-service | \n",
702 | " False | \n",
703 | "
\n",
704 | " \n",
705 | " 29 | \n",
706 | " not available | \n",
707 | " True | \n",
708 | "
\n",
709 | " \n",
710 | " ... | \n",
711 | " ... | \n",
712 | " ... | \n",
713 | "
\n",
714 | " \n",
715 | " 26018 | \n",
716 | " Transport-moving | \n",
717 | " False | \n",
718 | "
\n",
719 | " \n",
720 | " 26019 | \n",
721 | " Prof-specialty | \n",
722 | " False | \n",
723 | "
\n",
724 | " \n",
725 | " 26020 | \n",
726 | " Sales | \n",
727 | " False | \n",
728 | "
\n",
729 | " \n",
730 | " 26021 | \n",
731 | " Adm-clerical | \n",
732 | " False | \n",
733 | "
\n",
734 | " \n",
735 | " 26022 | \n",
736 | " Tech-support | \n",
737 | " False | \n",
738 | "
\n",
739 | " \n",
740 | " 26023 | \n",
741 | " Adm-clerical | \n",
742 | " False | \n",
743 | "
\n",
744 | " \n",
745 | " 26024 | \n",
746 | " Craft-repair | \n",
747 | " False | \n",
748 | "
\n",
749 | " \n",
750 | " 26025 | \n",
751 | " Adm-clerical | \n",
752 | " False | \n",
753 | "
\n",
754 | " \n",
755 | " 26026 | \n",
756 | " Adm-clerical | \n",
757 | " False | \n",
758 | "
\n",
759 | " \n",
760 | " 26027 | \n",
761 | " Machine-op-inspct | \n",
762 | " False | \n",
763 | "
\n",
764 | " \n",
765 | " 26028 | \n",
766 | " not available | \n",
767 | " True | \n",
768 | "
\n",
769 | " \n",
770 | " 26029 | \n",
771 | " Machine-op-inspct | \n",
772 | " False | \n",
773 | "
\n",
774 | " \n",
775 | " 26030 | \n",
776 | " Craft-repair | \n",
777 | " False | \n",
778 | "
\n",
779 | " \n",
780 | " 26031 | \n",
781 | " Handlers-cleaners | \n",
782 | " False | \n",
783 | "
\n",
784 | " \n",
785 | " 26032 | \n",
786 | " Machine-op-inspct | \n",
787 | " False | \n",
788 | "
\n",
789 | " \n",
790 | " 26033 | \n",
791 | " Sales | \n",
792 | " False | \n",
793 | "
\n",
794 | " \n",
795 | " 26034 | \n",
796 | " Protective-serv | \n",
797 | " False | \n",
798 | "
\n",
799 | " \n",
800 | " 26035 | \n",
801 | " Farming-fishing | \n",
802 | " False | \n",
803 | "
\n",
804 | " \n",
805 | " 26036 | \n",
806 | " Exec-managerial | \n",
807 | " False | \n",
808 | "
\n",
809 | " \n",
810 | " 26037 | \n",
811 | " Exec-managerial | \n",
812 | " False | \n",
813 | "
\n",
814 | " \n",
815 | " 26038 | \n",
816 | " Farming-fishing | \n",
817 | " False | \n",
818 | "
\n",
819 | " \n",
820 | " 26039 | \n",
821 | " Other-service | \n",
822 | " False | \n",
823 | "
\n",
824 | " \n",
825 | " 26040 | \n",
826 | " Prof-specialty | \n",
827 | " False | \n",
828 | "
\n",
829 | " \n",
830 | " 26041 | \n",
831 | " Other-service | \n",
832 | " False | \n",
833 | "
\n",
834 | " \n",
835 | " 26042 | \n",
836 | " Adm-clerical | \n",
837 | " False | \n",
838 | "
\n",
839 | " \n",
840 | " 26043 | \n",
841 | " Farming-fishing | \n",
842 | " False | \n",
843 | "
\n",
844 | " \n",
845 | " 26044 | \n",
846 | " Adm-clerical | \n",
847 | " False | \n",
848 | "
\n",
849 | " \n",
850 | " 26045 | \n",
851 | " Other-service | \n",
852 | " False | \n",
853 | "
\n",
854 | " \n",
855 | " 26046 | \n",
856 | " Sales | \n",
857 | " False | \n",
858 | "
\n",
859 | " \n",
860 | " 26047 | \n",
861 | " Tech-support | \n",
862 | " False | \n",
863 | "
\n",
864 | " \n",
865 | "
\n",
866 | "
26048 rows × 2 columns
\n",
867 | "
"
868 | ],
869 | "text/plain": [
870 | " 0 1\n",
871 | "0 Exec-managerial False\n",
872 | "1 Transport-moving False\n",
873 | "2 Transport-moving False\n",
874 | "3 Craft-repair False\n",
875 | "4 Adm-clerical False\n",
876 | "5 Sales False\n",
877 | "6 Machine-op-inspct False\n",
878 | "7 Farming-fishing False\n",
879 | "8 Adm-clerical False\n",
880 | "9 Machine-op-inspct False\n",
881 | "10 Adm-clerical False\n",
882 | "11 Other-service False\n",
883 | "12 Sales False\n",
884 | "13 Exec-managerial False\n",
885 | "14 Craft-repair False\n",
886 | "15 Adm-clerical False\n",
887 | "16 Other-service False\n",
888 | "17 Tech-support False\n",
889 | "18 Sales False\n",
890 | "19 Farming-fishing False\n",
891 | "20 Protective-serv False\n",
892 | "21 Craft-repair False\n",
893 | "22 Prof-specialty False\n",
894 | "23 Machine-op-inspct False\n",
895 | "24 Exec-managerial False\n",
896 | "25 Craft-repair False\n",
897 | "26 not available True\n",
898 | "27 Handlers-cleaners False\n",
899 | "28 Other-service False\n",
900 | "29 not available True\n",
901 | "... ... ...\n",
902 | "26018 Transport-moving False\n",
903 | "26019 Prof-specialty False\n",
904 | "26020 Sales False\n",
905 | "26021 Adm-clerical False\n",
906 | "26022 Tech-support False\n",
907 | "26023 Adm-clerical False\n",
908 | "26024 Craft-repair False\n",
909 | "26025 Adm-clerical False\n",
910 | "26026 Adm-clerical False\n",
911 | "26027 Machine-op-inspct False\n",
912 | "26028 not available True\n",
913 | "26029 Machine-op-inspct False\n",
914 | "26030 Craft-repair False\n",
915 | "26031 Handlers-cleaners False\n",
916 | "26032 Machine-op-inspct False\n",
917 | "26033 Sales False\n",
918 | "26034 Protective-serv False\n",
919 | "26035 Farming-fishing False\n",
920 | "26036 Exec-managerial False\n",
921 | "26037 Exec-managerial False\n",
922 | "26038 Farming-fishing False\n",
923 | "26039 Other-service False\n",
924 | "26040 Prof-specialty False\n",
925 | "26041 Other-service False\n",
926 | "26042 Adm-clerical False\n",
927 | "26043 Farming-fishing False\n",
928 | "26044 Adm-clerical False\n",
929 | "26045 Other-service False\n",
930 | "26046 Sales False\n",
931 | "26047 Tech-support False\n",
932 | "\n",
933 | "[26048 rows x 2 columns]"
934 | ]
935 | },
936 | "execution_count": 23,
937 | "metadata": {},
938 | "output_type": "execute_result"
939 | }
940 | ],
941 | "source": [
942 | "pd.DataFrame(si_occ.fit_transform(X_train[[' occupation']]))"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 24,
948 | "metadata": {},
949 | "outputs": [
950 | {
951 | "data": {
952 | "text/plain": [
953 | "array([[48., 0.],\n",
954 | " [63., 0.],\n",
955 | " [33., 0.],\n",
956 | " ...,\n",
957 | " [48., 0.],\n",
958 | " [54., 0.],\n",
959 | " [58., 0.]])"
960 | ]
961 | },
962 | "execution_count": 24,
963 | "metadata": {},
964 | "output_type": "execute_result"
965 | }
966 | ],
967 | "source": [
968 | "si_age.transform(X_test[['age']])"
969 | ]
970 | },
971 | {
972 | "cell_type": "code",
973 | "execution_count": 26,
974 | "metadata": {},
975 | "outputs": [],
976 | "source": [
977 | "b = pd.DataFrame(si_age.transform(X_test[['age']]))"
978 | ]
979 | },
980 | {
981 | "cell_type": "code",
982 | "execution_count": 29,
983 | "metadata": {},
984 | "outputs": [
985 | {
986 | "data": {
987 | "text/html": [
988 | "\n",
989 | "\n",
1002 | "
\n",
1003 | " \n",
1004 | " \n",
1005 | " | \n",
1006 | " 0 | \n",
1007 | " 1 | \n",
1008 | "
\n",
1009 | " \n",
1010 | " \n",
1011 | " \n",
1012 | " 2526 | \n",
1013 | " 38.542017 | \n",
1014 | " 1.0 | \n",
1015 | "
\n",
1016 | " \n",
1017 | " 4068 | \n",
1018 | " 38.542017 | \n",
1019 | " 1.0 | \n",
1020 | "
\n",
1021 | " \n",
1022 | " 4111 | \n",
1023 | " 38.542017 | \n",
1024 | " 1.0 | \n",
1025 | "
\n",
1026 | " \n",
1027 | " 5324 | \n",
1028 | " 38.542017 | \n",
1029 | " 1.0 | \n",
1030 | "
\n",
1031 | " \n",
1032 | " 5930 | \n",
1033 | " 38.542017 | \n",
1034 | " 1.0 | \n",
1035 | "
\n",
1036 | " \n",
1037 | "
\n",
1038 | "
"
1039 | ],
1040 | "text/plain": [
1041 | " 0 1\n",
1042 | "2526 38.542017 1.0\n",
1043 | "4068 38.542017 1.0\n",
1044 | "4111 38.542017 1.0\n",
1045 | "5324 38.542017 1.0\n",
1046 | "5930 38.542017 1.0"
1047 | ]
1048 | },
1049 | "execution_count": 29,
1050 | "metadata": {},
1051 | "output_type": "execute_result"
1052 | }
1053 | ],
1054 | "source": [
1055 | "b[b[1] == 1]"
1056 | ]
1057 | }
1058 | ],
1059 | "metadata": {
1060 | "kernelspec": {
1061 | "display_name": "Python 3",
1062 | "language": "python",
1063 | "name": "python3"
1064 | },
1065 | "language_info": {
1066 | "codemirror_mode": {
1067 | "name": "ipython",
1068 | "version": 3
1069 | },
1070 | "file_extension": ".py",
1071 | "mimetype": "text/x-python",
1072 | "name": "python",
1073 | "nbconvert_exporter": "python",
1074 | "pygments_lexer": "ipython3",
1075 | "version": "3.7.3"
1076 | }
1077 | },
1078 | "nbformat": 4,
1079 | "nbformat_minor": 2
1080 | }
1081 |
--------------------------------------------------------------------------------
/why NEVER use pd.get_dummies.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | "\n",
21 | "\n",
34 | "
\n",
35 | " \n",
36 | " \n",
37 | " | \n",
38 | " color | \n",
39 | " height | \n",
40 | " petals | \n",
41 | " days | \n",
42 | "
\n",
43 | " \n",
44 | " \n",
45 | " \n",
46 | " 0 | \n",
47 | " red | \n",
48 | " 4.0 | \n",
49 | " 3 | \n",
50 | " 6 | \n",
51 | "
\n",
52 | " \n",
53 | " 1 | \n",
54 | " green | \n",
55 | " 9.0 | \n",
56 | " 9 | \n",
57 | " 16 | \n",
58 | "
\n",
59 | " \n",
60 | " 2 | \n",
61 | " red | \n",
62 | " 4.0 | \n",
63 | " 1 | \n",
64 | " 7 | \n",
65 | "
\n",
66 | " \n",
67 | " 3 | \n",
68 | " green | \n",
69 | " 8.0 | \n",
70 | " 8 | \n",
71 | " 15 | \n",
72 | "
\n",
73 | " \n",
74 | " 4 | \n",
75 | " red | \n",
76 | " 4.0 | \n",
77 | " 1 | \n",
78 | " 8 | \n",
79 | "
\n",
80 | " \n",
81 | " 5 | \n",
82 | " green | \n",
83 | " 7.0 | \n",
84 | " 10 | \n",
85 | " 17 | \n",
86 | "
\n",
87 | " \n",
88 | " 6 | \n",
89 | " red | \n",
90 | " 4.0 | \n",
91 | " 2 | \n",
92 | " 5 | \n",
93 | "
\n",
94 | " \n",
95 | " 7 | \n",
96 | " green | \n",
97 | " 7.5 | \n",
98 | " 8 | \n",
99 | " 12 | \n",
100 | "
\n",
101 | " \n",
102 | " 8 | \n",
103 | " blue | \n",
104 | " 20.0 | \n",
105 | " 50 | \n",
106 | " 40 | \n",
107 | "
\n",
108 | " \n",
109 | " 9 | \n",
110 | " blue | \n",
111 | " 19.0 | \n",
112 | " 47 | \n",
113 | " 45 | \n",
114 | "
\n",
115 | " \n",
116 | "
\n",
117 | "
"
118 | ],
119 | "text/plain": [
120 | " color height petals days\n",
121 | "0 red 4.0 3 6\n",
122 | "1 green 9.0 9 16\n",
123 | "2 red 4.0 1 7\n",
124 | "3 green 8.0 8 15\n",
125 | "4 red 4.0 1 8\n",
126 | "5 green 7.0 10 17\n",
127 | "6 red 4.0 2 5\n",
128 | "7 green 7.5 8 12\n",
129 | "8 blue 20.0 50 40\n",
130 | "9 blue 19.0 47 45"
131 | ]
132 | },
133 | "execution_count": 2,
134 | "metadata": {},
135 | "output_type": "execute_result"
136 | }
137 | ],
138 | "source": [
139 | "flowers = pd.DataFrame({\n",
140 | " 'color' : ['red', 'green', 'red', 'green', 'red', 'green', 'red', 'green', 'blue', 'blue'],\n",
141 | " 'height': [4,9,4,8,4,7,4,7.5,20,19],\n",
142 | " 'petals': [3,9,1,8,1,10,2,8,50,47],\n",
143 | " 'days' : [6,16,7,15,8,17,5,12,40,45]\n",
144 | "})\n",
145 | "flowers"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 3,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "from sklearn.model_selection import train_test_split\n",
155 | "X_train, X_test, y_train, y_test = train_test_split(flowers.drop('days', axis=1), flowers['days'],\n",
156 | " test_size=0.2, random_state=40\n",
157 | ")"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 4,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "data": {
167 | "text/html": [
168 | "\n",
169 | "\n",
182 | "
\n",
183 | " \n",
184 | " \n",
185 | " | \n",
186 | " color | \n",
187 | " height | \n",
188 | " petals | \n",
189 | "
\n",
190 | " \n",
191 | " \n",
192 | " \n",
193 | " 8 | \n",
194 | " blue | \n",
195 | " 20.0 | \n",
196 | " 50 | \n",
197 | "
\n",
198 | " \n",
199 | " 1 | \n",
200 | " green | \n",
201 | " 9.0 | \n",
202 | " 9 | \n",
203 | "
\n",
204 | " \n",
205 | " 2 | \n",
206 | " red | \n",
207 | " 4.0 | \n",
208 | " 1 | \n",
209 | "
\n",
210 | " \n",
211 | " 9 | \n",
212 | " blue | \n",
213 | " 19.0 | \n",
214 | " 47 | \n",
215 | "
\n",
216 | " \n",
217 | " 0 | \n",
218 | " red | \n",
219 | " 4.0 | \n",
220 | " 3 | \n",
221 | "
\n",
222 | " \n",
223 | " 5 | \n",
224 | " green | \n",
225 | " 7.0 | \n",
226 | " 10 | \n",
227 | "
\n",
228 | " \n",
229 | " 7 | \n",
230 | " green | \n",
231 | " 7.5 | \n",
232 | " 8 | \n",
233 | "
\n",
234 | " \n",
235 | " 6 | \n",
236 | " red | \n",
237 | " 4.0 | \n",
238 | " 2 | \n",
239 | "
\n",
240 | " \n",
241 | "
\n",
242 | "
"
243 | ],
244 | "text/plain": [
245 | " color height petals\n",
246 | "8 blue 20.0 50\n",
247 | "1 green 9.0 9\n",
248 | "2 red 4.0 1\n",
249 | "9 blue 19.0 47\n",
250 | "0 red 4.0 3\n",
251 | "5 green 7.0 10\n",
252 | "7 green 7.5 8\n",
253 | "6 red 4.0 2"
254 | ]
255 | },
256 | "execution_count": 4,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "X_train"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 7,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "name": "stdout",
272 | "output_type": "stream",
273 | "text": [
274 | "\n",
275 | "Int64Index: 8 entries, 8 to 6\n",
276 | "Data columns (total 3 columns):\n",
277 | "color 8 non-null object\n",
278 | "height 8 non-null float64\n",
279 | "petals 8 non-null int64\n",
280 | "dtypes: float64(1), int64(1), object(1)\n",
281 | "memory usage: 256.0+ bytes\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "X_train.info()"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 8,
292 | "metadata": {},
293 | "outputs": [
294 | {
295 | "data": {
296 | "text/plain": [
297 | "array(['blue', 'green', 'red'], dtype=object)"
298 | ]
299 | },
300 | "execution_count": 8,
301 | "metadata": {},
302 | "output_type": "execute_result"
303 | }
304 | ],
305 | "source": [
306 | "X_train['color'].unique()"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 10,
312 | "metadata": {},
313 | "outputs": [
314 | {
315 | "data": {
316 | "text/html": [
317 | "\n",
318 | "\n",
331 | "
\n",
332 | " \n",
333 | " \n",
334 | " | \n",
335 | " height | \n",
336 | " petals | \n",
337 | " color_blue | \n",
338 | " color_green | \n",
339 | " color_red | \n",
340 | " color | \n",
341 | "
\n",
342 | " \n",
343 | " \n",
344 | " \n",
345 | " 8 | \n",
346 | " 20.0 | \n",
347 | " 50 | \n",
348 | " 1 | \n",
349 | " 0 | \n",
350 | " 0 | \n",
351 | " blue | \n",
352 | "
\n",
353 | " \n",
354 | " 1 | \n",
355 | " 9.0 | \n",
356 | " 9 | \n",
357 | " 0 | \n",
358 | " 1 | \n",
359 | " 0 | \n",
360 | " green | \n",
361 | "
\n",
362 | " \n",
363 | " 2 | \n",
364 | " 4.0 | \n",
365 | " 1 | \n",
366 | " 0 | \n",
367 | " 0 | \n",
368 | " 1 | \n",
369 | " red | \n",
370 | "
\n",
371 | " \n",
372 | " 9 | \n",
373 | " 19.0 | \n",
374 | " 47 | \n",
375 | " 1 | \n",
376 | " 0 | \n",
377 | " 0 | \n",
378 | " blue | \n",
379 | "
\n",
380 | " \n",
381 | " 0 | \n",
382 | " 4.0 | \n",
383 | " 3 | \n",
384 | " 0 | \n",
385 | " 0 | \n",
386 | " 1 | \n",
387 | " red | \n",
388 | "
\n",
389 | " \n",
390 | " 5 | \n",
391 | " 7.0 | \n",
392 | " 10 | \n",
393 | " 0 | \n",
394 | " 1 | \n",
395 | " 0 | \n",
396 | " green | \n",
397 | "
\n",
398 | " \n",
399 | " 7 | \n",
400 | " 7.5 | \n",
401 | " 8 | \n",
402 | " 0 | \n",
403 | " 1 | \n",
404 | " 0 | \n",
405 | " green | \n",
406 | "
\n",
407 | " \n",
408 | " 6 | \n",
409 | " 4.0 | \n",
410 | " 2 | \n",
411 | " 0 | \n",
412 | " 0 | \n",
413 | " 1 | \n",
414 | " red | \n",
415 | "
\n",
416 | " \n",
417 | "
\n",
418 | "
"
419 | ],
420 | "text/plain": [
421 | " height petals color_blue color_green color_red color\n",
422 | "8 20.0 50 1 0 0 blue\n",
423 | "1 9.0 9 0 1 0 green\n",
424 | "2 4.0 1 0 0 1 red\n",
425 | "9 19.0 47 1 0 0 blue\n",
426 | "0 4.0 3 0 0 1 red\n",
427 | "5 7.0 10 0 1 0 green\n",
428 | "7 7.5 8 0 1 0 green\n",
429 | "6 4.0 2 0 0 1 red"
430 | ]
431 | },
432 | "execution_count": 10,
433 | "metadata": {},
434 | "output_type": "execute_result"
435 | }
436 | ],
437 | "source": [
438 | "pd.get_dummies(X_train).join(X_train['color'])"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": 11,
444 | "metadata": {},
445 | "outputs": [
446 | {
447 | "data": {
448 | "text/html": [
449 | "\n",
450 | "\n",
463 | "
\n",
464 | " \n",
465 | " \n",
466 | " | \n",
467 | " color | \n",
468 | " height | \n",
469 | " petals | \n",
470 | "
\n",
471 | " \n",
472 | " \n",
473 | " \n",
474 | " 4 | \n",
475 | " red | \n",
476 | " 4.0 | \n",
477 | " 1 | \n",
478 | "
\n",
479 | " \n",
480 | " 3 | \n",
481 | " green | \n",
482 | " 8.0 | \n",
483 | " 8 | \n",
484 | "
\n",
485 | " \n",
486 | "
\n",
487 | "
"
488 | ],
489 | "text/plain": [
490 | " color height petals\n",
491 | "4 red 4.0 1\n",
492 | "3 green 8.0 8"
493 | ]
494 | },
495 | "execution_count": 11,
496 | "metadata": {},
497 | "output_type": "execute_result"
498 | }
499 | ],
500 | "source": [
501 | "X_test"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": 12,
507 | "metadata": {},
508 | "outputs": [
509 | {
510 | "data": {
511 | "text/html": [
512 | "\n",
513 | "\n",
526 | "
\n",
527 | " \n",
528 | " \n",
529 | " | \n",
530 | " height | \n",
531 | " petals | \n",
532 | " color_green | \n",
533 | " color_red | \n",
534 | "
\n",
535 | " \n",
536 | " \n",
537 | " \n",
538 | " 4 | \n",
539 | " 4.0 | \n",
540 | " 1 | \n",
541 | " 0 | \n",
542 | " 1 | \n",
543 | "
\n",
544 | " \n",
545 | " 3 | \n",
546 | " 8.0 | \n",
547 | " 8 | \n",
548 | " 1 | \n",
549 | " 0 | \n",
550 | "
\n",
551 | " \n",
552 | "
\n",
553 | "
"
554 | ],
555 | "text/plain": [
556 | " height petals color_green color_red\n",
557 | "4 4.0 1 0 1\n",
558 | "3 8.0 8 1 0"
559 | ]
560 | },
561 | "execution_count": 12,
562 | "metadata": {},
563 | "output_type": "execute_result"
564 | }
565 | ],
566 | "source": [
567 | "pd.get_dummies(X_test)"
568 | ]
569 | }
570 | ],
571 | "metadata": {
572 | "kernelspec": {
573 | "display_name": "Python 3",
574 | "language": "python",
575 | "name": "python3"
576 | },
577 | "language_info": {
578 | "codemirror_mode": {
579 | "name": "ipython",
580 | "version": 3
581 | },
582 | "file_extension": ".py",
583 | "mimetype": "text/x-python",
584 | "name": "python",
585 | "nbconvert_exporter": "python",
586 | "pygments_lexer": "ipython3",
587 | "version": "3.7.3"
588 | }
589 | },
590 | "nbformat": 4,
591 | "nbformat_minor": 2
592 | }
593 |
--------------------------------------------------------------------------------