├── P01-DataCleaning.ipynb
├── P02-DataCleaning.ipynb
├── P03-Correlation.ipynb
├── P04-ANOVA.ipynb
├── P05-SimpleLinearRegression.ipynb
├── P06-MultipleLinearRegression.ipynb
├── P07-PolynomialRegression.ipynb
├── P08-Model Evaluation.ipynb
├── P09-LogisticRegression.ipynb
└── README.md
/P03-Correlation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Practical - Correlation\n",
8 | "This practical session will demonstrate how to calculate and interpret the correlation between variables. We assume that everyone has an adequate understanding of the Python programming language. For those who would like to refresh their Python skills, we recommend our \"Programming for Data Science Series\", where we cover almost all aspects of Python programming in the data science domain.\n",
9 | "Refer to the URL below for the full playlist of almost 10 hours of video lessons in the Burmese language.\n",
10 | "URL : https://www.youtube.com/watch?v=jOZNjVVZIVs&list=PLD_eiqVVLZDi9GZZJDC8Zx4-3Np8LHs52"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/html": [
21 | "
\n",
22 | "\n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " \n",
39 | " Day_No \n",
40 | " Temp \n",
41 | " Cold_Drink \n",
42 | " Hot_Drink \n",
43 | " Snacks \n",
44 | " \n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 \n",
49 | " 1 \n",
50 | " 35 \n",
51 | " 120 \n",
52 | " 175 \n",
53 | " 45 \n",
54 | " \n",
55 | " \n",
56 | " 1 \n",
57 | " 2 \n",
58 | " 35 \n",
59 | " 122 \n",
60 | " 170 \n",
61 | " 50 \n",
62 | " \n",
63 | " \n",
64 | " 2 \n",
65 | " 3 \n",
66 | " 36 \n",
67 | " 125 \n",
68 | " 172 \n",
69 | " 51 \n",
70 | " \n",
71 | " \n",
72 | " 3 \n",
73 | " 4 \n",
74 | " 36 \n",
75 | " 130 \n",
76 | " 170 \n",
77 | " 52 \n",
78 | " \n",
79 | " \n",
80 | " 4 \n",
81 | " 5 \n",
82 | " 35 \n",
83 | " 128 \n",
84 | " 177 \n",
85 | " 45 \n",
86 | " \n",
87 | " \n",
88 | "
\n",
89 | "
"
90 | ],
91 | "text/plain": [
92 | " Day_No Temp Cold_Drink Hot_Drink Snacks\n",
93 | "0 1 35 120 175 45\n",
94 | "1 2 35 122 170 50\n",
95 | "2 3 36 125 172 51\n",
96 | "3 4 36 130 170 52\n",
97 | "4 5 35 128 177 45"
98 | ]
99 | },
100 | "execution_count": 2,
101 | "metadata": {},
102 | "output_type": "execute_result"
103 | }
104 | ],
105 | "source": [
106 | "import pandas as pd\n",
107 | "data = pd.read_csv('https://raw.githubusercontent.com/myanmards/resource_files/master/correlation.csv')\n",
108 | "data.head()"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 3,
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/html": [
119 | "\n",
120 | "\n",
133 | "
\n",
134 | " \n",
135 | " \n",
136 | " \n",
137 | " Day_No \n",
138 | " Temp \n",
139 | " Cold_Drink \n",
140 | " Hot_Drink \n",
141 | " Snacks \n",
142 | " \n",
143 | " \n",
144 | " \n",
145 | " \n",
146 | " count \n",
147 | " 120.000000 \n",
148 | " 120.000000 \n",
149 | " 120.000000 \n",
150 | " 120.000000 \n",
151 | " 120.000000 \n",
152 | " \n",
153 | " \n",
154 | " mean \n",
155 | " 60.500000 \n",
156 | " 38.533333 \n",
157 | " 141.750000 \n",
158 | " 128.766667 \n",
159 | " 50.066667 \n",
160 | " \n",
161 | " \n",
162 | " std \n",
163 | " 34.785054 \n",
164 | " 2.315216 \n",
165 | " 13.111833 \n",
166 | " 19.615948 \n",
167 | " 2.946963 \n",
168 | " \n",
169 | " \n",
170 | " min \n",
171 | " 1.000000 \n",
172 | " 34.000000 \n",
173 | " 115.000000 \n",
174 | " 109.000000 \n",
175 | " 45.000000 \n",
176 | " \n",
177 | " \n",
178 | " 25% \n",
179 | " 30.750000 \n",
180 | " 36.000000 \n",
181 | " 130.000000 \n",
182 | " 114.750000 \n",
183 | " 48.000000 \n",
184 | " \n",
185 | " \n",
186 | " 50% \n",
187 | " 60.500000 \n",
188 | " 39.000000 \n",
189 | " 145.000000 \n",
190 | " 120.000000 \n",
191 | " 51.000000 \n",
192 | " \n",
193 | " \n",
194 | " 75% \n",
195 | " 90.250000 \n",
196 | " 40.000000 \n",
197 | " 151.000000 \n",
198 | " 139.250000 \n",
199 | " 52.000000 \n",
200 | " \n",
201 | " \n",
202 | " max \n",
203 | " 120.000000 \n",
204 | " 42.000000 \n",
205 | " 162.000000 \n",
206 | " 177.000000 \n",
207 | " 56.000000 \n",
208 | " \n",
209 | " \n",
210 | "
\n",
211 | "
"
212 | ],
213 | "text/plain": [
214 | " Day_No Temp Cold_Drink Hot_Drink Snacks\n",
215 | "count 120.000000 120.000000 120.000000 120.000000 120.000000\n",
216 | "mean 60.500000 38.533333 141.750000 128.766667 50.066667\n",
217 | "std 34.785054 2.315216 13.111833 19.615948 2.946963\n",
218 | "min 1.000000 34.000000 115.000000 109.000000 45.000000\n",
219 | "25% 30.750000 36.000000 130.000000 114.750000 48.000000\n",
220 | "50% 60.500000 39.000000 145.000000 120.000000 51.000000\n",
221 | "75% 90.250000 40.000000 151.000000 139.250000 52.000000\n",
222 | "max 120.000000 42.000000 162.000000 177.000000 56.000000"
223 | ]
224 | },
225 | "execution_count": 3,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "data.describe()"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "There are a few methods to calculate correlation statistics using Python:\n",
239 | "* Pearson\n",
240 | "* Kendall\n",
241 | "* Spearman\n\n",
242 | "Note that the calculated values may differ from one method to another, but the meaning behind the data, i.e. whether the variables are correlated and in which direction, is generally the same."
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 4,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/html": [
253 | "\n",
254 | "\n",
267 | "
\n",
268 | " \n",
269 | " \n",
270 | " \n",
271 | " Day_No \n",
272 | " Temp \n",
273 | " Cold_Drink \n",
274 | " Hot_Drink \n",
275 | " Snacks \n",
276 | " \n",
277 | " \n",
278 | " \n",
279 | " \n",
280 | " Day_No \n",
281 | " 1.000000 \n",
282 | " 0.269834 \n",
283 | " 0.378274 \n",
284 | " -0.468887 \n",
285 | " 0.045251 \n",
286 | " \n",
287 | " \n",
288 | " Temp \n",
289 | " 0.269834 \n",
290 | " 1.000000 \n",
291 | " 0.890255 \n",
292 | " -0.734043 \n",
293 | " -0.021266 \n",
294 | " \n",
295 | " \n",
296 | " Cold_Drink \n",
297 | " 0.378274 \n",
298 | " 0.890255 \n",
299 | " 1.000000 \n",
300 | " -0.756659 \n",
301 | " 0.042191 \n",
302 | " \n",
303 | " \n",
304 | " Hot_Drink \n",
305 | " -0.468887 \n",
306 | " -0.734043 \n",
307 | " -0.756659 \n",
308 | " 1.000000 \n",
309 | " -0.100323 \n",
310 | " \n",
311 | " \n",
312 | " Snacks \n",
313 | " 0.045251 \n",
314 | " -0.021266 \n",
315 | " 0.042191 \n",
316 | " -0.100323 \n",
317 | " 1.000000 \n",
318 | " \n",
319 | " \n",
320 | "
\n",
321 | "
"
322 | ],
323 | "text/plain": [
324 | " Day_No Temp Cold_Drink Hot_Drink Snacks\n",
325 | "Day_No 1.000000 0.269834 0.378274 -0.468887 0.045251\n",
326 | "Temp 0.269834 1.000000 0.890255 -0.734043 -0.021266\n",
327 | "Cold_Drink 0.378274 0.890255 1.000000 -0.756659 0.042191\n",
328 | "Hot_Drink -0.468887 -0.734043 -0.756659 1.000000 -0.100323\n",
329 | "Snacks 0.045251 -0.021266 0.042191 -0.100323 1.000000"
330 | ]
331 | },
332 | "execution_count": 4,
333 | "metadata": {},
334 | "output_type": "execute_result"
335 | }
336 | ],
337 | "source": [
338 | "data.corr(method='pearson', min_periods=1)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 5,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "data": {
348 | "text/html": [
349 | "\n",
350 | "\n",
363 | "
\n",
364 | " \n",
365 | " \n",
366 | " \n",
367 | " Day_No \n",
368 | " Temp \n",
369 | " Cold_Drink \n",
370 | " Hot_Drink \n",
371 | " Snacks \n",
372 | " \n",
373 | " \n",
374 | " \n",
375 | " \n",
376 | " Day_No \n",
377 | " 1.000000 \n",
378 | " 0.189172 \n",
379 | " 0.275782 \n",
380 | " -0.341609 \n",
381 | " 0.009087 \n",
382 | " \n",
383 | " \n",
384 | " Temp \n",
385 | " 0.189172 \n",
386 | " 1.000000 \n",
387 | " 0.736854 \n",
388 | " -0.551354 \n",
389 | " -0.033551 \n",
390 | " \n",
391 | " \n",
392 | " Cold_Drink \n",
393 | " 0.275782 \n",
394 | " 0.736854 \n",
395 | " 1.000000 \n",
396 | " -0.511530 \n",
397 | " 0.025593 \n",
398 | " \n",
399 | " \n",
400 | " Hot_Drink \n",
401 | " -0.341609 \n",
402 | " -0.551354 \n",
403 | " -0.511530 \n",
404 | " 1.000000 \n",
405 | " -0.036256 \n",
406 | " \n",
407 | " \n",
408 | " Snacks \n",
409 | " 0.009087 \n",
410 | " -0.033551 \n",
411 | " 0.025593 \n",
412 | " -0.036256 \n",
413 | " 1.000000 \n",
414 | " \n",
415 | " \n",
416 | "
\n",
417 | "
"
418 | ],
419 | "text/plain": [
420 | " Day_No Temp Cold_Drink Hot_Drink Snacks\n",
421 | "Day_No 1.000000 0.189172 0.275782 -0.341609 0.009087\n",
422 | "Temp 0.189172 1.000000 0.736854 -0.551354 -0.033551\n",
423 | "Cold_Drink 0.275782 0.736854 1.000000 -0.511530 0.025593\n",
424 | "Hot_Drink -0.341609 -0.551354 -0.511530 1.000000 -0.036256\n",
425 | "Snacks 0.009087 -0.033551 0.025593 -0.036256 1.000000"
426 | ]
427 | },
428 | "execution_count": 5,
429 | "metadata": {},
430 | "output_type": "execute_result"
431 | }
432 | ],
433 | "source": [
434 | "data.corr(method='kendall', min_periods=1)"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 6,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "data": {
444 | "text/html": [
445 | "\n",
446 | "\n",
459 | "
\n",
460 | " \n",
461 | " \n",
462 | " \n",
463 | " Day_No \n",
464 | " Temp \n",
465 | " Cold_Drink \n",
466 | " Hot_Drink \n",
467 | " Snacks \n",
468 | " \n",
469 | " \n",
470 | " \n",
471 | " \n",
472 | " Day_No \n",
473 | " 1.000000 \n",
474 | " 0.264297 \n",
475 | " 0.380530 \n",
476 | " -0.457385 \n",
477 | " 0.009190 \n",
478 | " \n",
479 | " \n",
480 | " Temp \n",
481 | " 0.264297 \n",
482 | " 1.000000 \n",
483 | " 0.870657 \n",
484 | " -0.690304 \n",
485 | " -0.043479 \n",
486 | " \n",
487 | " \n",
488 | " Cold_Drink \n",
489 | " 0.380530 \n",
490 | " 0.870657 \n",
491 | " 1.000000 \n",
492 | " -0.686986 \n",
493 | " 0.029932 \n",
494 | " \n",
495 | " \n",
496 | " Hot_Drink \n",
497 | " -0.457385 \n",
498 | " -0.690304 \n",
499 | " -0.686986 \n",
500 | " 1.000000 \n",
501 | " -0.045372 \n",
502 | " \n",
503 | " \n",
504 | " Snacks \n",
505 | " 0.009190 \n",
506 | " -0.043479 \n",
507 | " 0.029932 \n",
508 | " -0.045372 \n",
509 | " 1.000000 \n",
510 | " \n",
511 | " \n",
512 | "
\n",
513 | "
"
514 | ],
515 | "text/plain": [
516 | " Day_No Temp Cold_Drink Hot_Drink Snacks\n",
517 | "Day_No 1.000000 0.264297 0.380530 -0.457385 0.009190\n",
518 | "Temp 0.264297 1.000000 0.870657 -0.690304 -0.043479\n",
519 | "Cold_Drink 0.380530 0.870657 1.000000 -0.686986 0.029932\n",
520 | "Hot_Drink -0.457385 -0.690304 -0.686986 1.000000 -0.045372\n",
521 | "Snacks 0.009190 -0.043479 0.029932 -0.045372 1.000000"
522 | ]
523 | },
524 | "execution_count": 6,
525 | "metadata": {},
526 | "output_type": "execute_result"
527 | }
528 | ],
529 | "source": [
530 | "data.corr(method='spearman', min_periods=1)"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "Note: We can also calculate the correlation between just two variables, as shown below."
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 7,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "data": {
547 | "text/html": [
548 | "\n",
549 | "\n",
562 | "
\n",
563 | " \n",
564 | " \n",
565 | " \n",
566 | " Temp \n",
567 | " Cold_Drink \n",
568 | " \n",
569 | " \n",
570 | " \n",
571 | " \n",
572 | " Temp \n",
573 | " 1.000000 \n",
574 | " 0.890255 \n",
575 | " \n",
576 | " \n",
577 | " Cold_Drink \n",
578 | " 0.890255 \n",
579 | " 1.000000 \n",
580 | " \n",
581 | " \n",
582 | "
\n",
583 | "
"
584 | ],
585 | "text/plain": [
586 | " Temp Cold_Drink\n",
587 | "Temp 1.000000 0.890255\n",
588 | "Cold_Drink 0.890255 1.000000"
589 | ]
590 | },
591 | "execution_count": 7,
592 | "metadata": {},
593 | "output_type": "execute_result"
594 | }
595 | ],
596 | "source": [
597 | "data[['Temp', 'Cold_Drink']].corr()"
598 | ]
599 | },
600 | {
601 | "cell_type": "markdown",
602 | "metadata": {},
603 | "source": [
604 | "Since the correlation value between Temperature and Cold_Drink is positive and close to one, we can conclude that these two variables have a strong positive correlation."
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 8,
610 | "metadata": {},
611 | "outputs": [
612 | {
613 | "data": {
614 | "text/html": [
615 | "\n",
616 | "\n",
629 | "
\n",
630 | " \n",
631 | " \n",
632 | " \n",
633 | " Temp \n",
634 | " Hot_Drink \n",
635 | " \n",
636 | " \n",
637 | " \n",
638 | " \n",
639 | " Temp \n",
640 | " 1.000000 \n",
641 | " -0.734043 \n",
642 | " \n",
643 | " \n",
644 | " Hot_Drink \n",
645 | " -0.734043 \n",
646 | " 1.000000 \n",
647 | " \n",
648 | " \n",
649 | "
\n",
650 | "
"
651 | ],
652 | "text/plain": [
653 | " Temp Hot_Drink\n",
654 | "Temp 1.000000 -0.734043\n",
655 | "Hot_Drink -0.734043 1.000000"
656 | ]
657 | },
658 | "execution_count": 8,
659 | "metadata": {},
660 | "output_type": "execute_result"
661 | }
662 | ],
663 | "source": [
664 | "data[['Temp', 'Hot_Drink']].corr()"
665 | ]
666 | },
667 | {
668 | "cell_type": "markdown",
669 | "metadata": {},
670 | "source": [
671 | "Since the correlation value between Temperature and Hot_Drink is negative and fairly close to minus one, we can conclude that these two variables have a negative correlation."
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": 9,
677 | "metadata": {},
678 | "outputs": [
679 | {
680 | "data": {
681 | "text/html": [
682 | "\n",
683 | "\n",
696 | "
\n",
697 | " \n",
698 | " \n",
699 | " \n",
700 | " Temp \n",
701 | " Snacks \n",
702 | " \n",
703 | " \n",
704 | " \n",
705 | " \n",
706 | " Temp \n",
707 | " 1.000000 \n",
708 | " -0.021266 \n",
709 | " \n",
710 | " \n",
711 | " Snacks \n",
712 | " -0.021266 \n",
713 | " 1.000000 \n",
714 | " \n",
715 | " \n",
716 | "
\n",
717 | "
"
718 | ],
719 | "text/plain": [
720 | " Temp Snacks\n",
721 | "Temp 1.000000 -0.021266\n",
722 | "Snacks -0.021266 1.000000"
723 | ]
724 | },
725 | "execution_count": 9,
726 | "metadata": {},
727 | "output_type": "execute_result"
728 | }
729 | ],
730 | "source": [
731 | "data[['Temp', 'Snacks']].corr()"
732 | ]
733 | },
734 | {
735 | "cell_type": "markdown",
736 | "metadata": {},
737 | "source": [
738 | "Since the correlation value between Temperature and Snacks is close to zero, i.e. far from both one and minus one, we can conclude that these two variables have no meaningful linear correlation."
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": 10,
744 | "metadata": {},
745 | "outputs": [
746 | {
747 | "data": {
748 | "text/html": [
749 | " \n",
801 | " \n",
802 | " \n",
803 | " \n",
804 | " Day_No \n",
805 | " Temp \n",
806 | " Cold_Drink \n",
807 | " Hot_Drink \n",
808 | " Snacks \n",
809 | " \n",
810 | " \n",
811 | " Day_No \n",
812 | " 1 \n",
813 | " 0.269834 \n",
814 | " 0.378274 \n",
815 | " -0.468887 \n",
816 | " 0.0452506 \n",
817 | " \n",
818 | " Temp \n",
819 | " 0.269834 \n",
820 | " 1 \n",
821 | " 0.890255 \n",
822 | " -0.734043 \n",
823 | " -0.0212665 \n",
824 | " \n",
825 | " Cold_Drink \n",
826 | " 0.378274 \n",
827 | " 0.890255 \n",
828 | " 1 \n",
829 | " -0.756659 \n",
830 | " 0.0421907 \n",
831 | " \n",
832 | " Hot_Drink \n",
833 | " -0.468887 \n",
834 | " -0.734043 \n",
835 | " -0.756659 \n",
836 | " 1 \n",
837 | " -0.100323 \n",
838 | " \n",
839 | " Snacks \n",
840 | " 0.0452506 \n",
841 | " -0.0212665 \n",
842 | " 0.0421907 \n",
843 | " -0.100323 \n",
844 | " 1 \n",
845 | " \n",
846 | "
"
847 | ],
848 | "text/plain": [
849 | ""
850 | ]
851 | },
852 | "execution_count": 10,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 | "import matplotlib.pyplot as plt\n",
859 | "%matplotlib inline\n",
860 | "\n",
861 | "data.corr().style.background_gradient(cmap='coolwarm')"
862 | ]
863 | },
864 | {
865 | "cell_type": "code",
866 | "execution_count": 11,
867 | "metadata": {},
868 | "outputs": [
869 | {
870 | "data": {
871 | "text/html": [
872 | " \n",
924 | " \n",
925 | " \n",
926 | " \n",
927 | " Day_No \n",
928 | " Temp \n",
929 | " Cold_Drink \n",
930 | " Hot_Drink \n",
931 | " Snacks \n",
932 | " \n",
933 | " \n",
934 | " Day_No \n",
935 | " 1 \n",
936 | " 0.27 \n",
937 | " 0.38 \n",
938 | " -0.47 \n",
939 | " 0.045 \n",
940 | " \n",
941 | " Temp \n",
942 | " 0.27 \n",
943 | " 1 \n",
944 | " 0.89 \n",
945 | " -0.73 \n",
946 | " -0.021 \n",
947 | " \n",
948 | " Cold_Drink \n",
949 | " 0.38 \n",
950 | " 0.89 \n",
951 | " 1 \n",
952 | " -0.76 \n",
953 | " 0.042 \n",
954 | " \n",
955 | " Hot_Drink \n",
956 | " -0.47 \n",
957 | " -0.73 \n",
958 | " -0.76 \n",
959 | " 1 \n",
960 | " -0.1 \n",
961 | " \n",
962 | " Snacks \n",
963 | " 0.045 \n",
964 | " -0.021 \n",
965 | " 0.042 \n",
966 | " -0.1 \n",
967 | " 1 \n",
968 | " \n",
969 | "
"
970 | ],
971 | "text/plain": [
972 | ""
973 | ]
974 | },
975 | "execution_count": 11,
976 | "metadata": {},
977 | "output_type": "execute_result"
978 | }
979 | ],
980 | "source": [
981 | "data.corr().style.background_gradient(cmap='coolwarm').set_precision(2)"
982 | ]
983 | },
984 | {
985 | "cell_type": "markdown",
986 | "metadata": {},
987 | "source": [
988 | "Note: Scatter plots are a very common visualization technique for examining the correlation between two variables."
989 | ]
990 | },
991 | {
992 | "cell_type": "code",
993 | "execution_count": 12,
994 | "metadata": {},
995 | "outputs": [
996 | {
997 | "data": {
998 | "image/png": "\n",
999 | "text/plain": [
1000 | ""
1001 | ]
1002 | },
1003 | "metadata": {},
1004 | "output_type": "display_data"
1005 | }
1006 | ],
1007 | "source": [
1008 | "plt.scatter(data['Temp'], data['Cold_Drink'])\n",
1009 | "plt.show()"
1010 | ]
1011 | },
1012 | {
1013 | "cell_type": "code",
1014 | "execution_count": 13,
1015 | "metadata": {},
1016 | "outputs": [
1017 | {
1018 | "data": {
1019 | "image/png": "\n",
1020 | "text/plain": [
1021 | ""
1022 | ]
1023 | },
1024 | "metadata": {},
1025 | "output_type": "display_data"
1026 | }
1027 | ],
1028 | "source": [
1029 | "plt.scatter(data['Temp'], data['Hot_Drink'])\n",
1030 | "plt.show()"
1031 | ]
1032 | },
1033 | {
1034 | "cell_type": "code",
1035 | "execution_count": 14,
1036 | "metadata": {},
1037 | "outputs": [
1038 | {
1039 | "data": {
1040 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAFkxJREFUeJzt3XuMlfWdx/H3xynSWVdlKYOtIEuLt02Ll2aqWNKLdhFrXWXddisLXXsRUttuG02xJZIYG4kmbKxJk20DXVt3wVtWmTZqVVLWbWsAOxQVN967VIVuGdcSL8UL43f/OM8gDjOc8xzmnOfHbz6vxDDnN2c4nzy/w8czzzxzvooIzMzswHdQ1QHMzGxkuNDNzDLhQjczy4QL3cwsEy50M7NMuNDNzDLhQjczy4QL3cwsEy50M7NMvKOdDzZhwoSYOnVqOx/SzOyAt3Hjxucjoqve/dpa6FOnTqW3t7edD2lmdsCT9LtG7udTLmZmmXChm5llwoVuZpYJF7qZWSZc6GZmmWjoKhdJW4CXgH5gV0R0F+v/BHwN2AXcGRGXtSinNWhJz2Zu2vAs/RF0SMw99SiumjO96lj0bNrKsnseZ9uOnRw5rpNFs49jzsmTqo6VbK4TrribF1/r3337sLEdPHzlWRUmqpm3Yh33P/3C7tszp41n1YLTKkxkeyrzCv30iDhpjzI/HTgPOCEi3g/8cysCWuOW9Gxm5fpn6C+mUPVHsHL9Myzp2Vxprp5NW1l8+2a27thJAFt37GTx7Zvp2bTVuYYwuMwBXnytnxOuuLuiRDWDyxzg/qdfYN6KdRUlssH255TLxcA1EfEaQERsH5lI1qybNjxbar1dlt3zODvfeHtB7Xyjn2X3PF5RoppUcw0u83rr7TK4zOutW/s1WugB3Ctpo6SFxdqxwEckbZD0X5I+NNQXSlooqVdSb19f30hktmH0DzMfdrj1dtm2Y2ep9XZJNZdZsxot9JkR8UHgk8BXJX2U2vn3vwBmAIuAWyVp8BdGxPKI6I6I7q6uur+5avuhY+/Dv8/1djlyXGep9XZJNZdZsxoq9IjYVvy5HVgNnAI8B9weNQ8AbwITWhXU6pt76lGl1ttl0ezj6BzT8ba1zjEdLJp9XEWJalLNddjYjlLr7TJz2vhS69Z+dQtd0iGSDh34GDgTeAToAc4o1o8FDgaeb11Uq+eqOdOZP2PK7lfkHRLzZ0yp/CqXOSdP4urzpzNpXCcCJo3r5Orzp1d+NUmquR6+8qy9yjuFq1xWLThtr/L2VS5pUdQ5vyrpfdRelUPtNMuNEbFU0sHA9cBJwOvANyNi7b7+ru7u7vCbc5mZlSNp48AVhvtS9zr0iPgtcOIQ668D85uLZ2ZmI82/KWpmlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZaKjQJW2RtFnSg5J6B33um5JC0oTWRDQzs0a8o8R9T4+I5/dckHQUMAt4ZkRTmZlZaft7yuW7wGVAjEAWMzPbD40WegD3StooaSGApHOBrRHx0L6+UNJCSb2Sevv6+vYzrpmZDafRUy4zI2KbpInAGkmPAZcDZ9b7wohYDiwH6O7u9it5M7MWaegVekRsK/7cDqwGPga8F3hI0hZgMvAbSe9uUU4zM6ujbqFLOkTSoQMfU3tV/uuImBgRUyNiKvAc8MGI+N+WpjUzs2E1csrlCGC1pIH73xgRd7c0lZmZlVa30CPit8CJde4zdaQCmZlZc/ybomZmmXChm5llwoVuZpYJF7qZWSZc6GZmmXChm5llwoVuZpYJF7qZWSZc6GZmmSgz4MIOAD2
btrLsnsfZtmMnR47rZNHs45hz8qSqY1lJ3kdrhgs9Iz2btrL49s3sfKMfgK07drL49s0ALoMDiPfRmuVTLhlZds/ju0tgwM43+ll2z+MVJbJmeB+tWS70jGzbsbPUuqXJ+2jNcqFn5MhxnaXWLU3eR2uWCz0ji2YfR+eYjretdY7pYNHs4ypKZM3wPlqz/EPRjAz8wMxXRxzYvI/WLEW0b25zd3d39Pb2tu3xzMxyIGljRHTXu59PuZiZZcKFbmaWCRe6mVkmXOhmZplwoZuZZcKFbmaWCRe6mVkmXOhmZplwoZuZZcKFbmaWiYbey0XSFuAloB/YFRHdkpYBfwO8DjwNfCEidrQqaGpOXbqGP7z0+u7bRxx6MBsun1VhopolPZu5acOz9EfQITH31KO4as70qmMlmyvVyUDzVqzj/qdf2H175rTxrFpwWoWJaryP5bT7eJV5hX56RJy0x/sJrAE+EBEnAE8Ai0c8XaIGlznAH156nVOXrqkoUc2Sns2sXP8M/cX78/RHsHL9Myzp2excQxiYDLR1x06CtyYD9WzaWmmuwWUOcP/TLzBvxbqKEtV4H8up4ng1fcolIu6NiF3FzfXA5JGJlL7BZV5vvV1u2vBsqfV2STVXqpOBBpd5vfV28T6WU8XxarTQA7hX0kZJC4f4/BeBnw31hZIWSuqV1NvX19dsTmtA/zDvnDncerukmsuTgcrxPpZTxfFqtNBnRsQHgU8CX5X00YFPSLoc2AWsGuoLI2J5RHRHRHdXV9d+B7bhdUil1tsl1VyeDFSO97GcKo5XQ4UeEduKP7cDq4FTACRdCJwDzIt2vrF6xY449OBS6+0y99SjSq23S6q5Up0MNHPa+FLr7eJ9LKeK41W30CUdIunQgY+BM4FHJJ0FfAs4NyL+1LKECdpw+ay9yjuFq1yumjOd+TOm7H4F0CExf8aUyq9CSDXXnJMncfX505k0rhMBk8Z1cvX50yu/OmLVgtP2Ku8UrnLxPpZTxfGqO7FI0vuovSqH2mWON0bEUklPAWOB/ys+tz4ivryvv8sTi8zMymt0YlHd69Aj4rfAiUOsH91kNjMzawH/pqiZWSZc6GZmmXChm5llwoVuZpYJF7qZWSZc6GZmmXChm5llwoVuZpYJF7qZWSYamlhke0t1Qsrxl9/Fq/1vvZ3DOzvEY0vPrjBRzaxr7+PJ7a/svn3MxENYc+nHqwuUOO9jOalOUjrhirt58bW33qv9sLEdPHzlWS17PL9Cb0KqE1IGlwDAq/3B8ZffVVGimsElAPDk9leYde191QRKnPexnFQnKQ0uc4AXX+vnhCvubtljutCbkOqElMElUG+9XQaXQL310c77WE6qk5QGl3m99ZHgQm9CqhNSzEajVCcpVcGF3oRUJ6SYjUapTlKqggu9CalOSHlnx9BP4OHW2+WYiYeUWh/tvI/lpDpJ6bCxHaXWR4ILvQmpTkh5bOnZe/2jT+HqiDWXfnyvf/SpXB2RIu9jOalOUnr4yrP2Ku9WX+VSd2LRSPLEIjOz8hqdWORX6GZmmXChm5llwoVuZpYJF7qZWSZc6GZmmXChm5llwoVuZpYJF7qZWSZc6GZmmWhowIWkLcBLQD+wKyK6JY0HbgGmAluAv4+IP7YmppmZ1VPmFfrpEXHSHr9++m3g5xFxDPDz4vao0bNpKzOvWct7v30nM69ZW/lwCzOz/Tnlch5wQ/HxDcCc/Y9zYEh1YpGZjW6NFnoA90raKGlhsXZERPweoPhzYisCpijViUVmNro1OiR6ZkRskzQRWCPpsUYfoPgfwEKAKVOmNBExPZ5YZGYpaugVekRsK/7cDqwGTgH+IOk9AMWf24f52uUR0R0R3V1dXSOTumKeWGRmKapb6JIOkXTowMfAmcAjwE+BC4u7XQj8pFUhU5PqxCIzG90aOeVyBLBatWkg7wBujIi7Jf0auFXSl4BngM+0LmZaBiYTLbvncbbt2MmR4zpZNPu4yicWmdno5ol
FZmaJ88QiM7NRxoVuZpYJF7qZWSZc6GZmmXChm5llwoVuZpYJF7qZWSZc6GZmmXChm5llwoVuZpaJRt8+1w4QS3o2c9OGZ+mPoENi7qlHcdWc6VXHomfTVr/3TQmpHi8/v8qZt2Id9z/9wu7bM6eNZ9WC01r2eH6FnpElPZtZuf4Z+ov35+mPYOX6Z1jSs7nSXJ7wVE6qx8vPr3IGlznA/U+/wLwV61r2mC70jNy04dlS6+3iCU/lpHq8/PwqZ3CZ11sfCS70jPQP886Zw623iyc8lZPq8fLzK30u9Ix01N6zvuH1dvGEp3JSPV5+fqXPhZ6RuaceVWq9XTzhqZxUj5efX+XMnDa+1PpIcKFn5Ko505k/Y8ruV0wdEvNnTKn8KoQ5J0/i6vOnM2lcJwImjevk6vOnJ3EVQopSPV5+fpWzasFpe5V3q69y8cQiM7PEeWKRmdko40I3M8uEC93MLBMudDOzTLjQzcwy4UI3M8uEC93MLBMudDOzTLjQzcwy0XChS+qQtEnSHcXtT0j6jaQHJf1K0tGti2lmZvWUmVj0DeBR4LDi9veB8yLiUUlfAZYAnx/ZeOlKdUJKqhNl2j25pVGp7uOsa+/jye2v7L59zMRDWHPpx6sLVEh1H1N93rc7V0Ov0CVNBj4F/HCP5eCtcj8c2Day0dKV6oSUVCfKVDG5pRGp7uPgMgd4cvsrzLr2vmoCFVLdx1Sf91XkavSUy3XAZcCbe6xdBNwl6Tngc8A1I5wtWalOSEl1okwVk1sakeo+Di7zeuvtkuo+pvq8ryJX3UKXdA6wPSI2DvrUJcDZETEZ+BFw7TBfv1BSr6Tevr6+/Q6cglQnpKQ6USZVqe6jlZPq876KXI28Qp8JnCtpC3AzcIakO4ETI2JDcZ9bgA8P9cURsTwiuiOiu6urayQyVy7VCSmpTpRJVar7aOWk+ryvIlfdQo+IxRExOSKmAhcAa4HzgMMlHVvcbRa1H5iOCqlOSEl1okwVk1sakeo+HjPxkFLr7ZLqPqb6vK8iV1PXoUfELmABcJukh6idQ180ksFSluqElFQnylQxuaURqe7jmks/vld5p3CVS6r7mOrzvopcnlhkZpY4TywyMxtlXOhmZplwoZuZZcKFbmaWCRe6mVkmXOhmZplwoZuZZcKFbmaWCRe6mVkmXOhmZpkoM7HI9uAJKeWkOhko1Vyp7mOquVLdx3ZPnvIr9CZ4Qko5qU4GSjVXqvuYaq5U97GKyVMu9CZ4Qko5qU4GSjVXqvuYaq5U97GKyVMu9CZ4Qko5qU4GSjVXqvuYaq5U97EKLvQmeEJKOalOBko1V6r7mGquVPexCi70JnhCSjmpTgZKNVeq+5hqrlT3sYrJUy70JnhCSjmpTgZKNVeq+5hqrlT3sYrJU55YZGaWOE8sMjMbZVzoZmaZcKGbmWXChW5mlgkXuplZJlzoZmaZcKGbmWXChW5mlgkXuplZJhoudEkdkjZJuqO4LUlLJT0h6VFJX29dTDMzq6fMxKJvAI8ChxW3Pw8cBRwfEW9KmjjC2XZLcRpJiplSlurx8gSePKR6vNqdq6FClzQZ+BSwFLi0WL4Y+IeIeBMgIra3IuDANJKBN7AfmEYCVLZhKWZKWarHa2ACz4CBCTxApaWe6vFKVarHq4pcjZ5yuQ64DHhzj7VpwGcl9Ur6maRjRjwdaU4jSTFTylI9Xp7Ak4dUj1cVueoWuqRzgO0RsXHQp8YCrxbvALYCuH6Yr19YlH5vX19f6YApTiNJMVPKUj1ensCTh1SPVxW5GnmFPhM4V9IW4GbgDEkrgeeA24r7rAZOGOqLI2J5RHRHRHdXV1fpgClOI0kxU8pSPV6ewJOHVI9XFbnqFnpELI6IyRExFbgAWBsR84Ee4Izibh8DnmhFwBSnkaSYKWWpHi9P4MlDqserilxlrnIZ7BpglaRLgJeBi0Ym0tsN/PAgpZ9gp5gpZaker4EffKZ2lUuqxytVqR6vKnJ5YpGZWeI
8scjMbJRxoZuZZcKFbmaWCRe6mVkmXOhmZplwoZuZZcKFbmaWCRe6mVkmXOhmZplwoZuZZWJ/3stlVEt10o3lwRN4rBku9CakOunG8uAJPNYsn3JpQqqTbiwPnsBjzXKhNyHVSTeWB0/gsWa50JuQ6qQby4Mn8FizXOhNSHXSjeXBE3isWf6haBNSnXRjefAEHmuWJxaZmSXOE4vMzEYZF7qZWSZc6GZmmXChm5llwoVuZpaJtl7lIqkP+N1+/BUTgOdHKM5ISTETOFdZzlWOc5Wzv7n+MiK66t2prYW+vyT1NnLpTjulmAmcqyznKse5ymlXLp9yMTPLhAvdzCwTB1qhL686wBBSzATOVZZzleNc5bQl1wF1Dt3MzIZ3oL1CNzOzYSRZ6JLeKekBSQ9J+m9JVw76/PckvZxKLkk/lvQ/kh4s/jspkVyStFTSE5IelfT1RHL9co9jtU1STyK5PiHpN0WuX0k6OpFcZxS5HpF0g6S2v0uqpA5JmyTdUdx+r6QNkp6UdIukg9udaZhcX5P0lKSQNKGKTMPkWiXp8WIPr5c0piUPHBHJ/QcI+PPi4zHABmBGcbsb+Hfg5VRyAT8GPp3a8QK+APwbcFDxuYkp5Bp0n9uAf0whF/AE8FfF+leAHyeQ68PAs8Cxxfp3gC9V8By7FLgRuKO4fStwQfHxD4CL251pmFwnA1OBLcCEKjINk+vsYn8F3NSq45XkK/SoGXgFPqb4LyR1AMuAy1LKVUWWPe0j18XAdyLizeJ+2xPJBYCkQ4EzgLa+Qt9HrgAOK9YPB7YlkKsfeC0inijW1wB/185ckiYDnwJ+WNwWtX37j+IuNwBz2plpqFwAEbEpIra0O8uehsl1V7G/ATwATG7FYydZ6LD7W5YHge3AmojYAHwN+GlE/D6xXABLJT0s6buSxiaSaxrwWUm9kn4m6ZhEcg34W+DnEfFiIrkuAu6S9BzwOeCaqnNR+8c/RtLAL6V8Gmj3aKzrqL2IerO4/S5gR0TsKm4/B1Qx5WJwrlQMm6s41fI54O5WPHCyhR4R/RFxErX/k50i6aPAZ4DvJZbrA8Bi4HjgQ8B44FuJ5BoLvBq131BbAVyfSK4Bc6l9+9l2w+S6BDg7IiYDPwKurToX8H7gAuC7kh4AXgJ27eOvGFGSzgG2R8TGPZeHuGtbv1MdJlflGsj1L8AvIuKXrXj8ZAt9QETsAO4DTgeOBp6StAX4M0lPJZDrrIj4ffHd1GvUiuCUFHJRe+V0W/Gp1cAJFcUanAtJ76J2nO6sKhO8LdcngRP3+A7iFmrnr6vOdVZErIuIj0TEKcAvgCfbGGUmcG7xb+5maqdargPG7fHD2cm0+fTUULkkrWxzhqEMm0vSFUAXtfPrLZFkoUvqkjSu+LgT+GtgY0S8OyKmRsRU4E8R0e6rEIbK9Zik9xRronYu8ZEUclE7N31GcbePUfuhXwq5oPbd1h0R8Wo7M+0j16PA4ZKOLe42q1irOtdjkiYWa2Opfff3g3ZliojFETG5+Dd3AbA2IuYB/0nt9A/AhcBP2pVpH7nmtzPDUIbLJekiYDYwd+BnWq2Q6pDo9wA3FD8EPQi4NSLuqDgTDJNL0lpJXdS+FX0Q+HIiuX4FrJJ0CfAytXPElecqPncBFZyj3lcuSQuA2yS9CfwR+GIiuZYV38ofBHw/Ita2OddQvgXcLOkqYBPwrxXnAUC1S3MvA94NPCzproho9/N+KD+g9k6z62qv+7g9Ir4z0g/i3xQ1M8tEkqdczMysPBe6mVkmXOhmZplwoZuZZcKFbmaWCRe6mVkmXOhmZplwoZuZZeL/AZPXmi4jaUzYAAAAAElFTkSuQmCC\n",
1041 | "text/plain": [
1042 | ""
1043 | ]
1044 | },
1045 | "metadata": {},
1046 | "output_type": "display_data"
1047 | }
1048 | ],
1049 | "source": [
1050 | "plt.scatter(data['Temp'], data['Snacks'])\n",
1051 | "plt.show()"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": null,
1057 | "metadata": {},
1058 | "outputs": [],
1059 | "source": []
1060 | }
1061 | ],
1062 | "metadata": {
1063 | "kernelspec": {
1064 | "display_name": "Python 3",
1065 | "language": "python",
1066 | "name": "python3"
1067 | },
1068 | "language_info": {
1069 | "codemirror_mode": {
1070 | "name": "ipython",
1071 | "version": 3
1072 | },
1073 | "file_extension": ".py",
1074 | "mimetype": "text/x-python",
1075 | "name": "python",
1076 | "nbconvert_exporter": "python",
1077 | "pygments_lexer": "ipython3",
1078 | "version": "3.6.4"
1079 | }
1080 | },
1081 | "nbformat": 4,
1082 | "nbformat_minor": 2
1083 | }
1084 |
--------------------------------------------------------------------------------
/P04-ANOVA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Practical - ANOVA\n",
8 | "This practical session will demonstrate how to perform an analysis of variance (ANOVA). We assume that everyone has an adequate understanding of the Python programming language. For those who would like to refresh their Python skills, we recommend our \"Programming for Data Science Series\", where we cover almost all aspects of Python programming in the data science domain.\n",
9 | "Refer to the URL below for the full playlist of almost 10 hours of video lessons in the Burmese language.\n",
10 | "URL : https://www.youtube.com/watch?v=jOZNjVVZIVs&list=PLD_eiqVVLZDi9GZZJDC8Zx4-3Np8LHs52"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/html": [
21 | "\n",
22 | "\n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " \n",
39 | " emp_id \n",
40 | " first_name \n",
41 | " last_name \n",
42 | " gender \n",
43 | " salary \n",
44 | " work_exp \n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " \n",
49 | " 0 \n",
50 | " 1 \n",
51 | " Georgi \n",
52 | " Facello \n",
53 | " M \n",
54 | " 500000 \n",
55 | " Mid \n",
56 | " \n",
57 | " \n",
58 | " 1 \n",
59 | " 2 \n",
60 | " Bezalel \n",
61 | " Simmel \n",
62 | " F \n",
63 | " 120000 \n",
64 | " Junior \n",
65 | " \n",
66 | " \n",
67 | " 2 \n",
68 | " 3 \n",
69 | " Parto \n",
70 | " Bamford \n",
71 | " M \n",
72 | " 350000 \n",
73 | " Junior \n",
74 | " \n",
75 | " \n",
76 | " 3 \n",
77 | " 4 \n",
78 | " Chirstian \n",
79 | " Koblick \n",
80 | " M \n",
81 | " 400000 \n",
82 | " Mid \n",
83 | " \n",
84 | " \n",
85 | " 4 \n",
86 | " 5 \n",
87 | " Kyoichi \n",
88 | " Maliniak \n",
89 | " M \n",
90 | " 200000 \n",
91 | " Junior \n",
92 | " \n",
93 | " \n",
94 | "
\n",
95 | "
"
96 | ],
97 | "text/plain": [
98 | " emp_id first_name last_name gender salary work_exp\n",
99 | "0 1 Georgi Facello M 500000 Mid\n",
100 | "1 2 Bezalel Simmel F 120000 Junior\n",
101 | "2 3 Parto Bamford M 350000 Junior\n",
102 | "3 4 Chirstian Koblick M 400000 Mid\n",
103 | "4 5 Kyoichi Maliniak M 200000 Junior"
104 | ]
105 | },
106 | "execution_count": 1,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "import pandas as pd\n",
113 | "data = pd.read_csv('https://raw.githubusercontent.com/myanmards/resource_files/master/sample_anova.csv')\n",
114 | "data.head()"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 2,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/html": [
125 | "\n",
126 | "\n",
139 | "
\n",
140 | " \n",
141 | " \n",
142 | " \n",
143 | " emp_id \n",
144 | " first_name \n",
145 | " last_name \n",
146 | " gender \n",
147 | " salary \n",
148 | " work_exp \n",
149 | " \n",
150 | " \n",
151 | " \n",
152 | " \n",
153 | " 0 \n",
154 | " 1 \n",
155 | " Georgi \n",
156 | " Facello \n",
157 | " M \n",
158 | " 500000 \n",
159 | " Mid \n",
160 | " \n",
161 | " \n",
162 | " 1 \n",
163 | " 2 \n",
164 | " Bezalel \n",
165 | " Simmel \n",
166 | " F \n",
167 | " 120000 \n",
168 | " Junior \n",
169 | " \n",
170 | " \n",
171 | " 2 \n",
172 | " 3 \n",
173 | " Parto \n",
174 | " Bamford \n",
175 | " M \n",
176 | " 350000 \n",
177 | " Junior \n",
178 | " \n",
179 | " \n",
180 | " 3 \n",
181 | " 4 \n",
182 | " Chirstian \n",
183 | " Koblick \n",
184 | " M \n",
185 | " 400000 \n",
186 | " Mid \n",
187 | " \n",
188 | " \n",
189 | " 4 \n",
190 | " 5 \n",
191 | " Kyoichi \n",
192 | " Maliniak \n",
193 | " M \n",
194 | " 200000 \n",
195 | " Junior \n",
196 | " \n",
197 | " \n",
198 | " 5 \n",
199 | " 6 \n",
200 | " Anneke \n",
201 | " Preusig \n",
202 | " F \n",
203 | " 300000 \n",
204 | " Junior \n",
205 | " \n",
206 | " \n",
207 | " 6 \n",
208 | " 7 \n",
209 | " Tzvetan \n",
210 | " Zielinski \n",
211 | " F \n",
212 | " 150000 \n",
213 | " Junior \n",
214 | " \n",
215 | " \n",
216 | " 7 \n",
217 | " 8 \n",
218 | " Saniya \n",
219 | " Kalloufi \n",
220 | " M \n",
221 | " 750000 \n",
222 | " Mid \n",
223 | " \n",
224 | " \n",
225 | " 8 \n",
226 | " 9 \n",
227 | " Sumant \n",
228 | " Peac \n",
229 | " F \n",
230 | " 750000 \n",
231 | " Senior \n",
232 | " \n",
233 | " \n",
234 | " 9 \n",
235 | " 10 \n",
236 | " Duangkaew \n",
237 | " Piveteau \n",
238 | " F \n",
239 | " 200000 \n",
240 | " Junior \n",
241 | " \n",
242 | " \n",
243 | " 10 \n",
244 | " 11 \n",
245 | " Mary \n",
246 | " Sluis \n",
247 | " F \n",
248 | " 400000 \n",
249 | " Mid \n",
250 | " \n",
251 | " \n",
252 | " 11 \n",
253 | " 12 \n",
254 | " Patricio \n",
255 | " Bridgland \n",
256 | " M \n",
257 | " 200000 \n",
258 | " Junior \n",
259 | " \n",
260 | " \n",
261 | " 12 \n",
262 | " 13 \n",
263 | " Eberhardt \n",
264 | " Terkki \n",
265 | " M \n",
266 | " 300000 \n",
267 | " Junior \n",
268 | " \n",
269 | " \n",
270 | " 13 \n",
271 | " 14 \n",
272 | " Berni \n",
273 | " Genin \n",
274 | " M \n",
275 | " 150000 \n",
276 | " Junior \n",
277 | " \n",
278 | " \n",
279 | " 14 \n",
280 | " 15 \n",
281 | " Guoxiang \n",
282 | " Nooteboom \n",
283 | " M \n",
284 | " 300000 \n",
285 | " Mid \n",
286 | " \n",
287 | " \n",
288 | " 15 \n",
289 | " 16 \n",
290 | " Kazuhito \n",
291 | " Cappelletti \n",
292 | " M \n",
293 | " 150000 \n",
294 | " Junior \n",
295 | " \n",
296 | " \n",
297 | " 16 \n",
298 | " 17 \n",
299 | " Cristinel \n",
300 | " Bouloucos \n",
301 | " F \n",
302 | " 750000 \n",
303 | " Senior \n",
304 | " \n",
305 | " \n",
306 | " 17 \n",
307 | " 18 \n",
308 | " Kazuhide \n",
309 | " Peha \n",
310 | " F \n",
311 | " 200000 \n",
312 | " Junior \n",
313 | " \n",
314 | " \n",
315 | " 18 \n",
316 | " 19 \n",
317 | " Lillian \n",
318 | " Haddadi \n",
319 | " M \n",
320 | " 400000 \n",
321 | " Junior \n",
322 | " \n",
323 | " \n",
324 | " 19 \n",
325 | " 20 \n",
326 | " Mayuko \n",
327 | " Warwick \n",
328 | " M \n",
329 | " 200000 \n",
330 | " Mid \n",
331 | " \n",
332 | " \n",
333 | "
\n",
334 | "
"
335 | ],
336 | "text/plain": [
337 | " emp_id first_name last_name gender salary work_exp\n",
338 | "0 1 Georgi Facello M 500000 Mid\n",
339 | "1 2 Bezalel Simmel F 120000 Junior\n",
340 | "2 3 Parto Bamford M 350000 Junior\n",
341 | "3 4 Chirstian Koblick M 400000 Mid\n",
342 | "4 5 Kyoichi Maliniak M 200000 Junior\n",
343 | "5 6 Anneke Preusig F 300000 Junior\n",
344 | "6 7 Tzvetan Zielinski F 150000 Junior\n",
345 | "7 8 Saniya Kalloufi M 750000 Mid\n",
346 | "8 9 Sumant Peac F 750000 Senior\n",
347 | "9 10 Duangkaew Piveteau F 200000 Junior\n",
348 | "10 11 Mary Sluis F 400000 Mid\n",
349 | "11 12 Patricio Bridgland M 200000 Junior\n",
350 | "12 13 Eberhardt Terkki M 300000 Junior\n",
351 | "13 14 Berni Genin M 150000 Junior\n",
352 | "14 15 Guoxiang Nooteboom M 300000 Mid\n",
353 | "15 16 Kazuhito Cappelletti M 150000 Junior\n",
354 | "16 17 Cristinel Bouloucos F 750000 Senior\n",
355 | "17 18 Kazuhide Peha F 200000 Junior\n",
356 | "18 19 Lillian Haddadi M 400000 Junior\n",
357 | "19 20 Mayuko Warwick M 200000 Mid"
358 | ]
359 | },
360 | "execution_count": 2,
361 | "metadata": {},
362 | "output_type": "execute_result"
363 | }
364 | ],
365 | "source": [
366 | "data.head(20)"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {},
372 | "source": [
373 | "First, we will create a dataframe consisting of the features on which we would like to perform ANOVA, i.e. salary and work_exp in this case."
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 3,
379 | "metadata": {},
380 | "outputs": [
381 | {
382 | "data": {
383 | "text/html": [
384 | "\n",
385 | "\n",
398 | "
\n",
399 | " \n",
400 | " \n",
401 | " \n",
402 | " salary \n",
403 | " work_exp \n",
404 | " \n",
405 | " \n",
406 | " \n",
407 | " \n",
408 | " 0 \n",
409 | " 500000 \n",
410 | " Mid \n",
411 | " \n",
412 | " \n",
413 | " 1 \n",
414 | " 120000 \n",
415 | " Junior \n",
416 | " \n",
417 | " \n",
418 | " 2 \n",
419 | " 350000 \n",
420 | " Junior \n",
421 | " \n",
422 | " \n",
423 | " 3 \n",
424 | " 400000 \n",
425 | " Mid \n",
426 | " \n",
427 | " \n",
428 | " 4 \n",
429 | " 200000 \n",
430 | " Junior \n",
431 | " \n",
432 | " \n",
433 | "
\n",
434 | "
"
435 | ],
436 | "text/plain": [
437 | " salary work_exp\n",
438 | "0 500000 Mid\n",
439 | "1 120000 Junior\n",
440 | "2 350000 Junior\n",
441 | "3 400000 Mid\n",
442 | "4 200000 Junior"
443 | ]
444 | },
445 | "execution_count": 3,
446 | "metadata": {},
447 | "output_type": "execute_result"
448 | }
449 | ],
450 | "source": [
451 | "df = data[['salary', 'work_exp']]\n",
452 | "df.head()"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "metadata": {},
458 | "source": [
459 | "In the following step, we will import the required library and perform one-way ANOVA."
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 4,
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "data": {
469 | "text/plain": [
470 | "(43.767065333017584, 2.84255584357536e-14)"
471 | ]
472 | },
473 | "execution_count": 4,
474 | "metadata": {},
475 | "output_type": "execute_result"
476 | }
477 | ],
478 | "source": [
479 | "from scipy import stats\n",
480 | "\n",
481 | "F, p = stats.f_oneway(df[df.work_exp == 'Junior'].salary,\n",
482 | " df[df.work_exp == 'Mid'].salary,\n",
483 | " df[df.work_exp == 'Senior'].salary)\n",
484 | "F, p"
485 | ]
486 | },
487 | {
488 | "cell_type": "markdown",
489 | "metadata": {},
490 | "source": [
491 | "Now we will perform a few things to understand the concept better:\n",
492 | "* Create 3 bins based on salary and name them Low, Medium and High\n",
493 | "* Notice that these bins are created according to the salary values inside the dataset\n",
494 | "* Now we will perform one-way ANOVA based on the newly created bins"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 5,
500 | "metadata": {},
501 | "outputs": [
502 | {
503 | "data": {
504 | "text/plain": [
505 | "array([120000., 330000., 540000., 750000.])"
506 | ]
507 | },
508 | "execution_count": 5,
509 | "metadata": {},
510 | "output_type": "execute_result"
511 | }
512 | ],
513 | "source": [
514 | "import numpy as np\n",
515 | "bins = np.linspace(min(df['salary']), max(df['salary']), 4)\n",
516 | "bins"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 6,
522 | "metadata": {},
523 | "outputs": [
524 | {
525 | "data": {
526 | "text/html": [
527 | "\n",
528 | "\n",
541 | "
\n",
542 | " \n",
543 | " \n",
544 | " \n",
545 | " emp_id \n",
546 | " first_name \n",
547 | " last_name \n",
548 | " gender \n",
549 | " salary \n",
550 | " work_exp \n",
551 | " new_salary_group \n",
552 | " \n",
553 | " \n",
554 | " \n",
555 | " \n",
556 | " 0 \n",
557 | " 1 \n",
558 | " Georgi \n",
559 | " Facello \n",
560 | " M \n",
561 | " 500000 \n",
562 | " Mid \n",
563 | " Medium \n",
564 | " \n",
565 | " \n",
566 | " 1 \n",
567 | " 2 \n",
568 | " Bezalel \n",
569 | " Simmel \n",
570 | " F \n",
571 | " 120000 \n",
572 | " Junior \n",
573 | " Low \n",
574 | " \n",
575 | " \n",
576 | " 2 \n",
577 | " 3 \n",
578 | " Parto \n",
579 | " Bamford \n",
580 | " M \n",
581 | " 350000 \n",
582 | " Junior \n",
583 | " Medium \n",
584 | " \n",
585 | " \n",
586 | " 3 \n",
587 | " 4 \n",
588 | " Chirstian \n",
589 | " Koblick \n",
590 | " M \n",
591 | " 400000 \n",
592 | " Mid \n",
593 | " Medium \n",
594 | " \n",
595 | " \n",
596 | " 4 \n",
597 | " 5 \n",
598 | " Kyoichi \n",
599 | " Maliniak \n",
600 | " M \n",
601 | " 200000 \n",
602 | " Junior \n",
603 | " Low \n",
604 | " \n",
605 | " \n",
606 | " 5 \n",
607 | " 6 \n",
608 | " Anneke \n",
609 | " Preusig \n",
610 | " F \n",
611 | " 300000 \n",
612 | " Junior \n",
613 | " Low \n",
614 | " \n",
615 | " \n",
616 | " 6 \n",
617 | " 7 \n",
618 | " Tzvetan \n",
619 | " Zielinski \n",
620 | " F \n",
621 | " 150000 \n",
622 | " Junior \n",
623 | " Low \n",
624 | " \n",
625 | " \n",
626 | " 7 \n",
627 | " 8 \n",
628 | " Saniya \n",
629 | " Kalloufi \n",
630 | " M \n",
631 | " 750000 \n",
632 | " Mid \n",
633 | " High \n",
634 | " \n",
635 | " \n",
636 | " 8 \n",
637 | " 9 \n",
638 | " Sumant \n",
639 | " Peac \n",
640 | " F \n",
641 | " 750000 \n",
642 | " Senior \n",
643 | " High \n",
644 | " \n",
645 | " \n",
646 | " 9 \n",
647 | " 10 \n",
648 | " Duangkaew \n",
649 | " Piveteau \n",
650 | " F \n",
651 | " 200000 \n",
652 | " Junior \n",
653 | " Low \n",
654 | " \n",
655 | " \n",
656 | "
\n",
657 | "
"
658 | ],
659 | "text/plain": [
660 | " emp_id first_name last_name gender salary work_exp new_salary_group\n",
661 | "0 1 Georgi Facello M 500000 Mid Medium\n",
662 | "1 2 Bezalel Simmel F 120000 Junior Low\n",
663 | "2 3 Parto Bamford M 350000 Junior Medium\n",
664 | "3 4 Chirstian Koblick M 400000 Mid Medium\n",
665 | "4 5 Kyoichi Maliniak M 200000 Junior Low\n",
666 | "5 6 Anneke Preusig F 300000 Junior Low\n",
667 | "6 7 Tzvetan Zielinski F 150000 Junior Low\n",
668 | "7 8 Saniya Kalloufi M 750000 Mid High\n",
669 | "8 9 Sumant Peac F 750000 Senior High\n",
670 | "9 10 Duangkaew Piveteau F 200000 Junior Low"
671 | ]
672 | },
673 | "execution_count": 6,
674 | "metadata": {},
675 | "output_type": "execute_result"
676 | }
677 | ],
678 | "source": [
679 | "bin_names = ['Low', 'Medium', 'High']\n",
680 | "data['new_salary_group'] = pd.cut(data['salary'], bins, labels=bin_names, include_lowest=True)\n",
681 | "data.head(10)"
682 | ]
683 | },
684 | {
685 | "cell_type": "markdown",
686 | "metadata": {},
687 | "source": [
688 | "Since the new bins are derived from the salary values themselves (the lowest salaries all fall into the \"Low\" group, and so on), the F-test score will be high."
689 | ]
690 | },
691 | {
692 | "cell_type": "code",
693 | "execution_count": 7,
694 | "metadata": {},
695 | "outputs": [
696 | {
697 | "data": {
698 | "text/plain": [
699 | "(579.7277387424342, 1.1214878408179476e-54)"
700 | ]
701 | },
702 | "execution_count": 7,
703 | "metadata": {},
704 | "output_type": "execute_result"
705 | }
706 | ],
707 | "source": [
708 | "F, p = stats.f_oneway(data[data.new_salary_group == 'Low'].salary,\n",
709 | " data[data.new_salary_group == 'Medium'].salary,\n",
710 | " data[data.new_salary_group == 'High'].salary)\n",
711 | "F, p"
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": null,
717 | "metadata": {},
718 | "outputs": [],
719 | "source": []
720 | }
721 | ],
722 | "metadata": {
723 | "kernelspec": {
724 | "display_name": "Python 3",
725 | "language": "python",
726 | "name": "python3"
727 | },
728 | "language_info": {
729 | "codemirror_mode": {
730 | "name": "ipython",
731 | "version": 3
732 | },
733 | "file_extension": ".py",
734 | "mimetype": "text/x-python",
735 | "name": "python",
736 | "nbconvert_exporter": "python",
737 | "pygments_lexer": "ipython3",
738 | "version": "3.9.1"
739 | }
740 | },
741 | "nbformat": 4,
742 | "nbformat_minor": 4
743 | }
744 |
--------------------------------------------------------------------------------
/P06-MultipleLinearRegression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "%matplotlib inline"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 3,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " \n",
41 | " Country \n",
42 | " Year \n",
43 | " Status \n",
44 | " Life expectancy \n",
45 | " Adult Mortality \n",
46 | " infant deaths \n",
47 | " Alcohol \n",
48 | " percentage expenditure \n",
49 | " Hepatitis B \n",
50 | " Measles \n",
51 | " ... \n",
52 | " Polio \n",
53 | " Total expenditure \n",
54 | " Diphtheria \n",
55 | " HIV/AIDS \n",
56 | " GDP \n",
57 | " Population \n",
58 | " thinness 1-19 years \n",
59 | " thinness 5-9 years \n",
60 | " Income composition of resources \n",
61 | " Schooling \n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " \n",
66 | " 0 \n",
67 | " Afghanistan \n",
68 | " 2015 \n",
69 | " Developing \n",
70 | " 65.0 \n",
71 | " 263.0 \n",
72 | " 62 \n",
73 | " 0.01 \n",
74 | " 71.279624 \n",
75 | " 65.0 \n",
76 | " 1154 \n",
77 | " ... \n",
78 | " 6.0 \n",
79 | " 8.16 \n",
80 | " 65.0 \n",
81 | " 0.1 \n",
82 | " 584.259210 \n",
83 | " 33736494.0 \n",
84 | " 17.2 \n",
85 | " 17.3 \n",
86 | " 0.479 \n",
87 | " 10.1 \n",
88 | " \n",
89 | " \n",
90 | " 1 \n",
91 | " Afghanistan \n",
92 | " 2014 \n",
93 | " Developing \n",
94 | " 59.9 \n",
95 | " 271.0 \n",
96 | " 64 \n",
97 | " 0.01 \n",
98 | " 73.523582 \n",
99 | " 62.0 \n",
100 | " 492 \n",
101 | " ... \n",
102 | " 58.0 \n",
103 | " 8.18 \n",
104 | " 62.0 \n",
105 | " 0.1 \n",
106 | " 612.696514 \n",
107 | " 327582.0 \n",
108 | " 17.5 \n",
109 | " 17.5 \n",
110 | " 0.476 \n",
111 | " 10.0 \n",
112 | " \n",
113 | " \n",
114 | " 2 \n",
115 | " Afghanistan \n",
116 | " 2013 \n",
117 | " Developing \n",
118 | " 59.9 \n",
119 | " 268.0 \n",
120 | " 66 \n",
121 | " 0.01 \n",
122 | " 73.219243 \n",
123 | " 64.0 \n",
124 | " 430 \n",
125 | " ... \n",
126 | " 62.0 \n",
127 | " 8.13 \n",
128 | " 64.0 \n",
129 | " 0.1 \n",
130 | " 631.744976 \n",
131 | " 31731688.0 \n",
132 | " 17.7 \n",
133 | " 17.7 \n",
134 | " 0.470 \n",
135 | " 9.9 \n",
136 | " \n",
137 | " \n",
138 | " 3 \n",
139 | " Afghanistan \n",
140 | " 2012 \n",
141 | " Developing \n",
142 | " 59.5 \n",
143 | " 272.0 \n",
144 | " 69 \n",
145 | " 0.01 \n",
146 | " 78.184215 \n",
147 | " 67.0 \n",
148 | " 2787 \n",
149 | " ... \n",
150 | " 67.0 \n",
151 | " 8.52 \n",
152 | " 67.0 \n",
153 | " 0.1 \n",
154 | " 669.959000 \n",
155 | " 3696958.0 \n",
156 | " 17.9 \n",
157 | " 18.0 \n",
158 | " 0.463 \n",
159 | " 9.8 \n",
160 | " \n",
161 | " \n",
162 | " 4 \n",
163 | " Afghanistan \n",
164 | " 2011 \n",
165 | " Developing \n",
166 | " 59.2 \n",
167 | " 275.0 \n",
168 | " 71 \n",
169 | " 0.01 \n",
170 | " 7.097109 \n",
171 | " 68.0 \n",
172 | " 3013 \n",
173 | " ... \n",
174 | " 68.0 \n",
175 | " 7.87 \n",
176 | " 68.0 \n",
177 | " 0.1 \n",
178 | " 63.537231 \n",
179 | " 2978599.0 \n",
180 | " 18.2 \n",
181 | " 18.2 \n",
182 | " 0.454 \n",
183 | " 9.5 \n",
184 | " \n",
185 | " \n",
186 | "
\n",
187 | "
5 rows × 22 columns
\n",
188 | "
"
189 | ],
190 | "text/plain": [
191 | " Country Year Status Life expectancy Adult Mortality \\\n",
192 | "0 Afghanistan 2015 Developing 65.0 263.0 \n",
193 | "1 Afghanistan 2014 Developing 59.9 271.0 \n",
194 | "2 Afghanistan 2013 Developing 59.9 268.0 \n",
195 | "3 Afghanistan 2012 Developing 59.5 272.0 \n",
196 | "4 Afghanistan 2011 Developing 59.2 275.0 \n",
197 | "\n",
198 | " infant deaths Alcohol percentage expenditure Hepatitis B Measles \\\n",
199 | "0 62 0.01 71.279624 65.0 1154 \n",
200 | "1 64 0.01 73.523582 62.0 492 \n",
201 | "2 66 0.01 73.219243 64.0 430 \n",
202 | "3 69 0.01 78.184215 67.0 2787 \n",
203 | "4 71 0.01 7.097109 68.0 3013 \n",
204 | "\n",
205 | " ... Polio Total expenditure Diphtheria HIV/AIDS GDP \\\n",
206 | "0 ... 6.0 8.16 65.0 0.1 584.259210 \n",
207 | "1 ... 58.0 8.18 62.0 0.1 612.696514 \n",
208 | "2 ... 62.0 8.13 64.0 0.1 631.744976 \n",
209 | "3 ... 67.0 8.52 67.0 0.1 669.959000 \n",
210 | "4 ... 68.0 7.87 68.0 0.1 63.537231 \n",
211 | "\n",
212 | " Population thinness 1-19 years thinness 5-9 years \\\n",
213 | "0 33736494.0 17.2 17.3 \n",
214 | "1 327582.0 17.5 17.5 \n",
215 | "2 31731688.0 17.7 17.7 \n",
216 | "3 3696958.0 17.9 18.0 \n",
217 | "4 2978599.0 18.2 18.2 \n",
218 | "\n",
219 | " Income composition of resources Schooling \n",
220 | "0 0.479 10.1 \n",
221 | "1 0.476 10.0 \n",
222 | "2 0.470 9.9 \n",
223 | "3 0.463 9.8 \n",
224 | "4 0.454 9.5 \n",
225 | "\n",
226 | "[5 rows x 22 columns]"
227 | ]
228 | },
229 | "execution_count": 3,
230 | "metadata": {},
231 | "output_type": "execute_result"
232 | }
233 | ],
234 | "source": [
235 | "df = pd.read_csv('https://raw.githubusercontent.com/myanmards/resource_files/master/regression-one.csv')\n",
236 | "df.head()"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 4,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "data": {
246 | "text/html": [
247 | "\n",
248 | "\n",
261 | "
\n",
262 | " \n",
263 | " \n",
264 | " \n",
265 | " Country \n",
266 | " Year \n",
267 | " Status \n",
268 | " life-expect \n",
269 | " Adult Mortality \n",
270 | " infant deaths \n",
271 | " Alcohol \n",
272 | " percentage expenditure \n",
273 | " Hepatitis B \n",
274 | " Measles \n",
275 | " ... \n",
276 | " Polio \n",
277 | " Total expenditure \n",
278 | " Diphtheria \n",
279 | " HIV/AIDS \n",
280 | " GDP \n",
281 | " Population \n",
282 | " thinness 1-19 years \n",
283 | " thinness 5-9 years \n",
284 | " Income composition of resources \n",
285 | " Schooling \n",
286 | " \n",
287 | " \n",
288 | " \n",
289 | " \n",
290 | " 0 \n",
291 | " Afghanistan \n",
292 | " 2015 \n",
293 | " Developing \n",
294 | " 65.0 \n",
295 | " 263.0 \n",
296 | " 62 \n",
297 | " 0.01 \n",
298 | " 71.279624 \n",
299 | " 65.0 \n",
300 | " 1154 \n",
301 | " ... \n",
302 | " 6.0 \n",
303 | " 8.16 \n",
304 | " 65.0 \n",
305 | " 0.1 \n",
306 | " 584.259210 \n",
307 | " 33736494.0 \n",
308 | " 17.2 \n",
309 | " 17.3 \n",
310 | " 0.479 \n",
311 | " 10.1 \n",
312 | " \n",
313 | " \n",
314 | " 1 \n",
315 | " Afghanistan \n",
316 | " 2014 \n",
317 | " Developing \n",
318 | " 59.9 \n",
319 | " 271.0 \n",
320 | " 64 \n",
321 | " 0.01 \n",
322 | " 73.523582 \n",
323 | " 62.0 \n",
324 | " 492 \n",
325 | " ... \n",
326 | " 58.0 \n",
327 | " 8.18 \n",
328 | " 62.0 \n",
329 | " 0.1 \n",
330 | " 612.696514 \n",
331 | " 327582.0 \n",
332 | " 17.5 \n",
333 | " 17.5 \n",
334 | " 0.476 \n",
335 | " 10.0 \n",
336 | " \n",
337 | " \n",
338 | " 2 \n",
339 | " Afghanistan \n",
340 | " 2013 \n",
341 | " Developing \n",
342 | " 59.9 \n",
343 | " 268.0 \n",
344 | " 66 \n",
345 | " 0.01 \n",
346 | " 73.219243 \n",
347 | " 64.0 \n",
348 | " 430 \n",
349 | " ... \n",
350 | " 62.0 \n",
351 | " 8.13 \n",
352 | " 64.0 \n",
353 | " 0.1 \n",
354 | " 631.744976 \n",
355 | " 31731688.0 \n",
356 | " 17.7 \n",
357 | " 17.7 \n",
358 | " 0.470 \n",
359 | " 9.9 \n",
360 | " \n",
361 | " \n",
362 | " 3 \n",
363 | " Afghanistan \n",
364 | " 2012 \n",
365 | " Developing \n",
366 | " 59.5 \n",
367 | " 272.0 \n",
368 | " 69 \n",
369 | " 0.01 \n",
370 | " 78.184215 \n",
371 | " 67.0 \n",
372 | " 2787 \n",
373 | " ... \n",
374 | " 67.0 \n",
375 | " 8.52 \n",
376 | " 67.0 \n",
377 | " 0.1 \n",
378 | " 669.959000 \n",
379 | " 3696958.0 \n",
380 | " 17.9 \n",
381 | " 18.0 \n",
382 | " 0.463 \n",
383 | " 9.8 \n",
384 | " \n",
385 | " \n",
386 | " 4 \n",
387 | " Afghanistan \n",
388 | " 2011 \n",
389 | " Developing \n",
390 | " 59.2 \n",
391 | " 275.0 \n",
392 | " 71 \n",
393 | " 0.01 \n",
394 | " 7.097109 \n",
395 | " 68.0 \n",
396 | " 3013 \n",
397 | " ... \n",
398 | " 68.0 \n",
399 | " 7.87 \n",
400 | " 68.0 \n",
401 | " 0.1 \n",
402 | " 63.537231 \n",
403 | " 2978599.0 \n",
404 | " 18.2 \n",
405 | " 18.2 \n",
406 | " 0.454 \n",
407 | " 9.5 \n",
408 | " \n",
409 | " \n",
410 | "
\n",
411 | "
5 rows × 22 columns
\n",
412 | "
"
413 | ],
414 | "text/plain": [
415 | " Country Year Status life-expect Adult Mortality infant deaths \\\n",
416 | "0 Afghanistan 2015 Developing 65.0 263.0 62 \n",
417 | "1 Afghanistan 2014 Developing 59.9 271.0 64 \n",
418 | "2 Afghanistan 2013 Developing 59.9 268.0 66 \n",
419 | "3 Afghanistan 2012 Developing 59.5 272.0 69 \n",
420 | "4 Afghanistan 2011 Developing 59.2 275.0 71 \n",
421 | "\n",
422 | " Alcohol percentage expenditure Hepatitis B Measles ... Polio \\\n",
423 | "0 0.01 71.279624 65.0 1154 ... 6.0 \n",
424 | "1 0.01 73.523582 62.0 492 ... 58.0 \n",
425 | "2 0.01 73.219243 64.0 430 ... 62.0 \n",
426 | "3 0.01 78.184215 67.0 2787 ... 67.0 \n",
427 | "4 0.01 7.097109 68.0 3013 ... 68.0 \n",
428 | "\n",
429 | " Total expenditure Diphtheria HIV/AIDS GDP Population \\\n",
430 | "0 8.16 65.0 0.1 584.259210 33736494.0 \n",
431 | "1 8.18 62.0 0.1 612.696514 327582.0 \n",
432 | "2 8.13 64.0 0.1 631.744976 31731688.0 \n",
433 | "3 8.52 67.0 0.1 669.959000 3696958.0 \n",
434 | "4 7.87 68.0 0.1 63.537231 2978599.0 \n",
435 | "\n",
436 | " thinness 1-19 years thinness 5-9 years Income composition of resources \\\n",
437 | "0 17.2 17.3 0.479 \n",
438 | "1 17.5 17.5 0.476 \n",
439 | "2 17.7 17.7 0.470 \n",
440 | "3 17.9 18.0 0.463 \n",
441 | "4 18.2 18.2 0.454 \n",
442 | "\n",
443 | " Schooling \n",
444 | "0 10.1 \n",
445 | "1 10.0 \n",
446 | "2 9.9 \n",
447 | "3 9.8 \n",
448 | "4 9.5 \n",
449 | "\n",
450 | "[5 rows x 22 columns]"
451 | ]
452 | },
453 | "execution_count": 4,
454 | "metadata": {},
455 | "output_type": "execute_result"
456 | }
457 | ],
458 | "source": [
459 | "df.rename(columns= {'Life expectancy': 'life-expect'}, inplace = True)\n",
460 | "df.head()"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 5,
466 | "metadata": {},
467 | "outputs": [
468 | {
469 | "data": {
470 | "text/plain": [
471 | "(1649, 22)"
472 | ]
473 | },
474 | "execution_count": 5,
475 | "metadata": {},
476 | "output_type": "execute_result"
477 | }
478 | ],
479 | "source": [
480 | "df = df.dropna()\n",
481 | "df.shape"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 7,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/html": [
492 | "\n",
493 | "\n",
506 | "
\n",
507 | " \n",
508 | " \n",
509 | " \n",
510 | " Country \n",
511 | " Year \n",
512 | " Status \n",
513 | " life-expect \n",
514 | " Adult Mortality \n",
515 | " infant deaths \n",
516 | " Alcohol \n",
517 | " percentage expenditure \n",
518 | " Hepatitis B \n",
519 | " Measles \n",
520 | " ... \n",
521 | " Polio \n",
522 | " Total expenditure \n",
523 | " Diphtheria \n",
524 | " HIV/AIDS \n",
525 | " GDP \n",
526 | " Population \n",
527 | " thinness 1-19 years \n",
528 | " thinness 5-9 years \n",
529 | " Income composition of resources \n",
530 | " Schooling \n",
531 | " \n",
532 | " \n",
533 | " \n",
534 | " \n",
535 | "
\n",
536 | "
0 rows × 22 columns
\n",
537 | "
"
538 | ],
539 | "text/plain": [
540 | "Empty DataFrame\n",
541 | "Columns: [Country, Year, Status, life-expect, Adult Mortality, infant deaths, Alcohol, percentage expenditure, Hepatitis B, Measles, BMI, under-five deaths, Polio, Total expenditure, Diphtheria, HIV/AIDS, GDP, Population, thinness 1-19 years, thinness 5-9 years, Income composition of resources, Schooling]\n",
542 | "Index: []\n",
543 | "\n",
544 | "[0 rows x 22 columns]"
545 | ]
546 | },
547 | "execution_count": 7,
548 | "metadata": {},
549 | "output_type": "execute_result"
550 | }
551 | ],
552 | "source": [
553 | "df[df.isnull().any(axis=1)]"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 8,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "X = df[['GDP', 'Alcohol', 'BMI', 'Schooling']]\n",
563 | "Y = df['life-expect']"
564 | ]
565 | },
566 | {
567 | "cell_type": "code",
568 | "execution_count": 11,
569 | "metadata": {},
570 | "outputs": [
571 | {
572 | "data": {
573 | "text/plain": [
574 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
575 | ]
576 | },
577 | "execution_count": 11,
578 | "metadata": {},
579 | "output_type": "execute_result"
580 | }
581 | ],
582 | "source": [
583 | "from sklearn.linear_model import LinearRegression\n",
584 | "lm = LinearRegression()\n",
585 | "lm.fit(X, Y)"
586 | ]
587 | },
588 | {
589 | "cell_type": "markdown",
590 | "metadata": {},
591 | "source": [
592 | "The intercept of the fitted model is available as `lm.intercept_`."
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "execution_count": 15,
598 | "metadata": {},
599 | "outputs": [
600 | {
601 | "data": {
602 | "text/plain": [
603 | "array([ 1.15265737e-04, -2.52732304e-01, 8.91904815e-02, 1.94339619e+00])"
604 | ]
605 | },
606 | "execution_count": 15,
607 | "metadata": {},
608 | "output_type": "execute_result"
609 | }
610 | ],
611 | "source": [
612 | "lm.coef_"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": 16,
618 | "metadata": {},
619 | "outputs": [],
620 | "source": [
621 | "Y_hat = lm.predict(X)"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 17,
627 | "metadata": {},
628 | "outputs": [
629 | {
630 | "data": {
631 | "text/html": [
632 | "\n",
633 | "\n",
646 | "
\n",
647 | " \n",
648 | " \n",
649 | " \n",
650 | " Actual \n",
651 | " Predicted \n",
652 | " \n",
653 | " \n",
654 | " \n",
655 | " \n",
656 | " 0 \n",
657 | " 65.0 \n",
658 | " 64.248614 \n",
659 | " \n",
660 | " \n",
661 | " 1 \n",
662 | " 59.9 \n",
663 | " 64.012957 \n",
664 | " \n",
665 | " \n",
666 | " 2 \n",
667 | " 59.9 \n",
668 | " 63.776218 \n",
669 | " \n",
670 | " \n",
671 | " 3 \n",
672 | " 59.5 \n",
673 | " 63.541688 \n",
674 | " \n",
675 | " \n",
676 | " 4 \n",
677 | " 59.2 \n",
678 | " 62.853093 \n",
679 | " \n",
680 | " \n",
681 | " 5 \n",
682 | " 58.8 \n",
683 | " 62.281935 \n",
684 | " \n",
685 | " \n",
686 | " 6 \n",
687 | " 58.6 \n",
688 | " 61.641938 \n",
689 | " \n",
690 | " \n",
691 | " 7 \n",
692 | " 58.1 \n",
693 | " 61.195248 \n",
694 | " \n",
695 | " \n",
696 | " 8 \n",
697 | " 57.5 \n",
698 | " 60.569755 \n",
699 | " \n",
700 | " \n",
701 | " 9 \n",
702 | " 57.3 \n",
703 | " 59.928401 \n",
704 | " \n",
705 | " \n",
706 | " 10 \n",
707 | " 57.3 \n",
708 | " 59.469152 \n",
709 | " \n",
710 | " \n",
711 | " 11 \n",
712 | " 57.0 \n",
713 | " 57.318084 \n",
714 | " \n",
715 | " \n",
716 | " 12 \n",
717 | " 56.7 \n",
718 | " 56.699564 \n",
719 | " \n",
720 | " \n",
721 | " 13 \n",
722 | " 56.2 \n",
723 | " 56.079614 \n",
724 | " \n",
725 | " \n",
726 | " 14 \n",
727 | " 55.3 \n",
728 | " 55.452810 \n",
729 | " \n",
730 | " \n",
731 | " 15 \n",
732 | " 54.8 \n",
733 | " 54.639437 \n",
734 | " \n",
735 | " \n",
736 | " 16 \n",
737 | " 77.8 \n",
738 | " 74.914449 \n",
739 | " \n",
740 | " \n",
741 | " 17 \n",
742 | " 77.5 \n",
743 | " 74.937484 \n",
744 | " \n",
745 | " \n",
746 | " 18 \n",
747 | " 77.2 \n",
748 | " 74.793305 \n",
749 | " \n",
750 | " \n",
751 | " 19 \n",
752 | " 76.9 \n",
753 | " 74.615572 \n",
754 | " \n",
755 | " \n",
756 | "
\n",
757 | "
"
758 | ],
759 | "text/plain": [
760 | " Actual Predicted\n",
761 | "0 65.0 64.248614\n",
762 | "1 59.9 64.012957\n",
763 | "2 59.9 63.776218\n",
764 | "3 59.5 63.541688\n",
765 | "4 59.2 62.853093\n",
766 | "5 58.8 62.281935\n",
767 | "6 58.6 61.641938\n",
768 | "7 58.1 61.195248\n",
769 | "8 57.5 60.569755\n",
770 | "9 57.3 59.928401\n",
771 | "10 57.3 59.469152\n",
772 | "11 57.0 57.318084\n",
773 | "12 56.7 56.699564\n",
774 | "13 56.2 56.079614\n",
775 | "14 55.3 55.452810\n",
776 | "15 54.8 54.639437\n",
777 | "16 77.8 74.914449\n",
778 | "17 77.5 74.937484\n",
779 | "18 77.2 74.793305\n",
780 | "19 76.9 74.615572"
781 | ]
782 | },
783 | "execution_count": 17,
784 | "metadata": {},
785 | "output_type": "execute_result"
786 | }
787 | ],
788 | "source": [
789 | "data = pd.DataFrame({'Actual':df['life-expect'], 'Predicted': Y_hat})\n",
790 | "data.head(20)"
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": 18,
796 | "metadata": {},
797 | "outputs": [
798 | {
799 | "data": {
800 | "image/png": "\n",
801 | "text/plain": [
802 | ""
803 | ]
804 | },
805 | "metadata": {},
806 | "output_type": "display_data"
807 | }
808 | ],
809 | "source": [
810 | "import seaborn as sns\n",
811 | "\n",
812 | "axs = sns.distplot(df['life-expect'], hist=False, color=\"r\", label=\"Actual Value\")\n",
813 | "sns.distplot(Y_hat, hist=False, color=\"b\", label=\"Fitted Values\", ax=axs)\n",
814 | "\n",
815 | "plt.title('Actual vs Fitted Values for LE')\n",
816 | "plt.xlabel('Age Range')\n",
817 | "plt.show()"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": null,
823 | "metadata": {},
824 | "outputs": [],
825 | "source": []
826 | }
827 | ],
828 | "metadata": {
829 | "kernelspec": {
830 | "display_name": "Python 3",
831 | "language": "python",
832 | "name": "python3"
833 | },
834 | "language_info": {
835 | "codemirror_mode": {
836 | "name": "ipython",
837 | "version": 3
838 | },
839 | "file_extension": ".py",
840 | "mimetype": "text/x-python",
841 | "name": "python",
842 | "nbconvert_exporter": "python",
843 | "pygments_lexer": "ipython3",
844 | "version": "3.6.4"
845 | }
846 | },
847 | "nbformat": 4,
848 | "nbformat_minor": 2
849 | }
850 |
--------------------------------------------------------------------------------
/P08-Model Evaluation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "%matplotlib inline"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 6,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " \n",
41 | " Country \n",
42 | " Year \n",
43 | " Status \n",
44 | " Life expectancy \n",
45 | " Adult Mortality \n",
46 | " infant deaths \n",
47 | " Alcohol \n",
48 | " percentage expenditure \n",
49 | " Hepatitis B \n",
50 | " Measles \n",
51 | " ... \n",
52 | " Polio \n",
53 | " Total expenditure \n",
54 | " Diphtheria \n",
55 | " HIV/AIDS \n",
56 | " GDP \n",
57 | " Population \n",
58 | " thinness 1-19 years \n",
59 | " thinness 5-9 years \n",
60 | " Income composition of resources \n",
61 | " Schooling \n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " \n",
66 | " 0 \n",
67 | " Afghanistan \n",
68 | " 2015 \n",
69 | " Developing \n",
70 | " 65.0 \n",
71 | " 263.0 \n",
72 | " 62 \n",
73 | " 0.01 \n",
74 | " 71.279624 \n",
75 | " 65.0 \n",
76 | " 1154 \n",
77 | " ... \n",
78 | " 6.0 \n",
79 | " 8.16 \n",
80 | " 65.0 \n",
81 | " 0.1 \n",
82 | " 584.259210 \n",
83 | " 33736494.0 \n",
84 | " 17.2 \n",
85 | " 17.3 \n",
86 | " 0.479 \n",
87 | " 10.1 \n",
88 | " \n",
89 | " \n",
90 | " 1 \n",
91 | " Afghanistan \n",
92 | " 2014 \n",
93 | " Developing \n",
94 | " 59.9 \n",
95 | " 271.0 \n",
96 | " 64 \n",
97 | " 0.01 \n",
98 | " 73.523582 \n",
99 | " 62.0 \n",
100 | " 492 \n",
101 | " ... \n",
102 | " 58.0 \n",
103 | " 8.18 \n",
104 | " 62.0 \n",
105 | " 0.1 \n",
106 | " 612.696514 \n",
107 | " 327582.0 \n",
108 | " 17.5 \n",
109 | " 17.5 \n",
110 | " 0.476 \n",
111 | " 10.0 \n",
112 | " \n",
113 | " \n",
114 | " 2 \n",
115 | " Afghanistan \n",
116 | " 2013 \n",
117 | " Developing \n",
118 | " 59.9 \n",
119 | " 268.0 \n",
120 | " 66 \n",
121 | " 0.01 \n",
122 | " 73.219243 \n",
123 | " 64.0 \n",
124 | " 430 \n",
125 | " ... \n",
126 | " 62.0 \n",
127 | " 8.13 \n",
128 | " 64.0 \n",
129 | " 0.1 \n",
130 | " 631.744976 \n",
131 | " 31731688.0 \n",
132 | " 17.7 \n",
133 | " 17.7 \n",
134 | " 0.470 \n",
135 | " 9.9 \n",
136 | " \n",
137 | " \n",
138 | " 3 \n",
139 | " Afghanistan \n",
140 | " 2012 \n",
141 | " Developing \n",
142 | " 59.5 \n",
143 | " 272.0 \n",
144 | " 69 \n",
145 | " 0.01 \n",
146 | " 78.184215 \n",
147 | " 67.0 \n",
148 | " 2787 \n",
149 | " ... \n",
150 | " 67.0 \n",
151 | " 8.52 \n",
152 | " 67.0 \n",
153 | " 0.1 \n",
154 | " 669.959000 \n",
155 | " 3696958.0 \n",
156 | " 17.9 \n",
157 | " 18.0 \n",
158 | " 0.463 \n",
159 | " 9.8 \n",
160 | " \n",
161 | " \n",
162 | " 4 \n",
163 | " Afghanistan \n",
164 | " 2011 \n",
165 | " Developing \n",
166 | " 59.2 \n",
167 | " 275.0 \n",
168 | " 71 \n",
169 | " 0.01 \n",
170 | " 7.097109 \n",
171 | " 68.0 \n",
172 | " 3013 \n",
173 | " ... \n",
174 | " 68.0 \n",
175 | " 7.87 \n",
176 | " 68.0 \n",
177 | " 0.1 \n",
178 | " 63.537231 \n",
179 | " 2978599.0 \n",
180 | " 18.2 \n",
181 | " 18.2 \n",
182 | " 0.454 \n",
183 | " 9.5 \n",
184 | " \n",
185 | " \n",
186 | "
\n",
187 | "
5 rows × 22 columns
\n",
188 | "
"
189 | ],
190 | "text/plain": [
191 | " Country Year Status Life expectancy Adult Mortality \\\n",
192 | "0 Afghanistan 2015 Developing 65.0 263.0 \n",
193 | "1 Afghanistan 2014 Developing 59.9 271.0 \n",
194 | "2 Afghanistan 2013 Developing 59.9 268.0 \n",
195 | "3 Afghanistan 2012 Developing 59.5 272.0 \n",
196 | "4 Afghanistan 2011 Developing 59.2 275.0 \n",
197 | "\n",
198 | " infant deaths Alcohol percentage expenditure Hepatitis B Measles ... \\\n",
199 | "0 62 0.01 71.279624 65.0 1154 ... \n",
200 | "1 64 0.01 73.523582 62.0 492 ... \n",
201 | "2 66 0.01 73.219243 64.0 430 ... \n",
202 | "3 69 0.01 78.184215 67.0 2787 ... \n",
203 | "4 71 0.01 7.097109 68.0 3013 ... \n",
204 | "\n",
205 | " Polio Total expenditure Diphtheria HIV/AIDS GDP Population \\\n",
206 | "0 6.0 8.16 65.0 0.1 584.259210 33736494.0 \n",
207 | "1 58.0 8.18 62.0 0.1 612.696514 327582.0 \n",
208 | "2 62.0 8.13 64.0 0.1 631.744976 31731688.0 \n",
209 | "3 67.0 8.52 67.0 0.1 669.959000 3696958.0 \n",
210 | "4 68.0 7.87 68.0 0.1 63.537231 2978599.0 \n",
211 | "\n",
212 | " thinness 1-19 years thinness 5-9 years Income composition of resources \\\n",
213 | "0 17.2 17.3 0.479 \n",
214 | "1 17.5 17.5 0.476 \n",
215 | "2 17.7 17.7 0.470 \n",
216 | "3 17.9 18.0 0.463 \n",
217 | "4 18.2 18.2 0.454 \n",
218 | "\n",
219 | " Schooling \n",
220 | "0 10.1 \n",
221 | "1 10.0 \n",
222 | "2 9.9 \n",
223 | "3 9.8 \n",
224 | "4 9.5 \n",
225 | "\n",
226 | "[5 rows x 22 columns]"
227 | ]
228 | },
229 | "execution_count": 6,
230 | "metadata": {},
231 | "output_type": "execute_result"
232 | }
233 | ],
234 | "source": [
235 | "df = pd.read_csv('https://raw.githubusercontent.com/myanmards/resource_files/master/regression-one.csv')\n",
236 | "df.head()"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 7,
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "data": {
246 | "text/plain": [
247 | "(1649, 22)"
248 | ]
249 | },
250 | "execution_count": 7,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "df.rename(columns={'Life expectancy': 'life-expect'}, inplace=True)\n",
257 | "df = df.dropna()\n",
258 | "df.shape"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 9,
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "data": {
268 | "text/html": [
269 | "\n",
270 | "\n",
283 | "
\n",
284 | " \n",
285 | " \n",
286 | " \n",
287 | " Country \n",
288 | " Year \n",
289 | " Status \n",
290 | " life-expect \n",
291 | " Adult Mortality \n",
292 | " infant deaths \n",
293 | " Alcohol \n",
294 | " percentage expenditure \n",
295 | " Hepatitis B \n",
296 | " Measles \n",
297 | " ... \n",
298 | " Polio \n",
299 | " Total expenditure \n",
300 | " Diphtheria \n",
301 | " HIV/AIDS \n",
302 | " GDP \n",
303 | " Population \n",
304 | " thinness 1-19 years \n",
305 | " thinness 5-9 years \n",
306 | " Income composition of resources \n",
307 | " Schooling \n",
308 | " \n",
309 | " \n",
310 | " \n",
311 | " \n",
312 | "
\n",
313 | "
0 rows × 22 columns
\n",
314 | "
"
315 | ],
316 | "text/plain": [
317 | "Empty DataFrame\n",
318 | "Columns: [Country, Year, Status, life-expect, Adult Mortality, infant deaths, Alcohol, percentage expenditure, Hepatitis B, Measles, BMI, under-five deaths, Polio, Total expenditure, Diphtheria, HIV/AIDS, GDP, Population, thinness 1-19 years, thinness 5-9 years, Income composition of resources, Schooling]\n",
319 | "Index: []\n",
320 | "\n",
321 | "[0 rows x 22 columns]"
322 | ]
323 | },
324 | "execution_count": 9,
325 | "metadata": {},
326 | "output_type": "execute_result"
327 | }
328 | ],
329 | "source": [
330 | "df[df.isnull().any(axis=1)]"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 11,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "name": "stdout",
340 | "output_type": "stream",
341 | "text": [
342 | "Collecting sklearn\n",
343 | " Downloading sklearn-0.0.tar.gz (1.1 kB)\n",
344 | "Collecting scikit-learn\n",
345 | " Downloading scikit_learn-0.24.0-cp39-cp39-win_amd64.whl (6.9 MB)\n",
346 | "Requirement already satisfied: numpy>=1.13.3 in c:\\users\\zinmy\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from scikit-learn->sklearn) (1.19.4)\n",
347 | "Collecting joblib>=0.11\n",
348 | " Downloading joblib-1.0.0-py3-none-any.whl (302 kB)\n",
349 | "Collecting scipy>=0.19.1\n",
350 | " Downloading scipy-1.5.4-cp39-cp39-win_amd64.whl (31.4 MB)\n",
351 | "Collecting threadpoolctl>=2.0.0\n",
352 | " Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)\n",
353 | "Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.\n",
354 | "Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn, sklearn\n",
355 | " Running setup.py install for sklearn: started\n",
356 | " Running setup.py install for sklearn: finished with status 'done'\n",
357 | "Successfully installed joblib-1.0.0 scikit-learn-0.24.0 scipy-1.5.4 sklearn-0.0 threadpoolctl-2.1.0\n"
358 | ]
359 | }
360 | ],
361 | "source": [
362 | "%pip install scikit-learn"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 12,
368 | "metadata": {},
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/plain": [
373 | "LinearRegression()"
374 | ]
375 | },
376 | "execution_count": 12,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "from sklearn.linear_model import LinearRegression\n",
383 | "\n",
384 | "slr = LinearRegression()\n",
385 | "SLRX = df[['Schooling']]\n",
386 | "Y = df['life-expect']\n",
387 | "slr.fit(SLRX, Y)"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 13,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "SLRYhat = slr.predict(SLRX)"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 14,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | "[64.6771856 64.44820695 64.2192283 63.99024965]\n"
409 | ]
410 | }
411 | ],
412 | "source": [
413 | "print(SLRYhat[0:4])"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 16,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "name": "stdout",
423 | "output_type": "stream",
424 | "text": [
425 | "MSE for Simple Linear Regression : 36.39144686879598\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "from sklearn.metrics import mean_squared_error\n",
431 | "\n",
432 | "SLR_MSE = mean_squared_error(df['life-expect'], SLRYhat)\n",
433 | "print('MSE for Simple Linear Regression : ', SLR_MSE)"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 17,
439 | "metadata": {},
440 | "outputs": [
441 | {
442 | "name": "stdout",
443 | "output_type": "stream",
444 | "text": [
445 | "R2 for Simple Linear Regression : 0.5294454639356108\n"
446 | ]
447 | }
448 | ],
449 | "source": [
450 | "SLR_R2 = slr.score(SLRX, Y)\n",
451 | "print('R2 for Simple Linear Regression : ', SLR_R2)"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 18,
457 | "metadata": {},
458 | "outputs": [
459 | {
460 | "data": {
461 | "text/plain": [
462 | "LinearRegression()"
463 | ]
464 | },
465 | "execution_count": 18,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "MLRX = df[['GDP', 'Alcohol', 'BMI', 'Schooling']]\n",
472 | "mlr = LinearRegression()\n",
473 | "mlr.fit(MLRX, Y)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 19,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "name": "stdout",
483 | "output_type": "stream",
484 | "text": [
485 | "[64.24861418 64.01295717 63.77621795 63.54168785]\n"
486 | ]
487 | }
488 | ],
489 | "source": [
490 | "MLRYhat = mlr.predict(MLRX)\n",
491 | "print(MLRYhat[0:4])"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 20,
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "name": "stdout",
501 | "output_type": "stream",
502 | "text": [
503 | "MSE for Multiple Linear Regression : 32.66263978796142\n"
504 | ]
505 | }
506 | ],
507 | "source": [
508 | "MLR_MSE = mean_squared_error(df['life-expect'], MLRYhat)\n",
509 | "print('MSE for Multiple Linear Regression : ', MLR_MSE)"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 22,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "name": "stdout",
519 | "output_type": "stream",
520 | "text": [
521 | "R2 for Multiple Linear Regression : 0.5776602846411929\n"
522 | ]
523 | }
524 | ],
525 | "source": [
526 | "MLR_R2 = mlr.score(MLRX, Y)\n",
527 | "print('R2 for Multiple Linear Regression : ', MLR_R2)"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 23,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "PRX = df['Schooling']\n",
537 | "f = np.polyfit(PRX, Y, 3)\n",
538 | "PR = np.poly1d(f)"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": 24,
544 | "metadata": {},
545 | "outputs": [
546 | {
547 | "name": "stdout",
548 | "output_type": "stream",
549 | "text": [
550 | "[63.54170723 63.28970051 63.04117145 62.79627248]\n"
551 | ]
552 | }
553 | ],
554 | "source": [
555 | "PRYhat = PR(PRX)\n",
556 | "print(PRYhat[0:4])"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 25,
562 | "metadata": {},
563 | "outputs": [
564 | {
565 | "name": "stdout",
566 | "output_type": "stream",
567 | "text": [
568 | "MSE for Polynomial Regression : 34.73390295312004\n"
569 | ]
570 | }
571 | ],
572 | "source": [
573 | "PR_MSE = mean_squared_error(df['life-expect'], PRYhat)\n",
574 | "print('MSE for Polynomial Regression : ', PR_MSE)"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 26,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "name": "stdout",
584 | "output_type": "stream",
585 | "text": [
586 | "R2 for Polynomial Regression : 0.5508781047168166\n"
587 | ]
588 | }
589 | ],
590 | "source": [
591 | "from sklearn.metrics import r2_score\n",
592 | "PR_R2 = r2_score(df['life-expect'], PRYhat)\n",
593 | "print('R2 for Polynomial Regression : ', PR_R2)"
594 | ]
595 | },
596 | {
597 | "cell_type": "code",
598 | "execution_count": 27,
599 | "metadata": {},
600 | "outputs": [
601 | {
602 | "name": "stdout",
603 | "output_type": "stream",
604 | "text": [
605 | "Simple Linear Regression\n",
606 | "MSE for Simple Linear Regression : 36.39144686879598\n",
607 | "R2 for Simple Linear Regression : 0.5294454639356108\n"
608 | ]
609 | }
610 | ],
611 | "source": [
612 | "print('Simple Linear Regression')\n",
613 | "print('MSE for Simple Linear Regression : ', SLR_MSE)\n",
614 | "print('R2 for Simple Linear Regression : ', SLR_R2)"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": 28,
620 | "metadata": {},
621 | "outputs": [
622 | {
623 | "name": "stdout",
624 | "output_type": "stream",
625 | "text": [
626 | "Multiple Linear Regression\n",
627 | "MSE for Multiple Linear Regression : 32.66263978796142\n",
628 | "R2 for Multiple Linear Regression : 0.5776602846411929\n"
629 | ]
630 | }
631 | ],
632 | "source": [
633 | "print('Multiple Linear Regression')\n",
634 | "print('MSE for Multiple Linear Regression : ', MLR_MSE)\n",
635 | "print('R2 for Multiple Linear Regression : ', MLR_R2)"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 29,
641 | "metadata": {},
642 | "outputs": [
643 | {
644 | "name": "stdout",
645 | "output_type": "stream",
646 | "text": [
647 | "Polynomial Regression\n",
648 | "MSE for Polynomial Regression : 34.73390295312004\n",
649 | "R2 for Polynomial Regression : 0.5508781047168166\n"
650 | ]
651 | }
652 | ],
653 | "source": [
654 | "print('Polynomial Regression')\n",
655 | "print('MSE for Polynomial Regression : ', PR_MSE)\n",
656 | "print('R2 for Polynomial Regression : ', PR_R2)"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": null,
662 | "metadata": {},
663 | "outputs": [],
664 | "source": []
665 | }
666 | ],
667 | "metadata": {
668 | "kernelspec": {
669 | "display_name": "Python 3",
670 | "language": "python",
671 | "name": "python3"
672 | },
673 | "language_info": {
674 | "codemirror_mode": {
675 | "name": "ipython",
676 | "version": 3
677 | },
678 | "file_extension": ".py",
679 | "mimetype": "text/x-python",
680 | "name": "python",
681 | "nbconvert_exporter": "python",
682 | "pygments_lexer": "ipython3",
683 | "version": "3.9.1"
684 | }
685 | },
686 | "nbformat": 4,
687 | "nbformat_minor": 4
688 | }
689 |
--------------------------------------------------------------------------------
/P09-LogisticRegression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Practical - Logistic Regression\n",
8 | "\n",
9 | "Please refer to the two parts of the lecture before proceeding to this lab session.<br>\n",
10 | "Part 1 - https://www.youtube.com/watch?v=jLe1nILDRbU<br>\n",
11 | "Part 2 - https://www.youtube.com/watch?v=wD5wJvwVohU<br>\n",
12 | "For those who would like to refresh their Python skills, we recommend our \"Programming for Data Science Series\", which covers almost all aspects of Python programming in the data science domain. Refer to the URL below for the full playlist of almost 10 hours of video lessons in the Burmese language. URL: https://www.youtube.com/watch?v=jOZNjVVZIVs&list=PLD_eiqVVLZDi9GZZJDC8Zx4-3Np8LHs52"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/html": [
23 | "\n",
24 | "\n",
37 | "
\n",
38 | " \n",
39 | " \n",
40 | " \n",
41 | " age \n",
42 | " education \n",
43 | " experience \n",
44 | " selected \n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " \n",
49 | " 0 \n",
50 | " 24 \n",
51 | " 1.0 \n",
52 | " 2 \n",
53 | " 0 \n",
54 | " \n",
55 | " \n",
56 | " 1 \n",
57 | " 35 \n",
58 | " 3.5 \n",
59 | " 8 \n",
60 | " 1 \n",
61 | " \n",
62 | " \n",
63 | " 2 \n",
64 | " 23 \n",
65 | " 1.0 \n",
66 | " 1 \n",
67 | " 0 \n",
68 | " \n",
69 | " \n",
70 | " 3 \n",
71 | " 35 \n",
72 | " 3.0 \n",
73 | " 7 \n",
74 | " 1 \n",
75 | " \n",
76 | " \n",
77 | " 4 \n",
78 | " 45 \n",
79 | " 1.0 \n",
80 | " 10 \n",
81 | " 0 \n",
82 | " \n",
83 | " \n",
84 | "
\n",
85 | "
"
86 | ],
87 | "text/plain": [
88 | " age education experience selected\n",
89 | "0 24 1.0 2 0\n",
90 | "1 35 3.5 8 1\n",
91 | "2 23 1.0 1 0\n",
92 | "3 35 3.0 7 1\n",
93 | "4 45 1.0 10 0"
94 | ]
95 | },
96 | "execution_count": 1,
97 | "metadata": {},
98 | "output_type": "execute_result"
99 | }
100 | ],
101 | "source": [
102 | "import pandas as pd\n",
103 | "from sklearn.model_selection import train_test_split\n",
104 | "from sklearn.linear_model import LogisticRegression\n",
105 | "from sklearn import metrics\n",
106 | "import seaborn as sn\n",
107 | "\n",
108 | "df = pd.read_csv('https://raw.githubusercontent.com/myanmards/resource_files/master/job-fit.csv')\n",
109 | "df.head()"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 2,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "X = df[['age', 'education', 'experience']]\n",
119 | "y = df['selected']"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 3,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.25, random_state=0)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 4,
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/plain": [
139 | "LogisticRegression()"
140 | ]
141 | },
142 | "execution_count": 4,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "LR = LogisticRegression()\n",
149 | "LR.fit(X_train, y_train)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 5,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1], dtype=int64)"
161 | ]
162 | },
163 | "execution_count": 5,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "y_pred = LR.predict(X_test)\n",
170 | "y_pred"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 6,
176 | "metadata": {},
177 | "outputs": [
178 | {
179 | "data": {
180 | "text/plain": [
181 | ""
182 | ]
183 | },
184 | "execution_count": 6,
185 | "metadata": {},
186 | "output_type": "execute_result"
187 | },
188 | {
189 | "data": {
190 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWIAAAEKCAYAAAAo+19NAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAARFElEQVR4nO3deZClZXXH8e9v2JdhHTYBAZElaCmYiRExCC7IIrE0VgQilTLEBqIoxFQgJlUWxlRZiRoJ4tKAA6iAE5CUkhJRhAwkggw6UMC4UIiRxQICKhAidPfJH31HW5y5c3u4t5/bl+9n6qnp+953OQNTh8N5n+d9U1VIktpZ0DoASXquMxFLUmMmYklqzEQsSY2ZiCWpMROxJDVmIpakPkuyVZLLknwvycokB3bbf/25CkySnkPOAq6qqrcm2RDYtNvOcUGHJPVPki2BFcALqscEO7QV8ZPn/5X/hdBvWXjyJa1D0BCaeOq+PNtzPP3w3T3nnA232/NEYGzGpvGqGu/8vAfwELAkyUuBW4D3VtUTazqfPWJJmqWqGq+qxTPG+Iyv1wdeBnyqqg4AngDO6HY+E7EkAUxN9j66uxe4t6pu6ny+jOnEvEZD25qQpDk1OdGX01TVT5P8JMk+VfV94LXAnd2OMRFLElA11c/TnQJ8oTNj4m7gHd12NhFLEsBU/xJxVa0AFve6v4lYkgD6WxHPiolYkqCXm3ADYyKWJLAilqTWqk+zJtaFiViSoK8362bLRCxJYGtCkprzZp0kNWZFLEmNebNOkhrzZp0ktVVlj1iS2rJHLEmN2ZqQpMasiCWpscmnm13aRCxJYGtCkpqzNSFJjVkRS1JjJmJJaqu8WSdJjdkjlqTGbE1IUmNWxJLUmBWxJDVmRSxJjU34YHhJasuKWJIas0csSY1ZEUtSY1bEktSYFbEkNeasCUlqrKrZpU3EkgT2iCWpOROxJDXWx5t1Se4BHgMmgYmqWtxtfxOxJAFMTvb7jIdW1cO97GgiliRo2ppY0OzKkjRMpqZ6HknGkiyfMcaecbYCrk5yy2q++y1WxJIEs+oRV9U4MN5ll1dV1X1Jtge+nuR7VbVsTTtbEUsSUFPV81jruaru6/z+IHAF8PJu+5uIJQlm1ZroJslmSRau+hk4DLi92zG2JiQJ+jlrYgfgiiQwnWMvrqqruh1gIpYk6Nusiaq6G3jpbI4xEUsSuLJOa3bEp7/BZhuuz4IFYf2Ei//04NYhqbFzxz/KUUe+jgcfepj9D3ht63BGhw/9UTfnHnMgW2+6UeswNCQuumgpn/zkEpYsOat1KKNlFCviJPsCbwJ27my6D/hyVa0c1DWl54Lrb7iJ3XbbpXUYo6eHaWmDMpDpa0lOBy4FAny7MwJckuSMQVxzVCVw8tIbOfbCZVy24setw5FG1+Rk76PPBlURnwC8qKqenrkxyceAO4APr+6gzlLAMYCzj389J7z6JQMKb/5YctxB7LBwEx554pectPRG9th2c353121bhyWNnBrBZ01MAc9bzfadOt+tVlWNV9XiqlpsEp62w8JNANhms404dK8duf2Bn7UNSBpVU9X76LNBVcSnAtck+SHwk8625wMvBN49oGuOnCefmmCqYLON1ufJpyb41j0PceIr924dljSaRu3loVV1VZK9mV5fPfNm3c1V1f8Gy4j6n//9JX95xXIAJqamOGK/nTnoBds3jkqtff5z5/Dqgw9k0aJtuOfu5Zz5wY+w5IJLW4c1/zW8WTewWRNVNQXcOKjzPxfsstVmLH3Hq1uHoSHz9uPf1TqE0TTRrkZ0HrEkwei1JiRp3hnF1oQkzSctp6+ZiCUJrIglqTkTsSQ1NoCly70yEUsS9PQuukExEUsS2JqQpOacNSFJjVkRS1JjJmJJaqsmbU1IUltWxJLUltPXJKk1E7EkNdauRWwiliSAmvBmnSS1ZUUsSW15s06SWrMilqS2rIglqTUrYklqqyba
XdtELElANayIF7S7tCQNkalZjB4kWS/Jd5NcubZ9rYgliYFUxO8FVgJbrG1HK2JJYjoR9zrWJskuwFHAeb1c20QsSUBNpueRZCzJ8hlj7Bmn+zjw1/TYyLA1IUnMrjVRVePA+Oq+S/JG4MGquiXJIb2cz0QsSUBNpV+nOgj4wyRHAhsDWyT5fFW9fU0H2JqQJPrXI66qv6mqXapqd+AY4JvdkjBYEUsSAFV9q4hnzUQsSQxmQUdVXQdct7b9TMSSBExNWhFLUlN9vFk3ayZiScJELEnNVbvHEa85ESc5G1hjaFX1noFEJEkNDGtFvHzOopCkxoZy+lpVXTiXgUhSS5PDPGsiyXbA6cB+TC/XA6CqXjPAuCRpTrWsiHtZ4vwFpp+puQdwJnAPcPMAY5KkOVdT6Xn0Wy+JeNuqOh94uqr+o6r+DLAaljRSqnof/dbL9LWnO78/kOQo4H5gm/6HIkntDOusiVU+lGRL4H3A2Uy/9uO0gUYlSXNscqrdwyjXmoiratWL734OHDrYcCSpjaFc0LFKkiWsZmFHp1csSSNhahjnEc8w81XQGwNvZrpPLEkjYygXdKxSVZfP/JzkEuCGgUUkSQ0MdWtiNfYCtu93IM+08ORLBn0JzUNP3n996xA0ooa6NZHkMX6zR/xTplfaSdLIGPZZEwvnIhBJaqlhZ2LtK+uSXNPLNkmaz6YqPY9+6/Y84o2BTYFFSbYGVl19C2DnvkciSQ0N66yJE4FTgecBt/DrRPwL4BODDUuS5tYAXuLcs27PIz4LOCvJKVV19hzGJElzrhjux2BOJdlq1YckWyf5i8GFJElzb6LS8+i3XhLxO6vqZ6s+VNWjwDv7HokkNVSk59FvvSzoWC9JqqbXnSRZD9iw75FIUkND2SOe4Srgi0k+0/l8IvDVwYUkSXOvZY+4l0R8OjAGnNT5fBuw48AikqQGhroirqqpJDcBewJ/DCwCLu9+lCTNL5PDWBEn2Rs4tjMeBr4IUFU+HF7SyGn4pqSuFfH3gOuBN1bVXQBJfEWSpJE0NaTziN8CPABcm+TcJK+FhpFK0gDVLEa/rTERV9W/VdUxwL7AtUwvd94+yaeSHDaAWCSpmalZjH5b64KOqnqiqi6uqqOBXYDv4vOIJY2YqaTn0W+zekNHZ1XdeGdI0siYbHjtdXlVkiSNnH7Nmug8QngZsBHTOfayqvpAt2NMxJJEX2dN/BJ4TVU9nmQD4IYkX62qG9d0gIlYkujfbIjOc3ke73zcoDO6nr7d2/IkaYhMpfeRZCzJ8hljbOa5kqyXZAXwIPD1qrqp27WtiCWJ2U1Lq6qukxaqahLYv/Ms9yuSvLiqbl/T/lbEkgRMpvfRq86z3K8FDu+2n4lYkujfgo4k2616q1GSTYDXM/3IiDWyNSFJ9HXF3E7AhZ2XaCwAllbVld0OMBFLEtCvV9FV1W3AAbM5xkQsSQz5g+El6bnAJc6S1NiwPhhekp4zbE1IUmMmYklqbBBv3uiViViSsEcsSc05a0KSGptq2JwwEUsS3qyTpOa8WSdJjVkRS1JjE7FHLElN2ZqQpMZsTUhSY05fk6TGbE1IUmO2JiSpsUlbE5LUlhWxJDVWVsSS1FbLinhBw2trLc4d/yj333srK757TetQNGR+8djjnPa3H+LoY9/J0ceNseL2la1DmvemqJ5Hv5mIh9hFFy3lqDf+SeswNIQ+/PFPc9DvL+Yrl5zLly48hxfstmvrkOa9msXoNxPxELv+hpt45NGftQ5DQ+axx5/glltv54+OfgMAG2ywAVss3LxxVPPfBNXz6Dd7xNI8c9/9P2Xrrbbk7/7hY3z/rrvZb5+9OOPUk9h0k41bhzavtbxZN+cVcZJ3dPluLMnyJMunpp6Yy7CkeWNicpKVP7iLt735KC674Bw22WRjzv/c0tZhzXtTsxj91qI1ceaavqiq8apaXFWLFyzYbC5jkuaNHbdfxA7bLeIlL9oXgMMOeRV3/uCuxlHNfzWL
X/02kNZEktvW9BWwwyCuKT1XLNp2G3bcfjt+9ON72WO3XbjxlhXsufvzW4c1743igo4dgDcAjz5je4D/GtA1R87nP3cOrz74QBYt2oZ77l7OmR/8CEsuuLR1WBoC7z/tZE4/8x95euJpdn3eTvz9+09rHdK8N1mjt6DjSmDzqlrxzC+SXDega46ctx//rtYhaEjtu/eeLP3sv7QOY6SM3GMwq+qELt8dN4hrStKz4RJnSWpsFHvEkjSvtGxNuLJOkujf9LUkuya5NsmdSe5I8t61XduKWJLo66yJCeB9VfWdJAuBW5J8varuXNMBJmJJon+tiap6AHig8/NjSVYCOwNrTMS2JiSJ2S1xnvk4hs4YW905k+wOHADc1O3aVsSSxOymr1XVODDebZ8kmwOXA6dW1S+67WsiliT6O2siyQZMJ+EvVNWX1ra/iViSgOrTzbokAc4HVlbVx3o5xh6xJAGTVM9jLQ4Cjgdek2RFZxzZ7QArYkmir7MmbmD6AWc9MxFLEv1rTawLE7EkMYJPX5Ok+canr0lSY6P4YHhJmldsTUhSYyZiSWrMWROS1JgVsSQ15qwJSWpsstq9tc5ELEnYI5ak5uwRS1Jj9oglqbEpWxOS1JYVsSQ15qwJSWrM1oQkNWZrQpIasyKWpMasiCWpscmabHZtE7Ek4RJnSWrOJc6S1JgVsSQ15qwJSWrMWROS1JhLnCWpMXvEktSYPWJJasyKWJIacx6xJDVmRSxJjTlrQpIa82adJDXWsjWxoNmVJWmI1Cx+rU2SzyZ5MMntvVzbRCxJTFfEvY4eXAAc3uu1bU1IEv3tEVfVsiS797r/0CbiiafuS+sYhkWSsaoabx2Hhot/L/prNjknyRgwNmPT+LP5d5GWDWr1JsnyqlrcOg4NF/9eDLdORXxlVb14bfvaI5akxkzEktSYiXh+sA+o1fHvxZBKcgnwLWCfJPcmOaHr/vaIJaktK2JJasxELEmNmYiHXJLDk3w/yV1Jzmgdj9qb7fJZDT8T8RBLsh5wDnAEsB9wbJL92kalIXABs1g+q+FnIh5uLwfuqqq7q+op4FLgTY1jUmNVtQx4pHUc6h8T8XDbGfjJjM/3drZJGiEmYklqzEQ83O4Ddp3xeZfONkkjxEQ83G4G9kqyR5INgWOALzeOSVKfmYiHWFVNAO8GvgasBJZW1R1to1Jrs10+q+HnEmdJasyKWJIaMxFLUmMmYklqzEQsSY2ZiCWpMROxBiLJZJIVSW5P8q9JNn0W57ogyVs7P5/X7cFHSQ5J8sp1uMY9SRata4zSs2Ei1qA8WVX7d95g+xRw0swvk6y/Lietqj+vqju77HIIMOtELLVkItZcuB54YadavT7Jl4E7k6yX5J+S3JzktiQnAmTaJzrPYf4GsP2qEyW5Lsnizs+HJ/lOkluTXNN5fflJwGmdavwPkmyX5PLONW5OclDn2G2TXJ3kjiTnAZnjfybSr6xTVSL1qlP5HgFc1dn0MuDFVfWjJGPAz6vq95JsBPxnkquBA4B9mH4G8w7AncBnn3He7YBzgYM759qmqh5J8mng8ar6SGe/i4F/rqobkjyf6VWKvwN8ALihqj6Y5CjA1WlqxkSsQdkkyYrOz9cD5zPdMvh2Vf2os/0w4CWr+r/AlsBewMHAJVU1Cdyf5JurOf8rgGWrzlVVa3o+7+uA/ZJfFbxbJNm8c423dI799ySPrtsfU3r2TMQalCerav+ZGzrJ8ImZm4BTquprz9jvyD7GsQB4RVX932pikYaCPWK19DXg5CQbACTZO8lmwDLgbZ0e8k7Aoas59kbg4CR7dI7dprP9MWDhjP2uBk5Z9SHJ/p0flwHHdbYdAWzdrz+UNFsmYrV0HtP93+90XoT5Gab/L+0K4Ied7y5i+kljv6GqHgLGgC8luRX4YuerrwBvXnWzDngPsLhzM/BOfj1740ymE/kdTLco/ntAf0ZprXz6miQ1ZkUsSY2ZiCWpMROxJDVmIpakxkzE
ktSYiViSGjMRS1Jj/w/ny0nOoEIk5AAAAABJRU5ErkJggg==\n",
191 | "text/plain": [
192 | ""
193 | ]
194 | },
195 | "metadata": {
196 | "needs_background": "light"
197 | },
198 | "output_type": "display_data"
199 | }
200 | ],
201 | "source": [
202 | "confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])\n",
203 | "sn.heatmap(confusion_matrix, annot=True)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 7,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "name": "stdout",
213 | "output_type": "stream",
214 | "text": [
215 | "Accuracy Score : 0.8461538461538461\n"
216 | ]
217 | }
218 | ],
219 | "source": [
220 | "print('Accuracy Score : ', metrics.accuracy_score(y_test, y_pred))"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 10,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "data": {
230 | "text/html": [
231 | "\n",
232 | "\n",
245 | "
\n",
246 | " \n",
247 | " \n",
248 | " \n",
249 | " age \n",
250 | " education \n",
251 | " experience \n",
252 | " \n",
253 | " \n",
254 | " \n",
255 | " \n",
256 | " 0 \n",
257 | " 25 \n",
258 | " 3 \n",
259 | " 5 \n",
260 | " \n",
261 | " \n",
262 | "
\n",
263 | "
"
264 | ],
265 | "text/plain": [
266 | " age education experience\n",
267 | "0 25 3 5"
268 | ]
269 | },
270 | "execution_count": 10,
271 | "metadata": {},
272 | "output_type": "execute_result"
273 | }
274 | ],
275 | "source": [
276 | "new_candidate = {\n",
277 | " 'age': [25],\n",
278 | " 'education' : [3],\n",
279 | " 'experience' : [5]\n",
280 | "}\n",
281 | "\n",
282 | "new_df = pd.DataFrame(new_candidate, columns=['age', 'education', 'experience'])\n",
283 | "new_df"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 11,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "name": "stdout",
293 | "output_type": "stream",
294 | "text": [
295 | "[1]\n"
296 | ]
297 | }
298 | ],
299 | "source": [
300 | "y_new_pred = LR.predict(new_df)\n",
301 | "print(y_new_pred)"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {},
308 | "outputs": [],
309 | "source": []
310 | }
311 | ],
312 | "metadata": {
313 | "kernelspec": {
314 | "display_name": "Python 3",
315 | "language": "python",
316 | "name": "python3"
317 | },
318 | "language_info": {
319 | "codemirror_mode": {
320 | "name": "ipython",
321 | "version": 3
322 | },
323 | "file_extension": ".py",
324 | "mimetype": "text/x-python",
325 | "name": "python",
326 | "nbconvert_exporter": "python",
327 | "pygments_lexer": "ipython3",
328 | "version": "3.9.1"
329 | }
330 | },
331 | "nbformat": 4,
332 | "nbformat_minor": 4
333 | }
334 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # data-analysis
--------------------------------------------------------------------------------