├── Lecture_1.ipynb
├── Lecture_2.ipynb
├── Lecture_4.ipynb
├── Lecture_5.ipynb
├── Lecture_6.ipynb
├── Lecture_7.ipynb
├── Lecture_8.ipynb
├── README.md
├── bmw.csv
├── friends.xlsx
├── gre.csv
├── most_runs_in_test_cricket.csv
├── most_runs_in_test_cricket.txt
├── test_cricket.xlsx
└── wickets.csv
/Lecture_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Analysis with Python\n",
8 | "## Lecture 01: Importing Data with Pandas\n",
9 | "
Instructor: Md Shahidullah Kawsar\n",
10 | "
Data Scientist, IDARE, Houston, TX, USA\n",
11 | "\n",
12 | "**Objectives:**\n",
13 | "- challenges of reading a .csv file\n",
14 | "- How to deal with UnicodeDecodeError?\n",
15 | "- reading a csv file by changing the engine\n",
16 | "- choosing columns by name when reading a csv file\n",
17 | "- choosing columns by position when reading a csv file\n",
18 | "- reading only the first n rows\n",
19 | "\n",
20 | "**References:**\n",
21 | "
[1] Data Source: https://stats.espncricinfo.com/ci/content/records/223646.html\n",
22 | "
[2] https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html\n",
23 | "
[3] https://docs.python.org/3/library/codecs.html#standard-encodings\n",
24 | "
[4] https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "#### Import required libraries"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 49,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import pandas as pd\n",
41 | "import numpy as np"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "#### How to read a csv file?\n",
49 | "#### How to deal with UnicodeDecodeError?"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 50,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/html": [
60 | "
\n",
61 | "\n",
74 | "
\n",
75 | " \n",
76 | " \n",
77 | " | \n",
78 | " Player | \n",
79 | " Span | \n",
80 | " Mat | \n",
81 | " Inns | \n",
82 | " NO | \n",
83 | " Runs | \n",
84 | " HS | \n",
85 | " Ave | \n",
86 | " 100 | \n",
87 | " 50 | \n",
88 | " 0 | \n",
89 | "
\n",
90 | " \n",
91 | " \n",
92 | " \n",
93 | " 0 | \n",
94 | " SR Tendulkar (INDIA) | \n",
95 | " 1989-2013 | \n",
96 | " 200 | \n",
97 | " 329 | \n",
98 | " 33 | \n",
99 | " 15921 | \n",
100 | " 248* | \n",
101 | " 53.78 | \n",
102 | " 51 | \n",
103 | " 68 | \n",
104 | " 14 | \n",
105 | "
\n",
106 | " \n",
107 | " 1 | \n",
108 | " RT Ponting (AUS) | \n",
109 | " 1995-2012 | \n",
110 | " 168 | \n",
111 | " 287 | \n",
112 | " 29 | \n",
113 | " 13378 | \n",
114 | " 257 | \n",
115 | " 51.85 | \n",
116 | " 41 | \n",
117 | " 62 | \n",
118 | " 17 | \n",
119 | "
\n",
120 | " \n",
121 | " 2 | \n",
122 | " JH Kallis (ICC/SA) | \n",
123 | " 1995-2013 | \n",
124 | " 166 | \n",
125 | " 280 | \n",
126 | " 40 | \n",
127 | " 13289 | \n",
128 | " 224 | \n",
129 | " 55.37 | \n",
130 | " 45 | \n",
131 | " 58 | \n",
132 | " 16 | \n",
133 | "
\n",
134 | " \n",
135 | " 3 | \n",
136 | " R Dravid (ICC/INDIA) | \n",
137 | " 1996-2012 | \n",
138 | " 164 | \n",
139 | " 286 | \n",
140 | " 32 | \n",
141 | " 13288 | \n",
142 | " 270 | \n",
143 | " 52.31 | \n",
144 | " 36 | \n",
145 | " 63 | \n",
146 | " 8 | \n",
147 | "
\n",
148 | " \n",
149 | " 4 | \n",
150 | " AN Cook (ENG) | \n",
151 | " 2006-2018 | \n",
152 | " 161 | \n",
153 | " 291 | \n",
154 | " 16 | \n",
155 | " 12472 | \n",
156 | " 294 | \n",
157 | " 45.35 | \n",
158 | " 33 | \n",
159 | " 57 | \n",
160 | " 9 | \n",
161 | "
\n",
162 | " \n",
163 | "
\n",
164 | "
"
165 | ],
166 | "text/plain": [
167 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n",
168 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n",
169 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n",
170 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n",
171 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n",
172 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n",
173 | "\n",
174 | " 50 0 \n",
175 | "0 68 14 \n",
176 | "1 62 17 \n",
177 | "2 58 16 \n",
178 | "3 63 8 \n",
179 | "4 57 9 "
180 | ]
181 | },
182 | "metadata": {},
183 | "output_type": "display_data"
184 | },
185 | {
186 | "name": "stdout",
187 | "output_type": "stream",
188 | "text": [
189 | "(97, 11)\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "# method 1\n",
195 | "# reading a csv file with an explicit encoding\n",
196 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = \"ISO-8859-1\")\n",
197 | "\n",
198 | "display(df.head())\n",
199 | "# print(df.tail())\n",
200 | "print(df.shape)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 51,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "data": {
210 | "text/html": [
211 | "\n",
212 | "\n",
225 | "
\n",
226 | " \n",
227 | " \n",
228 | " | \n",
229 | " Player | \n",
230 | " Span | \n",
231 | " Mat | \n",
232 | " Inns | \n",
233 | " NO | \n",
234 | " Runs | \n",
235 | " HS | \n",
236 | " Ave | \n",
237 | " 100 | \n",
238 | " 50 | \n",
239 | " 0 | \n",
240 | "
\n",
241 | " \n",
242 | " \n",
243 | " \n",
244 | " 0 | \n",
245 | " SR Tendulkar (INDIA) | \n",
246 | " 1989-2013 | \n",
247 | " 200 | \n",
248 | " 329 | \n",
249 | " 33 | \n",
250 | " 15921 | \n",
251 | " 248* | \n",
252 | " 53.78 | \n",
253 | " 51 | \n",
254 | " 68 | \n",
255 | " 14 | \n",
256 | "
\n",
257 | " \n",
258 | " 1 | \n",
259 | " RT Ponting (AUS) | \n",
260 | " 1995-2012 | \n",
261 | " 168 | \n",
262 | " 287 | \n",
263 | " 29 | \n",
264 | " 13378 | \n",
265 | " 257 | \n",
266 | " 51.85 | \n",
267 | " 41 | \n",
268 | " 62 | \n",
269 | " 17 | \n",
270 | "
\n",
271 | " \n",
272 | " 2 | \n",
273 | " JH Kallis (ICC/SA) | \n",
274 | " 1995-2013 | \n",
275 | " 166 | \n",
276 | " 280 | \n",
277 | " 40 | \n",
278 | " 13289 | \n",
279 | " 224 | \n",
280 | " 55.37 | \n",
281 | " 45 | \n",
282 | " 58 | \n",
283 | " 16 | \n",
284 | "
\n",
285 | " \n",
286 | " 3 | \n",
287 | " R Dravid (ICC/INDIA) | \n",
288 | " 1996-2012 | \n",
289 | " 164 | \n",
290 | " 286 | \n",
291 | " 32 | \n",
292 | " 13288 | \n",
293 | " 270 | \n",
294 | " 52.31 | \n",
295 | " 36 | \n",
296 | " 63 | \n",
297 | " 8 | \n",
298 | "
\n",
299 | " \n",
300 | " 4 | \n",
301 | " AN Cook (ENG) | \n",
302 | " 2006-2018 | \n",
303 | " 161 | \n",
304 | " 291 | \n",
305 | " 16 | \n",
306 | " 12472 | \n",
307 | " 294 | \n",
308 | " 45.35 | \n",
309 | " 33 | \n",
310 | " 57 | \n",
311 | " 9 | \n",
312 | "
\n",
313 | " \n",
314 | "
\n",
315 | "
"
316 | ],
317 | "text/plain": [
318 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n",
319 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n",
320 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n",
321 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n",
322 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n",
323 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n",
324 | "\n",
325 | " 50 0 \n",
326 | "0 68 14 \n",
327 | "1 62 17 \n",
328 | "2 58 16 \n",
329 | "3 63 8 \n",
330 | "4 57 9 "
331 | ]
332 | },
333 | "metadata": {},
334 | "output_type": "display_data"
335 | }
336 | ],
337 | "source": [
338 | "# method 2\n",
339 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape')\n",
340 | "\n",
341 | "display(df.head())"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 52,
347 | "metadata": {},
348 | "outputs": [
349 | {
350 | "data": {
351 | "text/html": [
352 | "\n",
353 | "\n",
366 | "
\n",
367 | " \n",
368 | " \n",
369 | " | \n",
370 | " Player | \n",
371 | " Span | \n",
372 | " Mat | \n",
373 | " Inns | \n",
374 | " NO | \n",
375 | " Runs | \n",
376 | " HS | \n",
377 | " Ave | \n",
378 | " 100 | \n",
379 | " 50 | \n",
380 | " 0 | \n",
381 | "
\n",
382 | " \n",
383 | " \n",
384 | " \n",
385 | " 0 | \n",
386 | " SR Tendulkar (INDIA) | \n",
387 | " 1989-2013 | \n",
388 | " 200 | \n",
389 | " 329 | \n",
390 | " 33 | \n",
391 | " 15921 | \n",
392 | " 248* | \n",
393 | " 53.78 | \n",
394 | " 51 | \n",
395 | " 68 | \n",
396 | " 14 | \n",
397 | "
\n",
398 | " \n",
399 | " 1 | \n",
400 | " RT Ponting (AUS) | \n",
401 | " 1995-2012 | \n",
402 | " 168 | \n",
403 | " 287 | \n",
404 | " 29 | \n",
405 | " 13378 | \n",
406 | " 257 | \n",
407 | " 51.85 | \n",
408 | " 41 | \n",
409 | " 62 | \n",
410 | " 17 | \n",
411 | "
\n",
412 | " \n",
413 | " 2 | \n",
414 | " JH Kallis (ICC/SA) | \n",
415 | " 1995-2013 | \n",
416 | " 166 | \n",
417 | " 280 | \n",
418 | " 40 | \n",
419 | " 13289 | \n",
420 | " 224 | \n",
421 | " 55.37 | \n",
422 | " 45 | \n",
423 | " 58 | \n",
424 | " 16 | \n",
425 | "
\n",
426 | " \n",
427 | " 3 | \n",
428 | " R Dravid (ICC/INDIA) | \n",
429 | " 1996-2012 | \n",
430 | " 164 | \n",
431 | " 286 | \n",
432 | " 32 | \n",
433 | " 13288 | \n",
434 | " 270 | \n",
435 | " 52.31 | \n",
436 | " 36 | \n",
437 | " 63 | \n",
438 | " 8 | \n",
439 | "
\n",
440 | " \n",
441 | " 4 | \n",
442 | " AN Cook (ENG) | \n",
443 | " 2006-2018 | \n",
444 | " 161 | \n",
445 | " 291 | \n",
446 | " 16 | \n",
447 | " 12472 | \n",
448 | " 294 | \n",
449 | " 45.35 | \n",
450 | " 33 | \n",
451 | " 57 | \n",
452 | " 9 | \n",
453 | "
\n",
454 | " \n",
455 | "
\n",
456 | "
"
457 | ],
458 | "text/plain": [
459 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n",
460 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n",
461 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n",
462 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n",
463 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n",
464 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n",
465 | "\n",
466 | " 50 0 \n",
467 | "0 68 14 \n",
468 | "1 62 17 \n",
469 | "2 58 16 \n",
470 | "3 63 8 \n",
471 | "4 57 9 "
472 | ]
473 | },
474 | "metadata": {},
475 | "output_type": "display_data"
476 | }
477 | ],
478 | "source": [
479 | "# method 3\n",
480 | "# reading a csv file by changing the engine\n",
481 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", engine = 'python')\n",
482 | "\n",
483 | "# replacing the stray \"�\" symbol in the 'Player' column with a space\n",
484 | "df['Player'] = df['Player'].str.replace(\"�\", \" \")\n",
485 | "display(df.head())"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 53,
491 | "metadata": {},
492 | "outputs": [
493 | {
494 | "name": "stdout",
495 | "output_type": "stream",
496 | "text": [
497 | "number of rows = 97\n",
498 | "number of columns = 11\n"
499 | ]
500 | }
501 | ],
502 | "source": [
503 | "# number of rows\n",
504 | "print(\"number of rows = \", df.shape[0])\n",
505 | "\n",
506 | "# number of columns\n",
507 | "print(\"number of columns = \", df.shape[1])"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 54,
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "\n",
520 | "RangeIndex: 97 entries, 0 to 96\n",
521 | "Data columns (total 11 columns):\n",
522 | " # Column Non-Null Count Dtype \n",
523 | "--- ------ -------------- ----- \n",
524 | " 0 Player 97 non-null object \n",
525 | " 1 Span 97 non-null object \n",
526 | " 2 Mat 97 non-null int64 \n",
527 | " 3 Inns 97 non-null int64 \n",
528 | " 4 NO 97 non-null int64 \n",
529 | " 5 Runs 97 non-null int64 \n",
530 | " 6 HS 97 non-null object \n",
531 | " 7 Ave 97 non-null float64\n",
532 | " 8 100 97 non-null int64 \n",
533 | " 9 50 97 non-null int64 \n",
534 | " 10 0 97 non-null int64 \n",
535 | "dtypes: float64(1), int64(7), object(3)\n",
536 | "memory usage: 8.5+ KB\n",
537 | "None\n"
538 | ]
539 | }
540 | ],
541 | "source": [
542 | "# checking for missing values and data types of each column\n",
543 | "print(df.info())"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": 55,
549 | "metadata": {},
550 | "outputs": [
551 | {
552 | "data": {
553 | "text/html": [
554 | "\n",
555 | "\n",
568 | "
\n",
569 | " \n",
570 | " \n",
571 | " | \n",
572 | " Mat | \n",
573 | " Inns | \n",
574 | " NO | \n",
575 | " Runs | \n",
576 | " Ave | \n",
577 | " 100 | \n",
578 | " 50 | \n",
579 | " 0 | \n",
580 | "
\n",
581 | " \n",
582 | " \n",
583 | " \n",
584 | " count | \n",
585 | " 97.000000 | \n",
586 | " 97.000000 | \n",
587 | " 97.000000 | \n",
588 | " 97.000000 | \n",
589 | " 97.000000 | \n",
590 | " 97.000000 | \n",
591 | " 97.000000 | \n",
592 | " 97.000000 | \n",
593 | "
\n",
594 | " \n",
595 | " mean | \n",
596 | " 104.979381 | \n",
597 | " 178.752577 | \n",
598 | " 16.051546 | \n",
599 | " 7574.175258 | \n",
600 | " 46.781031 | \n",
601 | " 20.546392 | \n",
602 | " 35.474227 | \n",
603 | " 11.329897 | \n",
604 | "
\n",
605 | " \n",
606 | " std | \n",
607 | " 27.064729 | \n",
608 | " 44.963418 | \n",
609 | " 8.754012 | \n",
610 | " 2224.255278 | \n",
611 | " 8.168268 | \n",
612 | " 8.226001 | \n",
613 | " 11.499178 | \n",
614 | " 4.147594 | \n",
615 | "
\n",
616 | " \n",
617 | " min | \n",
618 | " 52.000000 | \n",
619 | " 80.000000 | \n",
620 | " 5.000000 | \n",
621 | " 5062.000000 | \n",
622 | " 30.300000 | \n",
623 | " 4.000000 | \n",
624 | " 13.000000 | \n",
625 | " 2.000000 | \n",
626 | "
\n",
627 | " \n",
628 | " 25% | \n",
629 | " 86.000000 | \n",
630 | " 146.000000 | \n",
631 | " 10.000000 | \n",
632 | " 5825.000000 | \n",
633 | " 42.290000 | \n",
634 | " 15.000000 | \n",
635 | " 27.000000 | \n",
636 | " 9.000000 | \n",
637 | "
\n",
638 | " \n",
639 | " 50% | \n",
640 | " 102.000000 | \n",
641 | " 176.000000 | \n",
642 | " 15.000000 | \n",
643 | " 7214.000000 | \n",
644 | " 45.840000 | \n",
645 | " 19.000000 | \n",
646 | " 33.000000 | \n",
647 | " 11.000000 | \n",
648 | "
\n",
649 | " \n",
650 | " 75% | \n",
651 | " 117.000000 | \n",
652 | " 200.000000 | \n",
653 | " 20.000000 | \n",
654 | " 8540.000000 | \n",
655 | " 50.660000 | \n",
656 | " 24.000000 | \n",
657 | " 42.000000 | \n",
658 | " 14.000000 | \n",
659 | "
\n",
660 | " \n",
661 | " max | \n",
662 | " 200.000000 | \n",
663 | " 329.000000 | \n",
664 | " 49.000000 | \n",
665 | " 15921.000000 | \n",
666 | " 99.940000 | \n",
667 | " 51.000000 | \n",
668 | " 68.000000 | \n",
669 | " 22.000000 | \n",
670 | "
\n",
671 | " \n",
672 | "
\n",
673 | "
"
674 | ],
675 | "text/plain": [
676 | " Mat Inns NO Runs Ave 100 \\\n",
677 | "count 97.000000 97.000000 97.000000 97.000000 97.000000 97.000000 \n",
678 | "mean 104.979381 178.752577 16.051546 7574.175258 46.781031 20.546392 \n",
679 | "std 27.064729 44.963418 8.754012 2224.255278 8.168268 8.226001 \n",
680 | "min 52.000000 80.000000 5.000000 5062.000000 30.300000 4.000000 \n",
681 | "25% 86.000000 146.000000 10.000000 5825.000000 42.290000 15.000000 \n",
682 | "50% 102.000000 176.000000 15.000000 7214.000000 45.840000 19.000000 \n",
683 | "75% 117.000000 200.000000 20.000000 8540.000000 50.660000 24.000000 \n",
684 | "max 200.000000 329.000000 49.000000 15921.000000 99.940000 51.000000 \n",
685 | "\n",
686 | " 50 0 \n",
687 | "count 97.000000 97.000000 \n",
688 | "mean 35.474227 11.329897 \n",
689 | "std 11.499178 4.147594 \n",
690 | "min 13.000000 2.000000 \n",
691 | "25% 27.000000 9.000000 \n",
692 | "50% 33.000000 11.000000 \n",
693 | "75% 42.000000 14.000000 \n",
694 | "max 68.000000 22.000000 "
695 | ]
696 | },
697 | "metadata": {},
698 | "output_type": "display_data"
699 | }
700 | ],
701 | "source": [
702 | "# checking data statistics\n",
703 | "display(df.describe())"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 56,
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "name": "stdout",
713 | "output_type": "stream",
714 | "text": [
715 | "Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', '100', '50',\n",
716 | " '0'],\n",
717 | " dtype='object')\n"
718 | ]
719 | }
720 | ],
721 | "source": [
722 | "# column names\n",
723 | "print(df.columns)"
724 | ]
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "metadata": {},
729 | "source": [
730 | "#### Choosing columns by name when reading a csv file"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": 57,
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "data": {
740 | "text/html": [
741 | "\n",
742 | "\n",
755 | "
\n",
756 | " \n",
757 | " \n",
758 | " | \n",
759 | " Player | \n",
760 | " Mat | \n",
761 | " Runs | \n",
762 | " 100 | \n",
763 | "
\n",
764 | " \n",
765 | " \n",
766 | " \n",
767 | " 0 | \n",
768 | " SR Tendulkar (INDIA) | \n",
769 | " 200 | \n",
770 | " 15921 | \n",
771 | " 51 | \n",
772 | "
\n",
773 | " \n",
774 | " 1 | \n",
775 | " RT Ponting (AUS) | \n",
776 | " 168 | \n",
777 | " 13378 | \n",
778 | " 41 | \n",
779 | "
\n",
780 | " \n",
781 | " 2 | \n",
782 | " JH Kallis (ICC/SA) | \n",
783 | " 166 | \n",
784 | " 13289 | \n",
785 | " 45 | \n",
786 | "
\n",
787 | " \n",
788 | " 3 | \n",
789 | " R Dravid (ICC/INDIA) | \n",
790 | " 164 | \n",
791 | " 13288 | \n",
792 | " 36 | \n",
793 | "
\n",
794 | " \n",
795 | " 4 | \n",
796 | " AN Cook (ENG) | \n",
797 | " 161 | \n",
798 | " 12472 | \n",
799 | " 33 | \n",
800 | "
\n",
801 | " \n",
802 | " 5 | \n",
803 | " KC Sangakkara (SL) | \n",
804 | " 134 | \n",
805 | " 12400 | \n",
806 | " 38 | \n",
807 | "
\n",
808 | " \n",
809 | " 6 | \n",
810 | " BC Lara (ICC/WI) | \n",
811 | " 131 | \n",
812 | " 11953 | \n",
813 | " 34 | \n",
814 | "
\n",
815 | " \n",
816 | " 7 | \n",
817 | " S Chanderpaul (WI) | \n",
818 | " 164 | \n",
819 | " 11867 | \n",
820 | " 30 | \n",
821 | "
\n",
822 | " \n",
823 | " 8 | \n",
824 | " DPMD Jayawardene (SL) | \n",
825 | " 149 | \n",
826 | " 11814 | \n",
827 | " 34 | \n",
828 | "
\n",
829 | " \n",
830 | " 9 | \n",
831 | " AR Border (AUS) | \n",
832 | " 156 | \n",
833 | " 11174 | \n",
834 | " 27 | \n",
835 | "
\n",
836 | " \n",
837 | "
\n",
838 | "
"
839 | ],
840 | "text/plain": [
841 | " Player Mat Runs 100\n",
842 | "0 SR Tendulkar (INDIA) 200 15921 51\n",
843 | "1 RT Ponting (AUS) 168 13378 41\n",
844 | "2 JH Kallis (ICC/SA) 166 13289 45\n",
845 | "3 R Dravid (ICC/INDIA) 164 13288 36\n",
846 | "4 AN Cook (ENG) 161 12472 33\n",
847 | "5 KC Sangakkara (SL) 134 12400 38\n",
848 | "6 BC Lara (ICC/WI) 131 11953 34\n",
849 | "7 S Chanderpaul (WI) 164 11867 30\n",
850 | "8 DPMD Jayawardene (SL) 149 11814 34\n",
851 | "9 AR Border (AUS) 156 11174 27"
852 | ]
853 | },
854 | "metadata": {},
855 | "output_type": "display_data"
856 | }
857 | ],
858 | "source": [
859 | "col_names = ['Player', 'Mat', 'Runs', '100']\n",
860 | "df_usecols = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape', usecols=col_names)\n",
861 | "\n",
862 | "display(df_usecols.head(10))"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 58,
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "name": "stdout",
872 | "output_type": "stream",
873 | "text": [
874 | "(97, 11)\n",
875 | "(97, 4)\n"
876 | ]
877 | }
878 | ],
879 | "source": [
880 | "print(df.shape)\n",
881 | "\n",
882 | "# selecting columns after importing the data\n",
883 | "df = df[col_names]\n",
884 | "\n",
885 | "print(df.shape)"
886 | ]
887 | },
888 | {
889 | "cell_type": "markdown",
890 | "metadata": {},
891 | "source": [
892 | "#### Choosing columns by position when reading a csv file"
893 | ]
894 | },
895 | {
896 | "cell_type": "code",
897 | "execution_count": 59,
898 | "metadata": {},
899 | "outputs": [
900 | {
901 | "data": {
902 | "text/html": [
903 | "\n",
904 | "\n",
917 | "
\n",
918 | " \n",
919 | " \n",
920 | " | \n",
921 | " Player | \n",
922 | " Mat | \n",
923 | " Runs | \n",
924 | " 100 | \n",
925 | "
\n",
926 | " \n",
927 | " \n",
928 | " \n",
929 | " 0 | \n",
930 | " SR Tendulkar (INDIA) | \n",
931 | " 200 | \n",
932 | " 15921 | \n",
933 | " 51 | \n",
934 | "
\n",
935 | " \n",
936 | " 1 | \n",
937 | " RT Ponting (AUS) | \n",
938 | " 168 | \n",
939 | " 13378 | \n",
940 | " 41 | \n",
941 | "
\n",
942 | " \n",
943 | " 2 | \n",
944 | " JH Kallis (ICC/SA) | \n",
945 | " 166 | \n",
946 | " 13289 | \n",
947 | " 45 | \n",
948 | "
\n",
949 | " \n",
950 | " 3 | \n",
951 | " R Dravid (ICC/INDIA) | \n",
952 | " 164 | \n",
953 | " 13288 | \n",
954 | " 36 | \n",
955 | "
\n",
956 | " \n",
957 | " 4 | \n",
958 | " AN Cook (ENG) | \n",
959 | " 161 | \n",
960 | " 12472 | \n",
961 | " 33 | \n",
962 | "
\n",
963 | " \n",
964 | " 5 | \n",
965 | " KC Sangakkara (SL) | \n",
966 | " 134 | \n",
967 | " 12400 | \n",
968 | " 38 | \n",
969 | "
\n",
970 | " \n",
971 | " 6 | \n",
972 | " BC Lara (ICC/WI) | \n",
973 | " 131 | \n",
974 | " 11953 | \n",
975 | " 34 | \n",
976 | "
\n",
977 | " \n",
978 | " 7 | \n",
979 | " S Chanderpaul (WI) | \n",
980 | " 164 | \n",
981 | " 11867 | \n",
982 | " 30 | \n",
983 | "
\n",
984 | " \n",
985 | " 8 | \n",
986 | " DPMD Jayawardene (SL) | \n",
987 | " 149 | \n",
988 | " 11814 | \n",
989 | " 34 | \n",
990 | "
\n",
991 | " \n",
992 | " 9 | \n",
993 | " AR Border (AUS) | \n",
994 | " 156 | \n",
995 | " 11174 | \n",
996 | " 27 | \n",
997 | "
\n",
998 | " \n",
999 | "
\n",
1000 | "
"
1001 | ],
1002 | "text/plain": [
1003 | " Player Mat Runs 100\n",
1004 | "0 SR Tendulkar (INDIA) 200 15921 51\n",
1005 | "1 RT Ponting (AUS) 168 13378 41\n",
1006 | "2 JH Kallis (ICC/SA) 166 13289 45\n",
1007 | "3 R Dravid (ICC/INDIA) 164 13288 36\n",
1008 | "4 AN Cook (ENG) 161 12472 33\n",
1009 | "5 KC Sangakkara (SL) 134 12400 38\n",
1010 | "6 BC Lara (ICC/WI) 131 11953 34\n",
1011 | "7 S Chanderpaul (WI) 164 11867 30\n",
1012 | "8 DPMD Jayawardene (SL) 149 11814 34\n",
1013 | "9 AR Border (AUS) 156 11174 27"
1014 | ]
1015 | },
1016 | "metadata": {},
1017 | "output_type": "display_data"
1018 | },
1019 | {
1020 | "name": "stdout",
1021 | "output_type": "stream",
1022 | "text": [
1023 | "(97, 4)\n"
1024 | ]
1025 | }
1026 | ],
1027 | "source": [
1028 | "col_nums = [0, 2, 5, 8]\n",
1029 | "df_usecols_index = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape', usecols=col_nums)\n",
1030 | "\n",
1031 | "display(df_usecols_index.head(10))\n",
1032 | "print(df_usecols_index.shape)"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "markdown",
1037 | "metadata": {},
1038 | "source": [
1039 | "#### Reading only the first n rows"
1040 | ]
1041 | },
1042 | {
1043 | "cell_type": "code",
1044 | "execution_count": 60,
1045 | "metadata": {},
1046 | "outputs": [
1047 | {
1048 | "data": {
1049 | "text/html": [
1050 | "\n",
1051 | "\n",
1064 | "
\n",
1065 | " \n",
1066 | " \n",
1067 | " | \n",
1068 | " Player | \n",
1069 | " Span | \n",
1070 | " Mat | \n",
1071 | " Inns | \n",
1072 | " NO | \n",
1073 | " Runs | \n",
1074 | " HS | \n",
1075 | " Ave | \n",
1076 | " 100 | \n",
1077 | " 50 | \n",
1078 | " 0 | \n",
1079 | "
\n",
1080 | " \n",
1081 | " \n",
1082 | " \n",
1083 | " 0 | \n",
1084 | " SR Tendulkar (INDIA) | \n",
1085 | " 1989-2013 | \n",
1086 | " 200 | \n",
1087 | " 329 | \n",
1088 | " 33 | \n",
1089 | " 15921 | \n",
1090 | " 248* | \n",
1091 | " 53.78 | \n",
1092 | " 51 | \n",
1093 | " 68 | \n",
1094 | " 14 | \n",
1095 | "
\n",
1096 | " \n",
1097 | " 1 | \n",
1098 | " RT Ponting (AUS) | \n",
1099 | " 1995-2012 | \n",
1100 | " 168 | \n",
1101 | " 287 | \n",
1102 | " 29 | \n",
1103 | " 13378 | \n",
1104 | " 257 | \n",
1105 | " 51.85 | \n",
1106 | " 41 | \n",
1107 | " 62 | \n",
1108 | " 17 | \n",
1109 | "
\n",
1110 | " \n",
1111 | " 2 | \n",
1112 | " JH Kallis (ICC/SA) | \n",
1113 | " 1995-2013 | \n",
1114 | " 166 | \n",
1115 | " 280 | \n",
1116 | " 40 | \n",
1117 | " 13289 | \n",
1118 | " 224 | \n",
1119 | " 55.37 | \n",
1120 | " 45 | \n",
1121 | " 58 | \n",
1122 | " 16 | \n",
1123 | "
\n",
1124 | " \n",
1125 | " 3 | \n",
1126 | " R Dravid (ICC/INDIA) | \n",
1127 | " 1996-2012 | \n",
1128 | " 164 | \n",
1129 | " 286 | \n",
1130 | " 32 | \n",
1131 | " 13288 | \n",
1132 | " 270 | \n",
1133 | " 52.31 | \n",
1134 | " 36 | \n",
1135 | " 63 | \n",
1136 | " 8 | \n",
1137 | "
\n",
1138 | " \n",
1139 | " 4 | \n",
1140 | " AN Cook (ENG) | \n",
1141 | " 2006-2018 | \n",
1142 | " 161 | \n",
1143 | " 291 | \n",
1144 | " 16 | \n",
1145 | " 12472 | \n",
1146 | " 294 | \n",
1147 | " 45.35 | \n",
1148 | " 33 | \n",
1149 | " 57 | \n",
1150 | " 9 | \n",
1151 | "
\n",
1152 | " \n",
1153 | " 5 | \n",
1154 | " KC Sangakkara (SL) | \n",
1155 | " 2000-2015 | \n",
1156 | " 134 | \n",
1157 | " 233 | \n",
1158 | " 17 | \n",
1159 | " 12400 | \n",
1160 | " 319 | \n",
1161 | " 57.40 | \n",
1162 | " 38 | \n",
1163 | " 52 | \n",
1164 | " 11 | \n",
1165 | "
\n",
1166 | " \n",
1167 | " 6 | \n",
1168 | " BC Lara (ICC/WI) | \n",
1169 | " 1990-2006 | \n",
1170 | " 131 | \n",
1171 | " 232 | \n",
1172 | " 6 | \n",
1173 | " 11953 | \n",
1174 | " 400* | \n",
1175 | " 52.88 | \n",
1176 | " 34 | \n",
1177 | " 48 | \n",
1178 | " 17 | \n",
1179 | "
\n",
1180 | " \n",
1181 | " 7 | \n",
1182 | " S Chanderpaul (WI) | \n",
1183 | " 1994-2015 | \n",
1184 | " 164 | \n",
1185 | " 280 | \n",
1186 | " 49 | \n",
1187 | " 11867 | \n",
1188 | " 203* | \n",
1189 | " 51.37 | \n",
1190 | " 30 | \n",
1191 | " 66 | \n",
1192 | " 15 | \n",
1193 | "
\n",
1194 | " \n",
1195 | " 8 | \n",
1196 | " DPMD Jayawardene (SL) | \n",
1197 | " 1997-2014 | \n",
1198 | " 149 | \n",
1199 | " 252 | \n",
1200 | " 15 | \n",
1201 | " 11814 | \n",
1202 | " 374 | \n",
1203 | " 49.84 | \n",
1204 | " 34 | \n",
1205 | " 50 | \n",
1206 | " 15 | \n",
1207 | "
\n",
1208 | " \n",
1209 | " 9 | \n",
1210 | " AR Border (AUS) | \n",
1211 | " 1978-1994 | \n",
1212 | " 156 | \n",
1213 | " 265 | \n",
1214 | " 44 | \n",
1215 | " 11174 | \n",
1216 | " 205 | \n",
1217 | " 50.56 | \n",
1218 | " 27 | \n",
1219 | " 63 | \n",
1220 | " 11 | \n",
1221 | "
\n",
1222 | " \n",
1223 | " 10 | \n",
1224 | " SR Waugh (AUS) | \n",
1225 | " 1985-2004 | \n",
1226 | " 168 | \n",
1227 | " 260 | \n",
1228 | " 46 | \n",
1229 | " 10927 | \n",
1230 | " 200 | \n",
1231 | " 51.06 | \n",
1232 | " 32 | \n",
1233 | " 50 | \n",
1234 | " 22 | \n",
1235 | "
\n",
1236 | " \n",
1237 | " 11 | \n",
1238 | " SM Gavaskar (INDIA) | \n",
1239 | " 1971-1987 | \n",
1240 | " 125 | \n",
1241 | " 214 | \n",
1242 | " 16 | \n",
1243 | " 10122 | \n",
1244 | " 236* | \n",
1245 | " 51.12 | \n",
1246 | " 34 | \n",
1247 | " 45 | \n",
1248 | " 12 | \n",
1249 | "
\n",
1250 | " \n",
1251 | " 12 | \n",
1252 | " Younis Khan (PAK) | \n",
1253 | " 2000-2017 | \n",
1254 | " 118 | \n",
1255 | " 213 | \n",
1256 | " 19 | \n",
1257 | " 10099 | \n",
1258 | " 313 | \n",
1259 | " 52.05 | \n",
1260 | " 34 | \n",
1261 | " 33 | \n",
1262 | " 19 | \n",
1263 | "
\n",
1264 | " \n",
1265 | " 13 | \n",
1266 | " HM Amla (SA) | \n",
1267 | " 2004-2019 | \n",
1268 | " 124 | \n",
1269 | " 215 | \n",
1270 | " 16 | \n",
1271 | " 9282 | \n",
1272 | " 311* | \n",
1273 | " 46.64 | \n",
1274 | " 28 | \n",
1275 | " 41 | \n",
1276 | " 13 | \n",
1277 | "
\n",
1278 | " \n",
1279 | " 14 | \n",
1280 | " GC Smith (ICC/SA) | \n",
1281 | " 2002-2014 | \n",
1282 | " 117 | \n",
1283 | " 205 | \n",
1284 | " 13 | \n",
1285 | " 9265 | \n",
1286 | " 277 | \n",
1287 | " 48.25 | \n",
1288 | " 27 | \n",
1289 | " 38 | \n",
1290 | " 11 | \n",
1291 | "
\n",
1292 | " \n",
1293 | "
\n",
1294 | "
"
1295 | ],
1296 | "text/plain": [
1297 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n",
1298 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n",
1299 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n",
1300 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n",
1301 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n",
1302 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n",
1303 | "5 KC Sangakkara (SL) 2000-2015 134 233 17 12400 319 57.40 38 \n",
1304 | "6 BC Lara (ICC/WI) 1990-2006 131 232 6 11953 400* 52.88 34 \n",
1305 | "7 S Chanderpaul (WI) 1994-2015 164 280 49 11867 203* 51.37 30 \n",
1306 | "8 DPMD Jayawardene (SL) 1997-2014 149 252 15 11814 374 49.84 34 \n",
1307 | "9 AR Border (AUS) 1978-1994 156 265 44 11174 205 50.56 27 \n",
1308 | "10 SR Waugh (AUS) 1985-2004 168 260 46 10927 200 51.06 32 \n",
1309 | "11 SM Gavaskar (INDIA) 1971-1987 125 214 16 10122 236* 51.12 34 \n",
1310 | "12 Younis Khan (PAK) 2000-2017 118 213 19 10099 313 52.05 34 \n",
1311 | "13 HM Amla (SA) 2004-2019 124 215 16 9282 311* 46.64 28 \n",
1312 | "14 GC Smith (ICC/SA) 2002-2014 117 205 13 9265 277 48.25 27 \n",
1313 | "\n",
1314 | " 50 0 \n",
1315 | "0 68 14 \n",
1316 | "1 62 17 \n",
1317 | "2 58 16 \n",
1318 | "3 63 8 \n",
1319 | "4 57 9 \n",
1320 | "5 52 11 \n",
1321 | "6 48 17 \n",
1322 | "7 66 15 \n",
1323 | "8 50 15 \n",
1324 | "9 63 11 \n",
1325 | "10 50 22 \n",
1326 | "11 45 12 \n",
1327 | "12 33 19 \n",
1328 | "13 41 13 \n",
1329 | "14 38 11 "
1330 | ]
1331 | },
1332 | "metadata": {},
1333 | "output_type": "display_data"
1334 | },
1335 | {
1336 | "name": "stdout",
1337 | "output_type": "stream",
1338 | "text": [
1339 | "(50, 11)\n"
1340 | ]
1341 | }
1342 | ],
1343 | "source": [
1344 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape', nrows=50)\n",
1345 | "\n",
1346 | "display(df.head(15))\n",
1347 | "print(df.shape)"
1348 | ]
1349 | },
1350 | {
1351 | "cell_type": "code",
1352 | "execution_count": 61,
1353 | "metadata": {},
1354 | "outputs": [
1355 | {
1356 | "data": {
1357 | "text/html": [
1358 | "\n",
1359 | "\n",
1372 | "
\n",
1373 | " \n",
1374 | " \n",
1375 | " | \n",
1376 | " Player | \n",
1377 | " Span | \n",
1378 | " Mat | \n",
1379 | " Inns | \n",
1380 | " NO | \n",
1381 | " Runs | \n",
1382 | " HS | \n",
1383 | " Ave | \n",
1384 | " 100 | \n",
1385 | " 50 | \n",
1386 | " 0 | \n",
1387 | "
\n",
1388 | " \n",
1389 | " \n",
1390 | " \n",
1391 | " 38 | \n",
1392 | " SPD Smith (AUS) | \n",
1393 | " 2010-2021 | \n",
1394 | " 77 | \n",
1395 | " 139 | \n",
1396 | " 17 | \n",
1397 | " 7540 | \n",
1398 | " 239 | \n",
1399 | " 61.80 | \n",
1400 | " 27 | \n",
1401 | " 31 | \n",
1402 | " 5 | \n",
1403 | "
\n",
1404 | " \n",
1405 | " 34 | \n",
1406 | " MC Cowdrey (ENG) | \n",
1407 | " 1954-1975 | \n",
1408 | " 114 | \n",
1409 | " 188 | \n",
1410 | " 15 | \n",
1411 | " 7624 | \n",
1412 | " 182 | \n",
1413 | " 44.06 | \n",
1414 | " 22 | \n",
1415 | " 38 | \n",
1416 | " 9 | \n",
1417 | "
\n",
1418 | " \n",
1419 | "
\n",
1420 | "
"
1421 | ],
1422 | "text/plain": [
1423 | " Player Span Mat Inns NO Runs HS Ave 100 50 0\n",
1424 | "38 SPD Smith (AUS) 2010-2021 77 139 17 7540 239 61.80 27 31 5\n",
1425 | "34 MC Cowdrey (ENG) 1954-1975 114 188 15 7624 182 44.06 22 38 9"
1426 | ]
1427 | },
1428 | "execution_count": 61,
1429 | "metadata": {},
1430 | "output_type": "execute_result"
1431 | }
1432 | ],
1433 | "source": [
1434 | "# showing 2 randomly sampled rows\n",
1435 | "df.sample(2)"
1436 | ]
1437 | },
1438 | {
1439 | "cell_type": "code",
1440 | "execution_count": null,
1441 | "metadata": {},
1442 | "outputs": [],
1443 | "source": []
1444 | },
1445 | {
1446 | "cell_type": "code",
1447 | "execution_count": null,
1448 | "metadata": {},
1449 | "outputs": [],
1450 | "source": []
1451 | }
1452 | ],
1453 | "metadata": {
1454 | "kernelspec": {
1455 | "display_name": "Python 3",
1456 | "language": "python",
1457 | "name": "python3"
1458 | },
1459 | "language_info": {
1460 | "codemirror_mode": {
1461 | "name": "ipython",
1462 | "version": 3
1463 | },
1464 | "file_extension": ".py",
1465 | "mimetype": "text/x-python",
1466 | "name": "python",
1467 | "nbconvert_exporter": "python",
1468 | "pygments_lexer": "ipython3",
1469 | "version": "3.8.5"
1470 | }
1471 | },
1472 | "nbformat": 4,
1473 | "nbformat_minor": 4
1474 | }
1475 |
--------------------------------------------------------------------------------
/Lecture_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Analysis with Python\n",
8 | "## Lecture 02: Data Preprocessing with Pandas\n",
9 | "
Instructor: Md Shahidullah Kawsar\n",
10 | "
Data Scientist, IDARE, Houston, TX, USA\n",
11 | "\n",
12 | "**Objectives:**\n",
13 | "- reading a .txt (text) or an Excel (.xlsx) file\n",
14 | "- dealing with the UnicodeDecodeError\n",
15 | "- renaming column names\n",
16 | "- creating a new DataFrame\n",
17 | "- concatenation of two dataframes\n",
18 | "- column splitting\n",
19 | "- creating a new column in a dataframe\n",
20 | "- replacing/removing a value from a pandas column\n",
21 | "- removing a column from the dataframe\n",
22 | "\n",
23 | "**References:**\n",
24 | "
[1] Data Source: https://stats.espncricinfo.com/ci/content/records/223646.html\n",
25 | "
[2] https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html\n",
26 | "
[3] different data sources: https://archive.ics.uci.edu/ml/index.php\n",
27 | "
[4] https://www.kaggle.com/learn\n",
28 | "
[5] https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "#### Import required libraries"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 270,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import numpy as np\n",
45 | "import pandas as pd"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "#### How to read a text file?\n",
53 | "#### How to deal with UnicodeDecodeError?"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 271,
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
64 | "\n",
65 | "\n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " | \n",
82 | " Player | \n",
83 | " Span | \n",
84 | " Mat | \n",
85 | " Inns | \n",
86 | " NO | \n",
87 | " Runs | \n",
88 | " HS | \n",
89 | " Ave | \n",
90 | " 100 | \n",
91 | " 50 | \n",
92 | " 0 | \n",
93 | "
\n",
94 | " \n",
95 | " \n",
96 | " \n",
97 | " 0 | \n",
98 | " SR Tendulkar (INDIA) | \n",
99 | " 1989-2013 | \n",
100 | " 200 | \n",
101 | " 329 | \n",
102 | " 33 | \n",
103 | " 15921 | \n",
104 | " 248* | \n",
105 | " 53.78 | \n",
106 | " 51 | \n",
107 | " 68 | \n",
108 | " 14 | \n",
109 | "
\n",
110 | " \n",
111 | " 1 | \n",
112 | " RT Ponting (AUS) | \n",
113 | " 1995-2012 | \n",
114 | " 168 | \n",
115 | " 287 | \n",
116 | " 29 | \n",
117 | " 13378 | \n",
118 | " 257 | \n",
119 | " 51.85 | \n",
120 | " 41 | \n",
121 | " 62 | \n",
122 | " 17 | \n",
123 | "
\n",
124 | " \n",
125 | " 2 | \n",
126 | " JH Kallis (ICC/SA) | \n",
127 | " 1995-2013 | \n",
128 | " 166 | \n",
129 | " 280 | \n",
130 | " 40 | \n",
131 | " 13289 | \n",
132 | " 224 | \n",
133 | " 55.37 | \n",
134 | " 45 | \n",
135 | " 58 | \n",
136 | " 16 | \n",
137 | "
\n",
138 | " \n",
139 | " 3 | \n",
140 | " R Dravid (ICC/INDIA) | \n",
141 | " 1996-2012 | \n",
142 | " 164 | \n",
143 | " 286 | \n",
144 | " 32 | \n",
145 | " 13288 | \n",
146 | " 270 | \n",
147 | " 52.31 | \n",
148 | " 36 | \n",
149 | " 63 | \n",
150 | " 8 | \n",
151 | "
\n",
152 | " \n",
153 | " 4 | \n",
154 | " AN Cook (ENG) | \n",
155 | " 2006-2018 | \n",
156 | " 161 | \n",
157 | " 291 | \n",
158 | " 16 | \n",
159 | " 12472 | \n",
160 | " 294 | \n",
161 | " 45.35 | \n",
162 | " 33 | \n",
163 | " 57 | \n",
164 | " 9 | \n",
165 | "
\n",
166 | " \n",
167 | "
\n",
168 | "
"
169 | ],
170 | "text/plain": [
171 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n",
172 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n",
173 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n",
174 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n",
175 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n",
176 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n",
177 | "\n",
178 | " 50 0 \n",
179 | "0 68 14 \n",
180 | "1 62 17 \n",
181 | "2 58 16 \n",
182 | "3 63 8 \n",
183 | "4 57 9 "
184 | ]
185 | },
186 | "metadata": {},
187 | "output_type": "display_data"
188 | }
189 | ],
190 | "source": [
191 | "df = pd.read_csv(\"most_runs_in_test_cricket.txt\", encoding='unicode_escape', delimiter='\\t')\n",
192 | "\n",
193 | "display(df.head())"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "#### Reading an Excel file"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 272,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "# pip install openpyxl"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 273,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "data": {
219 | "text/html": [
220 | "\n",
221 | "\n",
234 | "
\n",
235 | " \n",
236 | " \n",
237 | " | \n",
238 | " Player | \n",
239 | " Span | \n",
240 | " Mat | \n",
241 | " Inns | \n",
242 | " NO | \n",
243 | " Runs | \n",
244 | " HS | \n",
245 | " Ave | \n",
246 | " 100 | \n",
247 | " 50 | \n",
248 | " 0 | \n",
249 | "
\n",
250 | " \n",
251 | " \n",
252 | " \n",
253 | " 0 | \n",
254 | " SR Tendulkar (INDIA) | \n",
255 | " 1989-2013 | \n",
256 | " 200 | \n",
257 | " 329 | \n",
258 | " 33 | \n",
259 | " 15921 | \n",
260 | " 248* | \n",
261 | " 53.78 | \n",
262 | " 51 | \n",
263 | " 68 | \n",
264 | " 14 | \n",
265 | "
\n",
266 | " \n",
267 | " 1 | \n",
268 | " RT Ponting (AUS) | \n",
269 | " 1995-2012 | \n",
270 | " 168 | \n",
271 | " 287 | \n",
272 | " 29 | \n",
273 | " 13378 | \n",
274 | " 257 | \n",
275 | " 51.85 | \n",
276 | " 41 | \n",
277 | " 62 | \n",
278 | " 17 | \n",
279 | "
\n",
280 | " \n",
281 | " 2 | \n",
282 | " JH Kallis (ICC/SA) | \n",
283 | " 1995-2013 | \n",
284 | " 166 | \n",
285 | " 280 | \n",
286 | " 40 | \n",
287 | " 13289 | \n",
288 | " 224 | \n",
289 | " 55.37 | \n",
290 | " 45 | \n",
291 | " 58 | \n",
292 | " 16 | \n",
293 | "
\n",
294 | " \n",
295 | " 3 | \n",
296 | " R Dravid (ICC/INDIA) | \n",
297 | " 1996-2012 | \n",
298 | " 164 | \n",
299 | " 286 | \n",
300 | " 32 | \n",
301 | " 13288 | \n",
302 | " 270 | \n",
303 | " 52.31 | \n",
304 | " 36 | \n",
305 | " 63 | \n",
306 | " 8 | \n",
307 | "
\n",
308 | " \n",
309 | " 4 | \n",
310 | " AN Cook (ENG) | \n",
311 | " 2006-2018 | \n",
312 | " 161 | \n",
313 | " 291 | \n",
314 | " 16 | \n",
315 | " 12472 | \n",
316 | " 294 | \n",
317 | " 45.35 | \n",
318 | " 33 | \n",
319 | " 57 | \n",
320 | " 9 | \n",
321 | "
\n",
322 | " \n",
323 | "
\n",
324 | "
"
325 | ],
326 | "text/plain": [
327 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n",
328 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n",
329 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n",
330 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n",
331 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n",
332 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n",
333 | "\n",
334 | " 50 0 \n",
335 | "0 68 14 \n",
336 | "1 62 17 \n",
337 | "2 58 16 \n",
338 | "3 63 8 \n",
339 | "4 57 9 "
340 | ]
341 | },
342 | "metadata": {},
343 | "output_type": "display_data"
344 | }
345 | ],
346 | "source": [
347 | "df = pd.read_excel(\"test_cricket.xlsx\", sheet_name='runs')\n",
348 | "\n",
349 | "display(df.head())"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 274,
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "# df['Player'] = df['Player'].str.replace(\"(\", \"\")\n",
359 | "# df['Player'] = df['Player'].str.replace(\")\", \"\")\n",
360 | "\n",
361 | "# display(df.head())"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": 275,
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "# df_Player = df['Player'].str.split(\" \", expand=True)\n",
371 | "\n",
372 | "# display(df_Player.head())\n",
373 | "# print(df_Player.info())"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {},
379 | "source": [
380 | "#### How to rename the column names?"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 276,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | "Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 100, 50, 0], dtype='object')\n"
393 | ]
394 | }
395 | ],
396 | "source": [
397 | "print(df.columns)"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 277,
403 | "metadata": {},
404 | "outputs": [
405 | {
406 | "data": {
407 | "text/html": [
408 | "\n",
409 | "\n",
422 | "
\n",
423 | " \n",
424 | " \n",
425 | " | \n",
426 | " Player | \n",
427 | " Span | \n",
428 | " Match | \n",
429 | " Innings | \n",
430 | " NotOut | \n",
431 | " Runs | \n",
432 | " Highest_score | \n",
433 | " Average | \n",
434 | " Centuries | \n",
435 | " Half_centuries | \n",
436 | " Ducks | \n",
437 | "
\n",
438 | " \n",
439 | " \n",
440 | " \n",
441 | " 0 | \n",
442 | " SR Tendulkar (INDIA) | \n",
443 | " 1989-2013 | \n",
444 | " 200 | \n",
445 | " 329 | \n",
446 | " 33 | \n",
447 | " 15921 | \n",
448 | " 248* | \n",
449 | " 53.78 | \n",
450 | " 51 | \n",
451 | " 68 | \n",
452 | " 14 | \n",
453 | "
\n",
454 | " \n",
455 | " 1 | \n",
456 | " RT Ponting (AUS) | \n",
457 | " 1995-2012 | \n",
458 | " 168 | \n",
459 | " 287 | \n",
460 | " 29 | \n",
461 | " 13378 | \n",
462 | " 257 | \n",
463 | " 51.85 | \n",
464 | " 41 | \n",
465 | " 62 | \n",
466 | " 17 | \n",
467 | "
\n",
468 | " \n",
469 | " 2 | \n",
470 | " JH Kallis (ICC/SA) | \n",
471 | " 1995-2013 | \n",
472 | " 166 | \n",
473 | " 280 | \n",
474 | " 40 | \n",
475 | " 13289 | \n",
476 | " 224 | \n",
477 | " 55.37 | \n",
478 | " 45 | \n",
479 | " 58 | \n",
480 | " 16 | \n",
481 | "
\n",
482 | " \n",
483 | " 3 | \n",
484 | " R Dravid (ICC/INDIA) | \n",
485 | " 1996-2012 | \n",
486 | " 164 | \n",
487 | " 286 | \n",
488 | " 32 | \n",
489 | " 13288 | \n",
490 | " 270 | \n",
491 | " 52.31 | \n",
492 | " 36 | \n",
493 | " 63 | \n",
494 | " 8 | \n",
495 | "
\n",
496 | " \n",
497 | " 4 | \n",
498 | " AN Cook (ENG) | \n",
499 | " 2006-2018 | \n",
500 | " 161 | \n",
501 | " 291 | \n",
502 | " 16 | \n",
503 | " 12472 | \n",
504 | " 294 | \n",
505 | " 45.35 | \n",
506 | " 33 | \n",
507 | " 57 | \n",
508 | " 9 | \n",
509 | "
\n",
510 | " \n",
511 | "
\n",
512 | "
"
513 | ],
514 | "text/plain": [
515 | " Player Span Match Innings NotOut Runs \\\n",
516 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 \n",
517 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 \n",
518 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 \n",
519 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 \n",
520 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 \n",
521 | "\n",
522 | " Highest_score Average Centuries Half_centuries Ducks \n",
523 | "0 248* 53.78 51 68 14 \n",
524 | "1 257 51.85 41 62 17 \n",
525 | "2 224 55.37 45 58 16 \n",
526 | "3 270 52.31 36 63 8 \n",
527 | "4 294 45.35 33 57 9 "
528 | ]
529 | },
530 | "metadata": {},
531 | "output_type": "display_data"
532 | }
533 | ],
534 | "source": [
535 | "df = df.rename(columns={'Mat':'Match', \n",
536 | " 'Inns':'Innings',\n",
537 | " 'NO': 'NotOut',\n",
538 | " 'HS': 'Highest_score',\n",
539 | " 'Ave': 'Average',\n",
540 | " 100: 'Centuries',\n",
541 | " 50: 'Half_centuries',\n",
542 | " 0: 'Ducks'})\n",
543 | "\n",
544 | "display(df.head())"
545 | ]
546 | },
547 | {
548 | "cell_type": "markdown",
549 | "metadata": {},
550 | "source": [
551 | "#### How to create a DataFrame?"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 278,
557 | "metadata": {},
558 | "outputs": [
559 | {
560 | "data": {
561 | "text/html": [
562 | "\n",
563 | "\n",
576 | "
\n",
577 | " \n",
578 | " \n",
579 | " | \n",
580 | " A | \n",
581 | " B | \n",
582 | "
\n",
583 | " \n",
584 | " \n",
585 | " \n",
586 | " 0 | \n",
587 | " 1 | \n",
588 | " 4 | \n",
589 | "
\n",
590 | " \n",
591 | " 1 | \n",
592 | " 2 | \n",
593 | " 5 | \n",
594 | "
\n",
595 | " \n",
596 | " 2 | \n",
597 | " 3 | \n",
598 | " 6 | \n",
599 | "
\n",
600 | " \n",
601 | "
\n",
602 | "
"
603 | ],
604 | "text/plain": [
605 | " A B\n",
606 | "0 1 4\n",
607 | "1 2 5\n",
608 | "2 3 6"
609 | ]
610 | },
611 | "metadata": {},
612 | "output_type": "display_data"
613 | }
614 | ],
615 | "source": [
616 | "df_A = pd.DataFrame({'A':[1,2,3],\n",
617 | " 'B':[4,5,6]})\n",
618 | "\n",
619 | "display(df_A)"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 279,
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "data": {
629 | "text/html": [
630 | "\n",
631 | "\n",
644 | "
\n",
645 | " \n",
646 | " \n",
647 | " | \n",
648 | " A | \n",
649 | " B | \n",
650 | "
\n",
651 | " \n",
652 | " \n",
653 | " \n",
654 | " 0 | \n",
655 | " 7 | \n",
656 | " -7 | \n",
657 | "
\n",
658 | " \n",
659 | " 1 | \n",
660 | " 8 | \n",
661 | " -8 | \n",
662 | "
\n",
663 | " \n",
664 | " 2 | \n",
665 | " 9 | \n",
666 | " -9 | \n",
667 | "
\n",
668 | " \n",
669 | "
\n",
670 | "
"
671 | ],
672 | "text/plain": [
673 | " A B\n",
674 | "0 7 -7\n",
675 | "1 8 -8\n",
676 | "2 9 -9"
677 | ]
678 | },
679 | "metadata": {},
680 | "output_type": "display_data"
681 | }
682 | ],
683 | "source": [
684 | "df_B = pd.DataFrame()\n",
685 | "# df_B['C'] = [7,8,9]\n",
686 | "# df_B['D'] = [-7,-8,-9]\n",
687 | "\n",
688 | "df_B['A'] = [7,8,9]\n",
689 | "df_B['B'] = [-7,-8,-9]\n",
690 | "\n",
691 | "display(df_B)"
692 | ]
693 | },
694 | {
695 | "cell_type": "markdown",
696 | "metadata": {},
697 | "source": [
698 | "#### How to concatenate two dataframes?"
699 | ]
700 | },
701 | {
702 | "cell_type": "code",
703 | "execution_count": 280,
704 | "metadata": {},
705 | "outputs": [
706 | {
707 | "data": {
708 | "text/html": [
709 | "\n",
710 | "\n",
723 | "
\n",
724 | " \n",
725 | " \n",
726 | " | \n",
727 | " A | \n",
728 | " B | \n",
729 | " A | \n",
730 | " B | \n",
731 | "
\n",
732 | " \n",
733 | " \n",
734 | " \n",
735 | " 0 | \n",
736 | " 1 | \n",
737 | " 4 | \n",
738 | " 7 | \n",
739 | " -7 | \n",
740 | "
\n",
741 | " \n",
742 | " 1 | \n",
743 | " 2 | \n",
744 | " 5 | \n",
745 | " 8 | \n",
746 | " -8 | \n",
747 | "
\n",
748 | " \n",
749 | " 2 | \n",
750 | " 3 | \n",
751 | " 6 | \n",
752 | " 9 | \n",
753 | " -9 | \n",
754 | "
\n",
755 | " \n",
756 | "
\n",
757 | "
"
758 | ],
759 | "text/plain": [
760 | " A B A B\n",
761 | "0 1 4 7 -7\n",
762 | "1 2 5 8 -8\n",
763 | "2 3 6 9 -9"
764 | ]
765 | },
766 | "metadata": {},
767 | "output_type": "display_data"
768 | }
769 | ],
770 | "source": [
771 | "# column-wise concatenation\n",
772 | "df_C = pd.concat([df_A, df_B], axis=1)\n",
773 | "\n",
774 | "display(df_C)"
775 | ]
776 | },
777 | {
778 | "cell_type": "code",
779 | "execution_count": 281,
780 | "metadata": {},
781 | "outputs": [
782 | {
783 | "data": {
784 | "text/html": [
785 | "\n",
786 | "\n",
799 | "
\n",
800 | " \n",
801 | " \n",
802 | " | \n",
803 | " A | \n",
804 | " B | \n",
805 | "
\n",
806 | " \n",
807 | " index | \n",
808 | " | \n",
809 | " | \n",
810 | "
\n",
811 | " \n",
812 | " \n",
813 | " \n",
814 | " 0 | \n",
815 | " 1 | \n",
816 | " 4 | \n",
817 | "
\n",
818 | " \n",
819 | " 1 | \n",
820 | " 2 | \n",
821 | " 5 | \n",
822 | "
\n",
823 | " \n",
824 | " 2 | \n",
825 | " 3 | \n",
826 | " 6 | \n",
827 | "
\n",
828 | " \n",
829 | " 3 | \n",
830 | " 7 | \n",
831 | " -7 | \n",
832 | "
\n",
833 | " \n",
834 | " 4 | \n",
835 | " 8 | \n",
836 | " -8 | \n",
837 | "
\n",
838 | " \n",
839 | " 5 | \n",
840 | " 9 | \n",
841 | " -9 | \n",
842 | "
\n",
843 | " \n",
844 | "
\n",
845 | "
"
846 | ],
847 | "text/plain": [
848 | " A B\n",
849 | "index \n",
850 | "0 1 4\n",
851 | "1 2 5\n",
852 | "2 3 6\n",
853 | "3 7 -7\n",
854 | "4 8 -8\n",
855 | "5 9 -9"
856 | ]
857 | },
858 | "metadata": {},
859 | "output_type": "display_data"
860 | }
861 | ],
862 | "source": [
863 | "# row-wise concatenation\n",
864 | "df_C = pd.concat([df_A, df_B], axis=0)\n",
865 | "df_C['index'] = np.arange(0,6,1)\n",
866 | "df_C = df_C.set_index('index')\n",
867 | "\n",
868 | "display(df_C)"
869 | ]
870 | },
871 | {
872 | "cell_type": "markdown",
873 | "metadata": {},
874 | "source": [
875 | "#### How to split a column and create two new columns?"
876 | ]
877 | },
878 | {
879 | "cell_type": "code",
880 | "execution_count": 282,
881 | "metadata": {},
882 | "outputs": [
883 | {
884 | "data": {
885 | "text/html": [
886 | "\n",
887 | "\n",
900 | "
\n",
901 | " \n",
902 | " \n",
903 | " | \n",
904 | " 0 | \n",
905 | " 1 | \n",
906 | "
\n",
907 | " \n",
908 | " \n",
909 | " \n",
910 | " 0 | \n",
911 | " SR Tendulkar | \n",
912 | " INDIA) | \n",
913 | "
\n",
914 | " \n",
915 | " 1 | \n",
916 | " RT Ponting | \n",
917 | " AUS) | \n",
918 | "
\n",
919 | " \n",
920 | " 2 | \n",
921 | " JH Kallis | \n",
922 | " ICC/SA) | \n",
923 | "
\n",
924 | " \n",
925 | " 3 | \n",
926 | " R Dravid | \n",
927 | " ICC/INDIA) | \n",
928 | "
\n",
929 | " \n",
930 | " 4 | \n",
931 | " AN Cook | \n",
932 | " ENG) | \n",
933 | "
\n",
934 | " \n",
935 | " 5 | \n",
936 | " KC Sangakkara | \n",
937 | " SL) | \n",
938 | "
\n",
939 | " \n",
940 | " 6 | \n",
941 | " BC Lara | \n",
942 | " ICC/WI) | \n",
943 | "
\n",
944 | " \n",
945 | " 7 | \n",
946 | " S Chanderpaul | \n",
947 | " WI) | \n",
948 | "
\n",
949 | " \n",
950 | " 8 | \n",
951 | " DPMD Jayawardene | \n",
952 | " SL) | \n",
953 | "
\n",
954 | " \n",
955 | " 9 | \n",
956 | " AR Border | \n",
957 | " AUS) | \n",
958 | "
\n",
959 | " \n",
960 | "
\n",
961 | "
"
962 | ],
963 | "text/plain": [
964 | " 0 1\n",
965 | "0 SR Tendulkar INDIA)\n",
966 | "1 RT Ponting AUS)\n",
967 | "2 JH Kallis ICC/SA)\n",
968 | "3 R Dravid ICC/INDIA)\n",
969 | "4 AN Cook ENG)\n",
970 | "5 KC Sangakkara SL)\n",
971 | "6 BC Lara ICC/WI)\n",
972 | "7 S Chanderpaul WI)\n",
973 | "8 DPMD Jayawardene SL)\n",
974 | "9 AR Border AUS)"
975 | ]
976 | },
977 | "metadata": {},
978 | "output_type": "display_data"
979 | }
980 | ],
981 | "source": [
982 | "df_player = df['Player'].str.split(\"(\", expand=True)\n",
983 | "\n",
984 | "display(df_player.head(10))"
985 | ]
986 | },
987 | {
988 | "cell_type": "code",
989 | "execution_count": 283,
990 | "metadata": {},
991 | "outputs": [
992 | {
993 | "data": {
994 | "text/html": [
995 | "\n",
996 | "\n",
1009 | "
\n",
1010 | " \n",
1011 | " \n",
1012 | " | \n",
1013 | " Player | \n",
1014 | " Span | \n",
1015 | " Match | \n",
1016 | " Innings | \n",
1017 | " NotOut | \n",
1018 | " Runs | \n",
1019 | " Highest_score | \n",
1020 | " Average | \n",
1021 | " Centuries | \n",
1022 | " Half_centuries | \n",
1023 | " Ducks | \n",
1024 | " 0 | \n",
1025 | " 1 | \n",
1026 | "
\n",
1027 | " \n",
1028 | " \n",
1029 | " \n",
1030 | " 0 | \n",
1031 | " SR Tendulkar (INDIA) | \n",
1032 | " 1989-2013 | \n",
1033 | " 200 | \n",
1034 | " 329 | \n",
1035 | " 33 | \n",
1036 | " 15921 | \n",
1037 | " 248* | \n",
1038 | " 53.78 | \n",
1039 | " 51 | \n",
1040 | " 68 | \n",
1041 | " 14 | \n",
1042 | " SR Tendulkar | \n",
1043 | " INDIA) | \n",
1044 | "
\n",
1045 | " \n",
1046 | " 1 | \n",
1047 | " RT Ponting (AUS) | \n",
1048 | " 1995-2012 | \n",
1049 | " 168 | \n",
1050 | " 287 | \n",
1051 | " 29 | \n",
1052 | " 13378 | \n",
1053 | " 257 | \n",
1054 | " 51.85 | \n",
1055 | " 41 | \n",
1056 | " 62 | \n",
1057 | " 17 | \n",
1058 | " RT Ponting | \n",
1059 | " AUS) | \n",
1060 | "
\n",
1061 | " \n",
1062 | " 2 | \n",
1063 | " JH Kallis (ICC/SA) | \n",
1064 | " 1995-2013 | \n",
1065 | " 166 | \n",
1066 | " 280 | \n",
1067 | " 40 | \n",
1068 | " 13289 | \n",
1069 | " 224 | \n",
1070 | " 55.37 | \n",
1071 | " 45 | \n",
1072 | " 58 | \n",
1073 | " 16 | \n",
1074 | " JH Kallis | \n",
1075 | " ICC/SA) | \n",
1076 | "
\n",
1077 | " \n",
1078 | " 3 | \n",
1079 | " R Dravid (ICC/INDIA) | \n",
1080 | " 1996-2012 | \n",
1081 | " 164 | \n",
1082 | " 286 | \n",
1083 | " 32 | \n",
1084 | " 13288 | \n",
1085 | " 270 | \n",
1086 | " 52.31 | \n",
1087 | " 36 | \n",
1088 | " 63 | \n",
1089 | " 8 | \n",
1090 | " R Dravid | \n",
1091 | " ICC/INDIA) | \n",
1092 | "
\n",
1093 | " \n",
1094 | " 4 | \n",
1095 | " AN Cook (ENG) | \n",
1096 | " 2006-2018 | \n",
1097 | " 161 | \n",
1098 | " 291 | \n",
1099 | " 16 | \n",
1100 | " 12472 | \n",
1101 | " 294 | \n",
1102 | " 45.35 | \n",
1103 | " 33 | \n",
1104 | " 57 | \n",
1105 | " 9 | \n",
1106 | " AN Cook | \n",
1107 | " ENG) | \n",
1108 | "
\n",
1109 | " \n",
1110 | "
\n",
1111 | "
"
1112 | ],
1113 | "text/plain": [
1114 | " Player Span Match Innings NotOut Runs \\\n",
1115 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 \n",
1116 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 \n",
1117 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 \n",
1118 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 \n",
1119 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 \n",
1120 | "\n",
1121 | " Highest_score Average Centuries Half_centuries Ducks 0 \\\n",
1122 | "0 248* 53.78 51 68 14 SR Tendulkar \n",
1123 | "1 257 51.85 41 62 17 RT Ponting \n",
1124 | "2 224 55.37 45 58 16 JH Kallis \n",
1125 | "3 270 52.31 36 63 8 R Dravid \n",
1126 | "4 294 45.35 33 57 9 AN Cook \n",
1127 | "\n",
1128 | " 1 \n",
1129 | "0 INDIA) \n",
1130 | "1 AUS) \n",
1131 | "2 ICC/SA) \n",
1132 | "3 ICC/INDIA) \n",
1133 | "4 ENG) "
1134 | ]
1135 | },
1136 | "metadata": {},
1137 | "output_type": "display_data"
1138 | }
1139 | ],
1140 | "source": [
1141 | "df = pd.concat([df, df_player], axis=1)\n",
1142 | "\n",
1143 | "display(df.head())"
1144 | ]
1145 | },
1146 | {
1147 | "cell_type": "markdown",
1148 | "metadata": {},
1149 | "source": [
1150 | "#### How to remove a column?"
1151 | ]
1152 | },
1153 | {
1154 | "cell_type": "code",
1155 | "execution_count": 284,
1156 | "metadata": {},
1157 | "outputs": [
1158 | {
1159 | "data": {
1160 | "text/html": [
1161 | "\n",
1162 | "\n",
1175 | "
\n",
1176 | " \n",
1177 | " \n",
1178 | " | \n",
1179 | " Span | \n",
1180 | " Match | \n",
1181 | " Innings | \n",
1182 | " NotOut | \n",
1183 | " Runs | \n",
1184 | " Highest_score | \n",
1185 | " Average | \n",
1186 | " Centuries | \n",
1187 | " Half_centuries | \n",
1188 | " Ducks | \n",
1189 | " 0 | \n",
1190 | " 1 | \n",
1191 | "
\n",
1192 | " \n",
1193 | " \n",
1194 | " \n",
1195 | " 0 | \n",
1196 | " 1989-2013 | \n",
1197 | " 200 | \n",
1198 | " 329 | \n",
1199 | " 33 | \n",
1200 | " 15921 | \n",
1201 | " 248* | \n",
1202 | " 53.78 | \n",
1203 | " 51 | \n",
1204 | " 68 | \n",
1205 | " 14 | \n",
1206 | " SR Tendulkar | \n",
1207 | " INDIA) | \n",
1208 | "
\n",
1209 | " \n",
1210 | " 1 | \n",
1211 | " 1995-2012 | \n",
1212 | " 168 | \n",
1213 | " 287 | \n",
1214 | " 29 | \n",
1215 | " 13378 | \n",
1216 | " 257 | \n",
1217 | " 51.85 | \n",
1218 | " 41 | \n",
1219 | " 62 | \n",
1220 | " 17 | \n",
1221 | " RT Ponting | \n",
1222 | " AUS) | \n",
1223 | "
\n",
1224 | " \n",
1225 | " 2 | \n",
1226 | " 1995-2013 | \n",
1227 | " 166 | \n",
1228 | " 280 | \n",
1229 | " 40 | \n",
1230 | " 13289 | \n",
1231 | " 224 | \n",
1232 | " 55.37 | \n",
1233 | " 45 | \n",
1234 | " 58 | \n",
1235 | " 16 | \n",
1236 | " JH Kallis | \n",
1237 | " ICC/SA) | \n",
1238 | "
\n",
1239 | " \n",
1240 | " 3 | \n",
1241 | " 1996-2012 | \n",
1242 | " 164 | \n",
1243 | " 286 | \n",
1244 | " 32 | \n",
1245 | " 13288 | \n",
1246 | " 270 | \n",
1247 | " 52.31 | \n",
1248 | " 36 | \n",
1249 | " 63 | \n",
1250 | " 8 | \n",
1251 | " R Dravid | \n",
1252 | " ICC/INDIA) | \n",
1253 | "
\n",
1254 | " \n",
1255 | " 4 | \n",
1256 | " 2006-2018 | \n",
1257 | " 161 | \n",
1258 | " 291 | \n",
1259 | " 16 | \n",
1260 | " 12472 | \n",
1261 | " 294 | \n",
1262 | " 45.35 | \n",
1263 | " 33 | \n",
1264 | " 57 | \n",
1265 | " 9 | \n",
1266 | " AN Cook | \n",
1267 | " ENG) | \n",
1268 | "
\n",
1269 | " \n",
1270 | "
\n",
1271 | "
"
1272 | ],
1273 | "text/plain": [
1274 | " Span Match Innings NotOut Runs Highest_score Average Centuries \\\n",
1275 | "0 1989-2013 200 329 33 15921 248* 53.78 51 \n",
1276 | "1 1995-2012 168 287 29 13378 257 51.85 41 \n",
1277 | "2 1995-2013 166 280 40 13289 224 55.37 45 \n",
1278 | "3 1996-2012 164 286 32 13288 270 52.31 36 \n",
1279 | "4 2006-2018 161 291 16 12472 294 45.35 33 \n",
1280 | "\n",
1281 | " Half_centuries Ducks 0 1 \n",
1282 | "0 68 14 SR Tendulkar INDIA) \n",
1283 | "1 62 17 RT Ponting AUS) \n",
1284 | "2 58 16 JH Kallis ICC/SA) \n",
1285 | "3 63 8 R Dravid ICC/INDIA) \n",
1286 | "4 57 9 AN Cook ENG) "
1287 | ]
1288 | },
1289 | "metadata": {},
1290 | "output_type": "display_data"
1291 | }
1292 | ],
1293 | "source": [
1294 | "# line 1\n",
1295 | "# df = df.drop('Player', axis=1)\n",
1296 | "\n",
1297 | "# line 2\n",
1298 | "df.drop('Player', axis=1, inplace=True)\n",
1299 | "\n",
1300 | "# line 1 and line 2 are equivalent: both remove the 'Player' column from df\n",
1301 | "\n",
1302 | "display(df.head())"
1303 | ]
1304 | },
1305 | {
1306 | "cell_type": "code",
1307 | "execution_count": 285,
1308 | "metadata": {},
1309 | "outputs": [
1310 | {
1311 | "data": {
1312 | "text/html": [
1313 | "\n",
1314 | "\n",
1327 | "
\n",
1328 | " \n",
1329 | " \n",
1330 | " | \n",
1331 | " Span | \n",
1332 | " Match | \n",
1333 | " Innings | \n",
1334 | " NotOut | \n",
1335 | " Runs | \n",
1336 | " Highest_score | \n",
1337 | " Average | \n",
1338 | " Centuries | \n",
1339 | " Half_centuries | \n",
1340 | " Ducks | \n",
1341 | " Player | \n",
1342 | " Country | \n",
1343 | "
\n",
1344 | " \n",
1345 | " \n",
1346 | " \n",
1347 | " 0 | \n",
1348 | " 1989-2013 | \n",
1349 | " 200 | \n",
1350 | " 329 | \n",
1351 | " 33 | \n",
1352 | " 15921 | \n",
1353 | " 248* | \n",
1354 | " 53.78 | \n",
1355 | " 51 | \n",
1356 | " 68 | \n",
1357 | " 14 | \n",
1358 | " SR Tendulkar | \n",
1359 | " INDIA) | \n",
1360 | "
\n",
1361 | " \n",
1362 | " 1 | \n",
1363 | " 1995-2012 | \n",
1364 | " 168 | \n",
1365 | " 287 | \n",
1366 | " 29 | \n",
1367 | " 13378 | \n",
1368 | " 257 | \n",
1369 | " 51.85 | \n",
1370 | " 41 | \n",
1371 | " 62 | \n",
1372 | " 17 | \n",
1373 | " RT Ponting | \n",
1374 | " AUS) | \n",
1375 | "
\n",
1376 | " \n",
1377 | " 2 | \n",
1378 | " 1995-2013 | \n",
1379 | " 166 | \n",
1380 | " 280 | \n",
1381 | " 40 | \n",
1382 | " 13289 | \n",
1383 | " 224 | \n",
1384 | " 55.37 | \n",
1385 | " 45 | \n",
1386 | " 58 | \n",
1387 | " 16 | \n",
1388 | " JH Kallis | \n",
1389 | " ICC/SA) | \n",
1390 | "
\n",
1391 | " \n",
1392 | " 3 | \n",
1393 | " 1996-2012 | \n",
1394 | " 164 | \n",
1395 | " 286 | \n",
1396 | " 32 | \n",
1397 | " 13288 | \n",
1398 | " 270 | \n",
1399 | " 52.31 | \n",
1400 | " 36 | \n",
1401 | " 63 | \n",
1402 | " 8 | \n",
1403 | " R Dravid | \n",
1404 | " ICC/INDIA) | \n",
1405 | "
\n",
1406 | " \n",
1407 | " 4 | \n",
1408 | " 2006-2018 | \n",
1409 | " 161 | \n",
1410 | " 291 | \n",
1411 | " 16 | \n",
1412 | " 12472 | \n",
1413 | " 294 | \n",
1414 | " 45.35 | \n",
1415 | " 33 | \n",
1416 | " 57 | \n",
1417 | " 9 | \n",
1418 | " AN Cook | \n",
1419 | " ENG) | \n",
1420 | "
\n",
1421 | " \n",
1422 | "
\n",
1423 | "
"
1424 | ],
1425 | "text/plain": [
1426 | " Span Match Innings NotOut Runs Highest_score Average Centuries \\\n",
1427 | "0 1989-2013 200 329 33 15921 248* 53.78 51 \n",
1428 | "1 1995-2012 168 287 29 13378 257 51.85 41 \n",
1429 | "2 1995-2013 166 280 40 13289 224 55.37 45 \n",
1430 | "3 1996-2012 164 286 32 13288 270 52.31 36 \n",
1431 | "4 2006-2018 161 291 16 12472 294 45.35 33 \n",
1432 | "\n",
1433 | " Half_centuries Ducks Player Country \n",
1434 | "0 68 14 SR Tendulkar INDIA) \n",
1435 | "1 62 17 RT Ponting AUS) \n",
1436 | "2 58 16 JH Kallis ICC/SA) \n",
1437 | "3 63 8 R Dravid ICC/INDIA) \n",
1438 | "4 57 9 AN Cook ENG) "
1439 | ]
1440 | },
1441 | "metadata": {},
1442 | "output_type": "display_data"
1443 | }
1444 | ],
1445 | "source": [
1446 | "df = df.rename(columns={0: 'Player',\n",
1447 | " 1: 'Country'})\n",
1448 | "\n",
1449 | "display(df.head())"
1450 | ]
1451 | },
1452 | {
1453 | "cell_type": "markdown",
1454 | "metadata": {},
1455 | "source": [
1456 | "#### How to replace/remove a value from a pandas column?"
1457 | ]
1458 | },
1459 | {
1460 | "cell_type": "code",
1461 | "execution_count": 286,
1462 | "metadata": {},
1463 | "outputs": [
1464 | {
1465 | "data": {
1466 | "text/html": [
1467 | "\n",
1468 | "\n",
1481 | "
\n",
1482 | " \n",
1483 | " \n",
1484 | " | \n",
1485 | " Span | \n",
1486 | " Match | \n",
1487 | " Innings | \n",
1488 | " NotOut | \n",
1489 | " Runs | \n",
1490 | " Highest_score | \n",
1491 | " Average | \n",
1492 | " Centuries | \n",
1493 | " Half_centuries | \n",
1494 | " Ducks | \n",
1495 | " Player | \n",
1496 | " Country | \n",
1497 | "
\n",
1498 | " \n",
1499 | " \n",
1500 | " \n",
1501 | " 0 | \n",
1502 | " 1989-2013 | \n",
1503 | " 200 | \n",
1504 | " 329 | \n",
1505 | " 33 | \n",
1506 | " 15921 | \n",
1507 | " 248* | \n",
1508 | " 53.78 | \n",
1509 | " 51 | \n",
1510 | " 68 | \n",
1511 | " 14 | \n",
1512 | " SR Tendulkar | \n",
1513 | " INDIA | \n",
1514 | "
\n",
1515 | " \n",
1516 | " 1 | \n",
1517 | " 1995-2012 | \n",
1518 | " 168 | \n",
1519 | " 287 | \n",
1520 | " 29 | \n",
1521 | " 13378 | \n",
1522 | " 257 | \n",
1523 | " 51.85 | \n",
1524 | " 41 | \n",
1525 | " 62 | \n",
1526 | " 17 | \n",
1527 | " RT Ponting | \n",
1528 | " AUS | \n",
1529 | "
\n",
1530 | " \n",
1531 | " 2 | \n",
1532 | " 1995-2013 | \n",
1533 | " 166 | \n",
1534 | " 280 | \n",
1535 | " 40 | \n",
1536 | " 13289 | \n",
1537 | " 224 | \n",
1538 | " 55.37 | \n",
1539 | " 45 | \n",
1540 | " 58 | \n",
1541 | " 16 | \n",
1542 | " JH Kallis | \n",
1543 | " ICC/SA | \n",
1544 | "
\n",
1545 | " \n",
1546 | " 3 | \n",
1547 | " 1996-2012 | \n",
1548 | " 164 | \n",
1549 | " 286 | \n",
1550 | " 32 | \n",
1551 | " 13288 | \n",
1552 | " 270 | \n",
1553 | " 52.31 | \n",
1554 | " 36 | \n",
1555 | " 63 | \n",
1556 | " 8 | \n",
1557 | " R Dravid | \n",
1558 | " ICC/INDIA | \n",
1559 | "
\n",
1560 | " \n",
1561 | " 4 | \n",
1562 | " 2006-2018 | \n",
1563 | " 161 | \n",
1564 | " 291 | \n",
1565 | " 16 | \n",
1566 | " 12472 | \n",
1567 | " 294 | \n",
1568 | " 45.35 | \n",
1569 | " 33 | \n",
1570 | " 57 | \n",
1571 | " 9 | \n",
1572 | " AN Cook | \n",
1573 | " ENG | \n",
1574 | "
\n",
1575 | " \n",
1576 | "
\n",
1577 | "
"
1578 | ],
1579 | "text/plain": [
1580 | " Span Match Innings NotOut Runs Highest_score Average Centuries \\\n",
1581 | "0 1989-2013 200 329 33 15921 248* 53.78 51 \n",
1582 | "1 1995-2012 168 287 29 13378 257 51.85 41 \n",
1583 | "2 1995-2013 166 280 40 13289 224 55.37 45 \n",
1584 | "3 1996-2012 164 286 32 13288 270 52.31 36 \n",
1585 | "4 2006-2018 161 291 16 12472 294 45.35 33 \n",
1586 | "\n",
1587 | " Half_centuries Ducks Player Country \n",
1588 | "0 68 14 SR Tendulkar INDIA \n",
1589 | "1 62 17 RT Ponting AUS \n",
1590 | "2 58 16 JH Kallis ICC/SA \n",
1591 | "3 63 8 R Dravid ICC/INDIA \n",
1592 | "4 57 9 AN Cook ENG "
1593 | ]
1594 | },
1595 | "metadata": {},
1596 | "output_type": "display_data"
1597 | }
1598 | ],
1599 | "source": [
1600 | "df['Country'] = df['Country'].str.replace(\")\", \"\")\n",
1601 | "\n",
1602 | "display(df.head())"
1603 | ]
1604 | },
1605 | {
1606 | "cell_type": "code",
1607 | "execution_count": 287,
1608 | "metadata": {},
1609 | "outputs": [
1610 | {
1611 | "name": "stdout",
1612 | "output_type": "stream",
1613 | "text": [
1614 | "Index(['Span', 'Match', 'Innings', 'NotOut', 'Runs', 'Highest_score',\n",
1615 | " 'Average', 'Centuries', 'Half_centuries', 'Ducks', 'Player', 'Country'],\n",
1616 | " dtype='object')\n"
1617 | ]
1618 | }
1619 | ],
1620 | "source": [
1621 | "print(df.columns)\n",
1622 | "\n",
1623 | "new_col_sequence = ['Player', 'Country', 'Span', 'Match', 'Innings', 'NotOut', 'Runs', 'Highest_score',\n",
1624 | " 'Average', 'Centuries', 'Half_centuries', 'Ducks']"
1625 | ]
1626 | },
1627 | {
1628 | "cell_type": "code",
1629 | "execution_count": 288,
1630 | "metadata": {},
1631 | "outputs": [
1632 | {
1633 | "data": {
1634 | "text/html": [
1635 | "\n",
1636 | "\n",
1649 | "
\n",
1650 | " \n",
1651 | " \n",
1652 | " | \n",
1653 | " Player | \n",
1654 | " Country | \n",
1655 | " Span | \n",
1656 | " Match | \n",
1657 | " Innings | \n",
1658 | " NotOut | \n",
1659 | " Runs | \n",
1660 | " Highest_score | \n",
1661 | " Average | \n",
1662 | " Centuries | \n",
1663 | " Half_centuries | \n",
1664 | " Ducks | \n",
1665 | "
\n",
1666 | " \n",
1667 | " \n",
1668 | " \n",
1669 | " 0 | \n",
1670 | " SR Tendulkar | \n",
1671 | " INDIA | \n",
1672 | " 1989-2013 | \n",
1673 | " 200 | \n",
1674 | " 329 | \n",
1675 | " 33 | \n",
1676 | " 15921 | \n",
1677 | " 248* | \n",
1678 | " 53.78 | \n",
1679 | " 51 | \n",
1680 | " 68 | \n",
1681 | " 14 | \n",
1682 | "
\n",
1683 | " \n",
1684 | " 1 | \n",
1685 | " RT Ponting | \n",
1686 | " AUS | \n",
1687 | " 1995-2012 | \n",
1688 | " 168 | \n",
1689 | " 287 | \n",
1690 | " 29 | \n",
1691 | " 13378 | \n",
1692 | " 257 | \n",
1693 | " 51.85 | \n",
1694 | " 41 | \n",
1695 | " 62 | \n",
1696 | " 17 | \n",
1697 | "
\n",
1698 | " \n",
1699 | " 2 | \n",
1700 | " JH Kallis | \n",
1701 | " ICC/SA | \n",
1702 | " 1995-2013 | \n",
1703 | " 166 | \n",
1704 | " 280 | \n",
1705 | " 40 | \n",
1706 | " 13289 | \n",
1707 | " 224 | \n",
1708 | " 55.37 | \n",
1709 | " 45 | \n",
1710 | " 58 | \n",
1711 | " 16 | \n",
1712 | "
\n",
1713 | " \n",
1714 | " 3 | \n",
1715 | " R Dravid | \n",
1716 | " ICC/INDIA | \n",
1717 | " 1996-2012 | \n",
1718 | " 164 | \n",
1719 | " 286 | \n",
1720 | " 32 | \n",
1721 | " 13288 | \n",
1722 | " 270 | \n",
1723 | " 52.31 | \n",
1724 | " 36 | \n",
1725 | " 63 | \n",
1726 | " 8 | \n",
1727 | "
\n",
1728 | " \n",
1729 | " 4 | \n",
1730 | " AN Cook | \n",
1731 | " ENG | \n",
1732 | " 2006-2018 | \n",
1733 | " 161 | \n",
1734 | " 291 | \n",
1735 | " 16 | \n",
1736 | " 12472 | \n",
1737 | " 294 | \n",
1738 | " 45.35 | \n",
1739 | " 33 | \n",
1740 | " 57 | \n",
1741 | " 9 | \n",
1742 | "
\n",
1743 | " \n",
1744 | "
\n",
1745 | "
"
1746 | ],
1747 | "text/plain": [
1748 | " Player Country Span Match Innings NotOut Runs \\\n",
1749 | "0 SR Tendulkar INDIA 1989-2013 200 329 33 15921 \n",
1750 | "1 RT Ponting AUS 1995-2012 168 287 29 13378 \n",
1751 | "2 JH Kallis ICC/SA 1995-2013 166 280 40 13289 \n",
1752 | "3 R Dravid ICC/INDIA 1996-2012 164 286 32 13288 \n",
1753 | "4 AN Cook ENG 2006-2018 161 291 16 12472 \n",
1754 | "\n",
1755 | " Highest_score Average Centuries Half_centuries Ducks \n",
1756 | "0 248* 53.78 51 68 14 \n",
1757 | "1 257 51.85 41 62 17 \n",
1758 | "2 224 55.37 45 58 16 \n",
1759 | "3 270 52.31 36 63 8 \n",
1760 | "4 294 45.35 33 57 9 "
1761 | ]
1762 | },
1763 | "metadata": {},
1764 | "output_type": "display_data"
1765 | }
1766 | ],
1767 | "source": [
1768 | "df = df[new_col_sequence]  # select the columns in the order given by new_col_sequence\n",
1769 | "\n",
1770 | "display(df.head(5))  # preview the first five rows of the reordered frame"
1771 | ]
1772 | },
1773 | {
1774 | "cell_type": "code",
1775 | "execution_count": null,
1776 | "metadata": {},
1777 | "outputs": [],
1778 | "source": []
1779 | }
1780 | ],
1781 | "metadata": {
1782 | "kernelspec": {
1783 | "display_name": "Python 3",
1784 | "language": "python",
1785 | "name": "python3"
1786 | },
1787 | "language_info": {
1788 | "codemirror_mode": {
1789 | "name": "ipython",
1790 | "version": 3
1791 | },
1792 | "file_extension": ".py",
1793 | "mimetype": "text/x-python",
1794 | "name": "python",
1795 | "nbconvert_exporter": "python",
1796 | "pygments_lexer": "ipython3",
1797 | "version": "3.8.5"
1798 | }
1799 | },
1800 | "nbformat": 4,
1801 | "nbformat_minor": 4
1802 | }
1803 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Analysis with Python
2 |
3 | #### Lecture 01: Importing Data with Pandas
4 | - challenges of reading a .csv file
5 | - How to deal with UnicodeDecodeError?
6 | - reading a csv file by changing the engine
7 | - choose columns by name before reading a csv file
8 | - choose columns by number before reading a csv file
9 | - reading only the first n number of rows
10 |
11 | #### Lecture 02: Data Preprocessing with Pandas
12 | - reading a .txt (text) or an excel (.xlsx) file
13 | - dealing with the UnicodeDecodeError
14 | - renaming column names
15 | - creating a new DataFrame
16 | - concatenation of two dataframes
17 | - column splitting
18 | - creating a new column in a dataframe
19 | - replace/removing a value from a pandas column
20 | - removing a column from the dataframe
21 |
22 | #### Lecture 03: HW review session
23 |
24 | #### Lecture 04: Data Preprocessing with Pandas
25 | - How to extract new information from a column?
26 | - How to create a column based on a condition or function?
27 | - Removing a string from a column
28 | - Checking the unique values for each column
29 | - performing calculation in dataframe columns
30 | - dataframe sorting
31 | - dataframe slicing
32 |
33 | #### Lecture 05: Data Cleaning - Handling Missing Values
34 | - performing data cleaning
35 | - data visualization of missing values
36 | - string to datetime conversion
37 | - removing missing values
38 | - replacing missing values by: 1. mean, 2. median, 3. constant, 4. interpolation, 5. forward imputation, 6. backward imputation
39 |
40 | #### Lecture 06: Data Joining/Merging using Pandas
41 | - inner join, outer join, left join, right join
42 |
43 | #### Lecture 07: Data Aggregation/grouping and Pivot table using Pandas
44 | - Data filtering
45 | - Data preprocessing
46 | - Data Aggregation/grouping
47 | - Pivot table
48 | - Data Visualization: Barplot
49 |
50 | #### Lecture 08: Data Correlation and Categorical Variable Encoding
51 | - Data Correlation
52 | - Heatmap
53 | - Dealing with categorical variables
54 | - Label encoding
55 | - One-hot encoding
56 | - Categorical variable creation from the numeric variable
57 |
--------------------------------------------------------------------------------
/friends.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/friends.xlsx
--------------------------------------------------------------------------------
/gre.csv:
--------------------------------------------------------------------------------
1 | Date,verbal_score,quant_score
2 | 09/01/2021,0,1
3 | 09/02/2021,1,2
4 | 09/03/2021,2,3
5 | 09/04/2021,3,
6 | 09/05/2021,4,
7 | 09/06/2021,,
8 | 09/07/2021,,7
9 | 09/08/2021,7,8
10 | 09/09/2021,8,9
11 | 09/10/2021,9,10
--------------------------------------------------------------------------------
/most_runs_in_test_cricket.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/most_runs_in_test_cricket.csv
--------------------------------------------------------------------------------
/most_runs_in_test_cricket.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/most_runs_in_test_cricket.txt
--------------------------------------------------------------------------------
/test_cricket.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/test_cricket.xlsx
--------------------------------------------------------------------------------
/wickets.csv:
--------------------------------------------------------------------------------
1 | Player,Span,Mat,Inns,Balls,Runs,Wkts,BBI,BBM,Ave,Econ,SR,5,10
2 | M Muralitharan (ICC/SL),1992-2010,133,230,44039,18180,800,9/51,16/220,22.72,2.47,55,67,22
3 | SK Warne (AUS),1992-2007,145,273,40705,17995,708,8/71,12/128,25.41,2.65,57.4,37,10
4 | JM Anderson (ENG),2003-2021,164*,304,35079,16575,623,7/42,11/71,26.6,2.83,56.3,30,3
5 | A Kumble (INDIA),1990-2008,132,236,40850,18355,619,10/74,14/149,29.65,2.69,65.9,35,8
6 | GD McGrath (AUS),1993-2007,124,243,29248,12186,563,8/24,10/27,21.64,2.49,51.9,29,3
7 | SCJ Broad (ENG),2007-2021,149,274,29863,14590,524,8/15,11/121,27.84,2.93,56.9,18,3
8 | CA Walsh (WI),1984-2001,132,242,30019,12688,519,7/37,13/55,24.44,2.53,57.8,22,3
9 | DW Steyn (SA),2004-2019,93,171,18608,10077,439,7/51,11/60,22.95,3.24,42.3,26,5
10 | N Kapil Dev (INDIA),1978-1994,131,227,27740,12867,434,9/83,11/146,29.64,2.78,63.9,23,2
11 | HMRKB Herath (SL),1999-2018,93,170,25993,12157,433,9/127,14/184,28.07,2.8,60,34,9
12 | Sir RJ Hadlee (NZ),1973-1990,86,150,21918,9611,431,9/52,15/123,22.29,2.63,50.8,36,9
13 | SM Pollock (SA),1995-2008,108,202,24353,9733,421,7/87,10/147,23.11,2.39,57.8,16,1
14 | Harbhajan Singh (INDIA),1998-2015,103,190,28580,13537,417,8/84,15/217,32.46,2.84,68.5,25,5
15 | Wasim Akram (PAK),1985-2002,104,181,22627,9779,414,7/119,11/110,23.62,2.59,54.6,25,5
16 | R Ashwin (INDIA),2011-2021,79,148,21670,10144,413,7/59,13/140,24.56,2.8,52.4,30,7
17 | CEL Ambrose (WI),1988-2000,98,179,22103,8501,405,8/45,11/84,20.99,2.3,54.5,22,3
18 | NM Lyon (AUS),2011-2021,100,191,25690,12816,399,8/50,13/154,32.12,2.99,64.3,18,3
19 | M Ntini (SA),1998-2009,101,190,20834,11242,390,7/37,13/132,28.82,3.23,53.4,18,4
20 | IT Botham (ENG),1977-1992,102,168,21815,10878,383,8/34,13/106,28.4,2.99,56.9,27,4
21 | MD Marshall (WI),1978-1991,81,151,17584,7876,376,7/22,11/89,20.94,2.68,46.7,22,4
22 | Waqar Younis (PAK),1989-2003,87,154,16224,8788,373,7/76,13/135,23.56,3.25,43.4,22,5
23 | Imran Khan (PAK),1971-1992,88,142,19458,8258,362,8/58,14/116,22.81,2.54,53.7,23,6
24 | DL Vettori (ICC/NZ),1997-2014,113,187,28814,12441,362,7/87,12/149,34.36,2.59,79.5,20,3
25 | DK Lillee (AUS),1971-1984,70,132,18467,8493,355,7/83,11/123,23.92,2.75,52,23,7
26 | WPUJC Vaas (SL),1994-2009,111,194,23438,10501,355,7/71,14/191,29.58,2.68,66,12,2
27 | AA Donald (SA),1992-2002,72,129,15519,7344,330,8/71,12/139,22.25,2.83,47,20,3
28 | RGD Willis (ENG),1971-1984,90,165,17357,8190,325,8/43,9/92,25.2,2.83,53.4,16,0
29 | TG Southee (NZ),2008-2021,79,148,17886,8862,314,7/64,10/108,28.22,2.97,56.9,12,1
30 | MG Johnson (AUS),2007-2015,73,140,16001,8891,313,8/61,12/127,28.4,3.33,51.1,12,3
31 | Z Khan (INDIA),2000-2014,92,165,18785,10247,311,7/87,10/149,32.94,3.27,60.4,11,1
32 | B Lee (AUS),1999-2008,76,150,16531,9554,310,5/30,9/171,30.81,3.46,53.3,10,0
33 | M Morkel (SA),2006-2018,86,160,16498,8550,309,6/23,9/110,27.66,3.1,53.3,8,0
34 | LR Gibbs (WI),1958-1976,79,148,27115,8989,309,8/38,11/157,29.09,1.98,87.7,18,2
35 | FS Trueman (ENG),1952-1965,67,127,15178,6625,307,8/31,12/119,21.57,2.61,49.4,17,3
36 | I Sharma (INDIA),2007-2021,103*,183,18692,9849,306,7/74,10/108,32.18,3.16,61,11,1
37 | DL Underwood (ENG),1966-1982,86,151,21862,7674,297,8/51,13/71,25.83,2.1,73.6,17,6
38 | TA Boult (NZ),2011-2021,73,139,16271,8080,292,6/30,10/80,27.67,2.97,55.7,8,1
39 | JH Kallis (ICC/SA),1995-2013,166,272,20232,9535,292,6/54,9/92,32.65,2.82,69.2,5,0
40 | CJ McDermott (AUS),1984-1996,71,124,16586,8332,291,8/97,11/157,28.63,3.01,56.9,14,2
41 | BS Bedi (INDIA),1966-1979,67,118,21364,7637,266,7/98,10/194,28.71,2.14,80.3,14,1
42 | Danish Kaneria (PAK),2000-2010,61,112,17697,9082,261,7/77,12/94,34.79,3.07,67.8,15,2
43 | J Garner (WI),1977-1987,58,111,13169,5433,259,6/56,9/108,20.97,2.47,50.8,7,0
44 | JN Gillespie (AUS),1996-2006,71,137,14234,6770,259,7/37,9/80,26.13,2.85,54.9,8,0
45 | MA Starc (AUS),2011-2021,61,117,12575,7031,255,6/50,11/94,27.57,3.35,49.3,13,2
46 | GP Swann (ENG),2008-2013,60,109,15349,7642,255,6/65,10/132,29.96,2.98,60.1,17,3
47 | JB Statham (ENG),1951-1965,70,129,16056,6261,252,7/39,11/97,24.84,2.33,63.7,9,1
48 | MA Holding (WI),1975-1987,60,113,12680,5898,249,8/92,14/149,23.68,2.79,50.9,13,2
49 | R Benaud (AUS),1952-1964,63,116,19108,6704,248,7/72,11/105,27.03,2.1,77,16,1
50 | MJ Hoggard (ENG),2000-2008,67,122,13909,7564,248,7/61,12/205,30.5,3.26,56,7,1
51 | GD McKenzie (AUS),1961-1971,60,113,17681,7328,246,8/71,10/91,29.78,2.48,71.8,16,3
52 | BS Chandrasekhar (INDIA),1964-1979,58,97,15963,7199,242,8/79,12/104,29.74,2.7,65.9,16,2
53 | AV Bedser (ENG),1946-1955,51,92,15918,5876,236,7/44,14/99,24.89,2.21,67.4,15,5
54 | J Srinath (INDIA),1991-2002,67,121,15104,7196,236,8/86,13/132,30.49,2.85,64,10,1
55 | Abdul Qadir (PAK),1977-1990,67,111,17126,7742,236,9/56,13/101,32.8,2.71,72.5,15,5
56 | Yasir Shah (PAK),2014-2021,46*,84,13607,7248,235,8/41,14/184,30.84,3.19,57.9,16,3
57 | GS Sobers (WI),1954-1974,93,159,21599,7999,235,6/73,8/80,34.03,2.22,91.9,6,0
58 | AR Caddick (ENG),1993-2003,62,105,13558,6999,234,7/46,10/215,29.91,3.09,57.9,13,1
59 | CS Martin (NZ),2000-2013,71,126,14026,7878,233,6/26,11/180,33.81,3.37,60.1,10,1
60 | N Wagner (NZ),2012-2021,54,102,11991,6046,229,7/39,9/73,26.4,3.02,52.3,9,0
61 | D Gough (ENG),1994-2003,58,95,11821,6503,229,6/42,9/92,28.39,3.3,51.6,9,0
62 | RR Lindwall (AUS),1946-1960,61,113,13650,5251,228,7/38,9/70,23.03,2.3,59.8,12,0
63 | SJ Harmison (ENG/ICC),2002-2009,63,115,13375,7192,226,7/12,11/76,31.82,3.22,59.1,8,1
64 | A Flintoff (ENG/ICC),1998-2009,79,137,14951,7410,226,5/58,8/156,32.78,2.97,66.1,3,0
65 | KAJ Roach (WI),2009-2021,66*,117,11924,6141,225,6/48,10/146,27.29,3.09,52.9,9,1
66 | VD Philander (SA),2011-2020,64,119,11391,5000,224,6/21,10/102,22.32,2.63,50.8,13,2
67 | RA Jadeja (INDIA),2012-2021,54*,101,13325,5446,221,7/48,10/154,24.64,2.45,60.2,9,1
68 | PM Siddle (AUS),2008-2019,67,126,13907,6777,221,6/54,9/104,30.66,2.92,62.9,8,0
69 | CL Cairns (NZ),1989-2004,62,104,11698,6410,218,7/27,10/100,29.4,3.28,53.6,13,1
70 | CV Grimmett (AUS),1925-1936,37,67,14513,5231,216,7/40,14/199,24.21,2.16,67.1,21,7
71 | HH Streak (ZIM),1993-2005,65,102,13559,6079,216,6/73,9/72,28.14,2.69,62.7,7,0
72 | Shakib Al Hasan (BDESH),2007-2021,58,98,13415,6679,215,7/36,10/124,31.06,2.98,62.3,18,2
73 | K Rabada (SA),2015-2021,47,86,8785,4846,213,7/112,13/144,22.75,3.3,41.2,10,4
74 | JR Hazlewood (AUS),2014-2021,55,103,11887,5438,212,6/67,9/115,25.65,2.74,56,9,0
75 | MG Hughes (AUS),1985-1994,53,97,12285,6017,212,8/87,13/217,28.38,2.93,57.9,7,1
76 | SCG MacGill (AUS),1998-2008,44,85,11237,6038,208,8/108,12/107,29.02,3.22,54,12,2
77 | Saqlain Mushtaq (PAK),1995-2004,49,86,14070,6206,208,8/164,10/155,29.83,2.64,67.6,13,3
78 | AME Roberts (WI),1974-1983,47,90,11135,5174,202,7/54,12/121,25.61,2.78,55.1,11,2
79 | JA Snow (ENG),1965-1976,49,93,12021,5387,202,7/40,10/142,26.66,2.68,59.5,8,1
80 | JR Thomson (AUS),1972-1985,51,90,10535,5601,200,6/46,9/105,28,3.18,52.6,8,0
--------------------------------------------------------------------------------