├── Lecture_1.ipynb ├── Lecture_2.ipynb ├── Lecture_4.ipynb ├── Lecture_5.ipynb ├── Lecture_6.ipynb ├── Lecture_7.ipynb ├── Lecture_8.ipynb ├── README.md ├── bmw.csv ├── friends.xlsx ├── gre.csv ├── most_runs_in_test_cricket.csv ├── most_runs_in_test_cricket.txt ├── test_cricket.xlsx └── wickets.csv /Lecture_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Analysis with Python\n", 8 | "## Lecture 01: Importing Data with Pandas\n", 9 | "
Instructor: Md Shahidullah Kawsar\n", 10 | "
Data Scientist, IDARE, Houston, TX, USA\n", 11 | "\n", 12 | "**Objectives:**\n", 13 | "- challenges of reading a .csv file\n", 14 | "- dealing with a UnicodeDecodeError\n", 15 | "- reading a csv file by changing the engine\n", 16 | "- choosing columns by name before reading a csv file\n", 17 | "- choosing columns by number before reading a csv file\n", 18 | "- reading only the first n rows\n", 19 | "\n", 20 | "**References:**\n", 21 | "
[1] Data Source: https://stats.espncricinfo.com/ci/content/records/223646.html\n", 22 | "
[2] https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html\n", 23 | "
[3] https://docs.python.org/3/library/codecs.html#standard-encodings\n", 24 | "
[4] https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "#### Import required libraries" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 49, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pandas as pd\n", 41 | "import numpy as np" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "#### How to read a csv file?\n", 49 | "#### How to deal with UnicodeDecodeError?" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 50, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | "
PlayerSpanMatInnsNORunsHSAve100500
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
\n", 164 | "
" 165 | ], 166 | "text/plain": [ 167 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n", 168 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n", 169 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n", 170 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n", 171 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n", 172 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n", 173 | "\n", 174 | " 50 0 \n", 175 | "0 68 14 \n", 176 | "1 62 17 \n", 177 | "2 58 16 \n", 178 | "3 63 8 \n", 179 | "4 57 9 " 180 | ] 181 | }, 182 | "metadata": {}, 183 | "output_type": "display_data" 184 | }, 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "(97, 11)\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# method 1\n", 195 | "# reading a csv file \n", 196 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = \"ISO-8859-1\")\n", 197 | "\n", 198 | "display(df.head())\n", 199 | "# print(df.tail())\n", 200 | "print(df.shape)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 51, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/html": [ 211 | "
\n", 212 | "\n", 225 | "\n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | "
PlayerSpanMatInnsNORunsHSAve100500
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
\n", 315 | "
" 316 | ], 317 | "text/plain": [ 318 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n", 319 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n", 320 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n", 321 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n", 322 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n", 323 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n", 324 | "\n", 325 | " 50 0 \n", 326 | "0 68 14 \n", 327 | "1 62 17 \n", 328 | "2 58 16 \n", 329 | "3 63 8 \n", 330 | "4 57 9 " 331 | ] 332 | }, 333 | "metadata": {}, 334 | "output_type": "display_data" 335 | } 336 | ], 337 | "source": [ 338 | "# method 2\n", 339 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape')\n", 340 | "\n", 341 | "display(df.head())" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 52, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/html": [ 352 | "
\n", 353 | "\n", 366 | "\n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | "
PlayerSpanMatInnsNORunsHSAve100500
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
\n", 456 | "
" 457 | ], 458 | "text/plain": [ 459 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n", 460 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n", 461 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n", 462 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n", 463 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n", 464 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n", 465 | "\n", 466 | " 50 0 \n", 467 | "0 68 14 \n", 468 | "1 62 17 \n", 469 | "2 58 16 \n", 470 | "3 63 8 \n", 471 | "4 57 9 " 472 | ] 473 | }, 474 | "metadata": {}, 475 | "output_type": "display_data" 476 | } 477 | ], 478 | "source": [ 479 | "# method 3\n", 480 | "# reading a csv file by changing the engine\n", 481 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", engine = 'python')\n", 482 | "\n", 483 | "# removing the weird \"�\" symbol from the 'Player' column\n", 484 | "df['Player'] = df['Player'].str.replace(\"�\", \" \")\n", 485 | "display(df.head())" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 53, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "number of rows = 97\n", 498 | "number of columns = 11\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "# number of rows\n", 504 | "print(\"number of rows = \", df.shape[0])\n", 505 | "\n", 506 | "# number of columns\n", 507 | "print(\"number of columns = \", df.shape[1])" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 54, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "\n", 520 | "RangeIndex: 97 entries, 0 to 96\n", 521 | "Data columns (total 11 columns):\n", 522 | " # Column Non-Null Count Dtype \n", 523 | "--- ------ -------------- ----- \n", 524 | " 0 Player 97 non-null object \n", 525 | " 1 Span 97 non-null object \n", 526 | " 2 Mat 97 non-null 
int64 \n", 527 | " 3 Inns 97 non-null int64 \n", 528 | " 4 NO 97 non-null int64 \n", 529 | " 5 Runs 97 non-null int64 \n", 530 | " 6 HS 97 non-null object \n", 531 | " 7 Ave 97 non-null float64\n", 532 | " 8 100 97 non-null int64 \n", 533 | " 9 50 97 non-null int64 \n", 534 | " 10 0 97 non-null int64 \n", 535 | "dtypes: float64(1), int64(7), object(3)\n", 536 | "memory usage: 8.5+ KB\n", 537 | "None\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "# checking for missing values and data types of each column\n", 543 | "print(df.info())" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 55, 549 | "metadata": {}, 550 | "outputs": [ 551 | { 552 | "data": { 553 | "text/html": [ 554 | "
\n", 555 | "\n", 568 | "\n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
MatInnsNORunsAve100500
count97.00000097.00000097.00000097.00000097.00000097.00000097.00000097.000000
mean104.979381178.75257716.0515467574.17525846.78103120.54639235.47422711.329897
std27.06472944.9634188.7540122224.2552788.1682688.22600111.4991784.147594
min52.00000080.0000005.0000005062.00000030.3000004.00000013.0000002.000000
25%86.000000146.00000010.0000005825.00000042.29000015.00000027.0000009.000000
50%102.000000176.00000015.0000007214.00000045.84000019.00000033.00000011.000000
75%117.000000200.00000020.0000008540.00000050.66000024.00000042.00000014.000000
max200.000000329.00000049.00000015921.00000099.94000051.00000068.00000022.000000
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " Mat Inns NO Runs Ave 100 \\\n", 677 | "count 97.000000 97.000000 97.000000 97.000000 97.000000 97.000000 \n", 678 | "mean 104.979381 178.752577 16.051546 7574.175258 46.781031 20.546392 \n", 679 | "std 27.064729 44.963418 8.754012 2224.255278 8.168268 8.226001 \n", 680 | "min 52.000000 80.000000 5.000000 5062.000000 30.300000 4.000000 \n", 681 | "25% 86.000000 146.000000 10.000000 5825.000000 42.290000 15.000000 \n", 682 | "50% 102.000000 176.000000 15.000000 7214.000000 45.840000 19.000000 \n", 683 | "75% 117.000000 200.000000 20.000000 8540.000000 50.660000 24.000000 \n", 684 | "max 200.000000 329.000000 49.000000 15921.000000 99.940000 51.000000 \n", 685 | "\n", 686 | " 50 0 \n", 687 | "count 97.000000 97.000000 \n", 688 | "mean 35.474227 11.329897 \n", 689 | "std 11.499178 4.147594 \n", 690 | "min 13.000000 2.000000 \n", 691 | "25% 27.000000 9.000000 \n", 692 | "50% 33.000000 11.000000 \n", 693 | "75% 42.000000 14.000000 \n", 694 | "max 68.000000 22.000000 " 695 | ] 696 | }, 697 | "metadata": {}, 698 | "output_type": "display_data" 699 | } 700 | ], 701 | "source": [ 702 | "# checking data statistics\n", 703 | "display(df.describe())" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 56, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "name": "stdout", 713 | "output_type": "stream", 714 | "text": [ 715 | "Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', '100', '50',\n", 716 | " '0'],\n", 717 | " dtype='object')\n" 718 | ] 719 | } 720 | ], 721 | "source": [ 722 | "# column names\n", 723 | "print(df.columns)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "#### choose columns by name to read a csv file" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 57, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "data": { 740 | "text/html": [ 741 | "
\n", 742 | "\n", 755 | "\n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | "
PlayerMatRuns100
0SR Tendulkar (INDIA)2001592151
1RT Ponting (AUS)1681337841
2JH Kallis (ICC/SA)1661328945
3R Dravid (ICC/INDIA)1641328836
4AN Cook (ENG)1611247233
5KC Sangakkara (SL)1341240038
6BC Lara (ICC/WI)1311195334
7S Chanderpaul (WI)1641186730
8DPMD Jayawardene (SL)1491181434
9AR Border (AUS)1561117427
\n", 838 | "
" 839 | ], 840 | "text/plain": [ 841 | " Player Mat Runs 100\n", 842 | "0 SR Tendulkar (INDIA) 200 15921 51\n", 843 | "1 RT Ponting (AUS) 168 13378 41\n", 844 | "2 JH Kallis (ICC/SA) 166 13289 45\n", 845 | "3 R Dravid (ICC/INDIA) 164 13288 36\n", 846 | "4 AN Cook (ENG) 161 12472 33\n", 847 | "5 KC Sangakkara (SL) 134 12400 38\n", 848 | "6 BC Lara (ICC/WI) 131 11953 34\n", 849 | "7 S Chanderpaul (WI) 164 11867 30\n", 850 | "8 DPMD Jayawardene (SL) 149 11814 34\n", 851 | "9 AR Border (AUS) 156 11174 27" 852 | ] 853 | }, 854 | "metadata": {}, 855 | "output_type": "display_data" 856 | } 857 | ], 858 | "source": [ 859 | "col_names = ['Player', 'Mat', 'Runs', '100']\n", 860 | "df_usecols = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape', usecols=col_names)\n", 861 | "\n", 862 | "display(df_usecols.head(10))" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 58, 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "name": "stdout", 872 | "output_type": "stream", 873 | "text": [ 874 | "(97, 11)\n", 875 | "(97, 4)\n" 876 | ] 877 | } 878 | ], 879 | "source": [ 880 | "print(df.shape)\n", 881 | "\n", 882 | "# selecting columns after data importing\n", 883 | "df = df[col_names]\n", 884 | "\n", 885 | "print(df.shape)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "markdown", 890 | "metadata": {}, 891 | "source": [ 892 | "#### choose columns by number to read a csv file" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 59, 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/html": [ 903 | "
\n", 904 | "\n", 917 | "\n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | "
PlayerMatRuns100
0SR Tendulkar (INDIA)2001592151
1RT Ponting (AUS)1681337841
2JH Kallis (ICC/SA)1661328945
3R Dravid (ICC/INDIA)1641328836
4AN Cook (ENG)1611247233
5KC Sangakkara (SL)1341240038
6BC Lara (ICC/WI)1311195334
7S Chanderpaul (WI)1641186730
8DPMD Jayawardene (SL)1491181434
9AR Border (AUS)1561117427
\n", 1000 | "
" 1001 | ], 1002 | "text/plain": [ 1003 | " Player Mat Runs 100\n", 1004 | "0 SR Tendulkar (INDIA) 200 15921 51\n", 1005 | "1 RT Ponting (AUS) 168 13378 41\n", 1006 | "2 JH Kallis (ICC/SA) 166 13289 45\n", 1007 | "3 R Dravid (ICC/INDIA) 164 13288 36\n", 1008 | "4 AN Cook (ENG) 161 12472 33\n", 1009 | "5 KC Sangakkara (SL) 134 12400 38\n", 1010 | "6 BC Lara (ICC/WI) 131 11953 34\n", 1011 | "7 S Chanderpaul (WI) 164 11867 30\n", 1012 | "8 DPMD Jayawardene (SL) 149 11814 34\n", 1013 | "9 AR Border (AUS) 156 11174 27" 1014 | ] 1015 | }, 1016 | "metadata": {}, 1017 | "output_type": "display_data" 1018 | }, 1019 | { 1020 | "name": "stdout", 1021 | "output_type": "stream", 1022 | "text": [ 1023 | "(97, 4)\n" 1024 | ] 1025 | } 1026 | ], 1027 | "source": [ 1028 | "col_nums = [0, 2, 5, 8]\n", 1029 | "df_usecols_index = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape', usecols=col_nums)\n", 1030 | "\n", 1031 | "display(df_usecols_index.head(10))\n", 1032 | "print(df_usecols_index.shape)" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "markdown", 1037 | "metadata": {}, 1038 | "source": [ 1039 | "#### reading only the first n number of rows" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": 60, 1045 | "metadata": {}, 1046 | "outputs": [ 1047 | { 1048 | "data": { 1049 | "text/html": [ 1050 | "
\n", 1051 | "\n", 1064 | "\n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " 
\n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | "
PlayerSpanMatInnsNORunsHSAve100500
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
5KC Sangakkara (SL)2000-2015134233171240031957.40385211
6BC Lara (ICC/WI)1990-2006131232611953400*52.88344817
7S Chanderpaul (WI)1994-20151642804911867203*51.37306615
8DPMD Jayawardene (SL)1997-2014149252151181437449.84345015
9AR Border (AUS)1978-1994156265441117420550.56276311
10SR Waugh (AUS)1985-2004168260461092720051.06325022
11SM Gavaskar (INDIA)1971-19871252141610122236*51.12344512
12Younis Khan (PAK)2000-2017118213191009931352.05343319
13HM Amla (SA)2004-2019124215169282311*46.64284113
14GC Smith (ICC/SA)2002-201411720513926527748.25273811
\n", 1294 | "
" 1295 | ], 1296 | "text/plain": [ 1297 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n", 1298 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n", 1299 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n", 1300 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n", 1301 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n", 1302 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n", 1303 | "5 KC Sangakkara (SL) 2000-2015 134 233 17 12400 319 57.40 38 \n", 1304 | "6 BC Lara (ICC/WI) 1990-2006 131 232 6 11953 400* 52.88 34 \n", 1305 | "7 S Chanderpaul (WI) 1994-2015 164 280 49 11867 203* 51.37 30 \n", 1306 | "8 DPMD Jayawardene (SL) 1997-2014 149 252 15 11814 374 49.84 34 \n", 1307 | "9 AR Border (AUS) 1978-1994 156 265 44 11174 205 50.56 27 \n", 1308 | "10 SR Waugh (AUS) 1985-2004 168 260 46 10927 200 51.06 32 \n", 1309 | "11 SM Gavaskar (INDIA) 1971-1987 125 214 16 10122 236* 51.12 34 \n", 1310 | "12 Younis Khan (PAK) 2000-2017 118 213 19 10099 313 52.05 34 \n", 1311 | "13 HM Amla (SA) 2004-2019 124 215 16 9282 311* 46.64 28 \n", 1312 | "14 GC Smith (ICC/SA) 2002-2014 117 205 13 9265 277 48.25 27 \n", 1313 | "\n", 1314 | " 50 0 \n", 1315 | "0 68 14 \n", 1316 | "1 62 17 \n", 1317 | "2 58 16 \n", 1318 | "3 63 8 \n", 1319 | "4 57 9 \n", 1320 | "5 52 11 \n", 1321 | "6 48 17 \n", 1322 | "7 66 15 \n", 1323 | "8 50 15 \n", 1324 | "9 63 11 \n", 1325 | "10 50 22 \n", 1326 | "11 45 12 \n", 1327 | "12 33 19 \n", 1328 | "13 41 13 \n", 1329 | "14 38 11 " 1330 | ] 1331 | }, 1332 | "metadata": {}, 1333 | "output_type": "display_data" 1334 | }, 1335 | { 1336 | "name": "stdout", 1337 | "output_type": "stream", 1338 | "text": [ 1339 | "(50, 11)\n" 1340 | ] 1341 | } 1342 | ], 1343 | "source": [ 1344 | "df = pd.read_csv(\"most_runs_in_test_cricket.csv\", encoding = 'unicode_escape', nrows=50)\n", 1345 | "\n", 1346 | "display(df.head(15))\n", 1347 | "print(df.shape)" 1348 | ] 1349 | }, 1350 | { 1351 | "cell_type": 
"code", 1352 | "execution_count": 61, 1353 | "metadata": {}, 1354 | "outputs": [ 1355 | { 1356 | "data": { 1357 | "text/html": [ 1358 | "
\n", 1359 | "\n", 1372 | "\n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | "
PlayerSpanMatInnsNORunsHSAve100500
38SPD Smith (AUS)2010-20217713917754023961.8027315
34MC Cowdrey (ENG)1954-197511418815762418244.0622389
\n", 1420 | "
" 1421 | ], 1422 | "text/plain": [ 1423 | " Player Span Mat Inns NO Runs HS Ave 100 50 0\n", 1424 | "38 SPD Smith (AUS) 2010-2021 77 139 17 7540 239 61.80 27 31 5\n", 1425 | "34 MC Cowdrey (ENG) 1954-1975 114 188 15 7624 182 44.06 22 38 9" 1426 | ] 1427 | }, 1428 | "execution_count": 61, 1429 | "metadata": {}, 1430 | "output_type": "execute_result" 1431 | } 1432 | ], 1433 | "source": [ 1434 | "# showing randomly 2 different rows\n", 1435 | "df.sample(2)" 1436 | ] 1437 | }, 1438 | { 1439 | "cell_type": "code", 1440 | "execution_count": null, 1441 | "metadata": {}, 1442 | "outputs": [], 1443 | "source": [] 1444 | }, 1445 | { 1446 | "cell_type": "code", 1447 | "execution_count": null, 1448 | "metadata": {}, 1449 | "outputs": [], 1450 | "source": [] 1451 | } 1452 | ], 1453 | "metadata": { 1454 | "kernelspec": { 1455 | "display_name": "Python 3", 1456 | "language": "python", 1457 | "name": "python3" 1458 | }, 1459 | "language_info": { 1460 | "codemirror_mode": { 1461 | "name": "ipython", 1462 | "version": 3 1463 | }, 1464 | "file_extension": ".py", 1465 | "mimetype": "text/x-python", 1466 | "name": "python", 1467 | "nbconvert_exporter": "python", 1468 | "pygments_lexer": "ipython3", 1469 | "version": "3.8.5" 1470 | } 1471 | }, 1472 | "nbformat": 4, 1473 | "nbformat_minor": 4 1474 | } 1475 | -------------------------------------------------------------------------------- /Lecture_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Analysis with Python\n", 8 | "## Lecture 02: Data Preprocessing with Pandas\n", 9 | "
Instructor: Md Shahidullah Kawsar\n", 10 | "
Data Scientist, IDARE, Houston, TX, USA\n", 11 | "\n", 12 | "**Objectives:**\n", 13 | "- reading a .txt (text) or an Excel (.xlsx) file\n", 14 | "- dealing with a UnicodeDecodeError\n", 15 | "- renaming columns\n", 16 | "- creating a new DataFrame\n", 17 | "- concatenating two DataFrames\n", 18 | "- splitting a column\n", 19 | "- creating a new column in a DataFrame\n", 20 | "- replacing/removing a value in a pandas column\n", 21 | "- removing a column from a DataFrame\n", 22 | "\n", 23 | "**References:**\n", 24 | "
[1] Data Source: https://stats.espncricinfo.com/ci/content/records/223646.html\n", 25 | "
[2] https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html\n", 26 | "
[3] different data sources: https://archive.ics.uci.edu/ml/index.php\n", 27 | "
[4] https://www.kaggle.com/learn\n", 28 | "
[5] https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "#### Import required libraries" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 270, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import numpy as np\n", 45 | "import pandas as pd" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "#### How to read a text file?\n", 53 | "#### How to deal with UnicodeDecodeError?" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 271, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
PlayerSpanMatInnsNORunsHSAve100500
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n", 172 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n", 173 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n", 174 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n", 175 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n", 176 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n", 177 | "\n", 178 | " 50 0 \n", 179 | "0 68 14 \n", 180 | "1 62 17 \n", 181 | "2 58 16 \n", 182 | "3 63 8 \n", 183 | "4 57 9 " 184 | ] 185 | }, 186 | "metadata": {}, 187 | "output_type": "display_data" 188 | } 189 | ], 190 | "source": [ 191 | "df = pd.read_csv(\"most_runs_in_test_cricket.txt\", encoding='unicode_escape', delimiter='\\t')\n", 192 | "\n", 193 | "display(df.head())" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "#### Reading an excel file" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 272, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# pip install openpyxl" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 273, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/html": [ 220 | "
\n", 221 | "\n", 234 | "\n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | "
PlayerSpanMatInnsNORunsHSAve100500
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
\n", 324 | "
" 325 | ], 326 | "text/plain": [ 327 | " Player Span Mat Inns NO Runs HS Ave 100 \\\n", 328 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 248* 53.78 51 \n", 329 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 257 51.85 41 \n", 330 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 224 55.37 45 \n", 331 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 270 52.31 36 \n", 332 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 294 45.35 33 \n", 333 | "\n", 334 | " 50 0 \n", 335 | "0 68 14 \n", 336 | "1 62 17 \n", 337 | "2 58 16 \n", 338 | "3 63 8 \n", 339 | "4 57 9 " 340 | ] 341 | }, 342 | "metadata": {}, 343 | "output_type": "display_data" 344 | } 345 | ], 346 | "source": [ 347 | "df = pd.read_excel(\"test_cricket.xlsx\", sheet_name='runs')\n", 348 | "\n", 349 | "display(df.head())" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 274, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "# df['Player'] = df['Player'].str.replace(\"(\", \"\")\n", 359 | "# df['Player'] = df['Player'].str.replace(\")\", \"\")\n", 360 | "\n", 361 | "# display(df.head())" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 275, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "# df_Player = df['Player'].str.split(\" \", expand=True)\n", 371 | "\n", 372 | "# display(df_Player.head())\n", 373 | "# print(df_Player.info())" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "#### How to rename the column names?" 
381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 276, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | "Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 100, 50, 0], dtype='object')\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "print(df.columns)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 277, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "data": { 407 | "text/html": [ 408 | "
\n", 409 | "\n", 422 | "\n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | "
PlayerSpanMatchInningsNotOutRunsHighest_scoreAverageCenturiesHalf_centuriesDucks
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814
1RT Ponting (AUS)1995-2012168287291337825751.85416217
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638
4AN Cook (ENG)2006-2018161291161247229445.3533579
\n", 512 | "
" 513 | ], 514 | "text/plain": [ 515 | " Player Span Match Innings NotOut Runs \\\n", 516 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 \n", 517 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 \n", 518 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 \n", 519 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 \n", 520 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 \n", 521 | "\n", 522 | " Highest_score Average Centuries Half_centuries Ducks \n", 523 | "0 248* 53.78 51 68 14 \n", 524 | "1 257 51.85 41 62 17 \n", 525 | "2 224 55.37 45 58 16 \n", 526 | "3 270 52.31 36 63 8 \n", 527 | "4 294 45.35 33 57 9 " 528 | ] 529 | }, 530 | "metadata": {}, 531 | "output_type": "display_data" 532 | } 533 | ], 534 | "source": [ 535 | "df = df.rename(columns={'Mat':'Match', \n", 536 | " 'Inns':'Innings',\n", 537 | " 'NO': 'NotOut',\n", 538 | " 'HS': 'Highest_score',\n", 539 | " 'Ave': 'Average',\n", 540 | " 100: 'Centuries',\n", 541 | " 50: 'Half_centuries',\n", 542 | " 0: 'Ducks'})\n", 543 | "\n", 544 | "display(df.head())" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "#### How to create a DataFrame?" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 278, 557 | "metadata": {}, 558 | "outputs": [ 559 | { 560 | "data": { 561 | "text/html": [ 562 | "
\n", 563 | "\n", 576 | "\n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | "
AB
014
125
236
\n", 602 | "
" 603 | ], 604 | "text/plain": [ 605 | " A B\n", 606 | "0 1 4\n", 607 | "1 2 5\n", 608 | "2 3 6" 609 | ] 610 | }, 611 | "metadata": {}, 612 | "output_type": "display_data" 613 | } 614 | ], 615 | "source": [ 616 | "df_A = pd.DataFrame({'A':[1,2,3],\n", 617 | " 'B':[4,5,6]})\n", 618 | "\n", 619 | "display(df_A)" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 279, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "data": { 629 | "text/html": [ 630 | "
\n", 631 | "\n", 644 | "\n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | "
AB
07-7
18-8
29-9
\n", 670 | "
" 671 | ], 672 | "text/plain": [ 673 | " A B\n", 674 | "0 7 -7\n", 675 | "1 8 -8\n", 676 | "2 9 -9" 677 | ] 678 | }, 679 | "metadata": {}, 680 | "output_type": "display_data" 681 | } 682 | ], 683 | "source": [ 684 | "df_B = pd.DataFrame()\n", 685 | "# df_B['C'] = [7,8,9]\n", 686 | "# df_B['D'] = [-7,-8,-9]\n", 687 | "\n", 688 | "df_B['A'] = [7,8,9]\n", 689 | "df_B['B'] = [-7,-8,-9]\n", 690 | "\n", 691 | "display(df_B)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "#### How to concatenate two dataframes?" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 280, 704 | "metadata": {}, 705 | "outputs": [ 706 | { 707 | "data": { 708 | "text/html": [ 709 | "
\n", 710 | "\n", 723 | "\n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | "
ABAB
0147-7
1258-8
2369-9
\n", 757 | "
" 758 | ], 759 | "text/plain": [ 760 | " A B A B\n", 761 | "0 1 4 7 -7\n", 762 | "1 2 5 8 -8\n", 763 | "2 3 6 9 -9" 764 | ] 765 | }, 766 | "metadata": {}, 767 | "output_type": "display_data" 768 | } 769 | ], 770 | "source": [ 771 | "# column-wise concatenation\n", 772 | "df_C = pd.concat([df_A, df_B], axis=1)\n", 773 | "\n", 774 | "display(df_C)" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 281, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/html": [ 785 | "
\n", 786 | "\n", 799 | "\n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | "
AB
index
014
125
236
37-7
48-8
59-9
\n", 845 | "
" 846 | ], 847 | "text/plain": [ 848 | " A B\n", 849 | "index \n", 850 | "0 1 4\n", 851 | "1 2 5\n", 852 | "2 3 6\n", 853 | "3 7 -7\n", 854 | "4 8 -8\n", 855 | "5 9 -9" 856 | ] 857 | }, 858 | "metadata": {}, 859 | "output_type": "display_data" 860 | } 861 | ], 862 | "source": [ 863 | "# row-wise concatenation\n", 864 | "df_C = pd.concat([df_A, df_B], axis=0)\n", 865 | "df_C['index'] = np.arange(0,6,1)\n", 866 | "df_C = df_C.set_index('index')\n", 867 | "\n", 868 | "display(df_C)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "markdown", 873 | "metadata": {}, 874 | "source": [ 875 | "#### How to split a column and create two new columns?" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 282, 881 | "metadata": {}, 882 | "outputs": [ 883 | { 884 | "data": { 885 | "text/html": [ 886 | "
\n", 887 | "\n", 900 | "\n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
01
0SR TendulkarINDIA)
1RT PontingAUS)
2JH KallisICC/SA)
3R DravidICC/INDIA)
4AN CookENG)
5KC SangakkaraSL)
6BC LaraICC/WI)
7S ChanderpaulWI)
8DPMD JayawardeneSL)
9AR BorderAUS)
\n", 961 | "
" 962 | ], 963 | "text/plain": [ 964 | " 0 1\n", 965 | "0 SR Tendulkar  INDIA)\n", 966 | "1 RT Ponting  AUS)\n", 967 | "2 JH Kallis  ICC/SA)\n", 968 | "3 R Dravid  ICC/INDIA)\n", 969 | "4 AN Cook  ENG)\n", 970 | "5 KC Sangakkara  SL)\n", 971 | "6 BC Lara  ICC/WI)\n", 972 | "7 S Chanderpaul  WI)\n", 973 | "8 DPMD Jayawardene  SL)\n", 974 | "9 AR Border  AUS)" 975 | ] 976 | }, 977 | "metadata": {}, 978 | "output_type": "display_data" 979 | } 980 | ], 981 | "source": [ 982 | "df_player = df['Player'].str.split(\"(\", expand=True)\n", 983 | "\n", 984 | "display(df_player.head(10))" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": 283, 990 | "metadata": {}, 991 | "outputs": [ 992 | { 993 | "data": { 994 | "text/html": [ 995 | "
\n", 996 | "\n", 1009 | "\n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | "
PlayerSpanMatchInningsNotOutRunsHighest_scoreAverageCenturiesHalf_centuriesDucks01
0SR Tendulkar (INDIA)1989-20132003293315921248*53.78516814SR TendulkarINDIA)
1RT Ponting (AUS)1995-2012168287291337825751.85416217RT PontingAUS)
2JH Kallis (ICC/SA)1995-2013166280401328922455.37455816JH KallisICC/SA)
3R Dravid (ICC/INDIA)1996-2012164286321328827052.3136638R DravidICC/INDIA)
4AN Cook (ENG)2006-2018161291161247229445.3533579AN CookENG)
\n", 1111 | "
" 1112 | ], 1113 | "text/plain": [ 1114 | " Player Span Match Innings NotOut Runs \\\n", 1115 | "0 SR Tendulkar (INDIA) 1989-2013 200 329 33 15921 \n", 1116 | "1 RT Ponting (AUS) 1995-2012 168 287 29 13378 \n", 1117 | "2 JH Kallis (ICC/SA) 1995-2013 166 280 40 13289 \n", 1118 | "3 R Dravid (ICC/INDIA) 1996-2012 164 286 32 13288 \n", 1119 | "4 AN Cook (ENG) 2006-2018 161 291 16 12472 \n", 1120 | "\n", 1121 | " Highest_score Average Centuries Half_centuries Ducks 0 \\\n", 1122 | "0 248* 53.78 51 68 14 SR Tendulkar  \n", 1123 | "1 257 51.85 41 62 17 RT Ponting  \n", 1124 | "2 224 55.37 45 58 16 JH Kallis  \n", 1125 | "3 270 52.31 36 63 8 R Dravid  \n", 1126 | "4 294 45.35 33 57 9 AN Cook  \n", 1127 | "\n", 1128 | " 1 \n", 1129 | "0 INDIA) \n", 1130 | "1 AUS) \n", 1131 | "2 ICC/SA) \n", 1132 | "3 ICC/INDIA) \n", 1133 | "4 ENG) " 1134 | ] 1135 | }, 1136 | "metadata": {}, 1137 | "output_type": "display_data" 1138 | } 1139 | ], 1140 | "source": [ 1141 | "df = pd.concat([df, df_player], axis=1)\n", 1142 | "\n", 1143 | "display(df.head())" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "markdown", 1148 | "metadata": {}, 1149 | "source": [ 1150 | "#### How to remove a column?" 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": 284, 1156 | "metadata": {}, 1157 | "outputs": [ 1158 | { 1159 | "data": { 1160 | "text/html": [ 1161 | "
\n", 1162 | "\n", 1175 | "\n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | "
SpanMatchInningsNotOutRunsHighest_scoreAverageCenturiesHalf_centuriesDucks01
01989-20132003293315921248*53.78516814SR TendulkarINDIA)
11995-2012168287291337825751.85416217RT PontingAUS)
21995-2013166280401328922455.37455816JH KallisICC/SA)
31996-2012164286321328827052.3136638R DravidICC/INDIA)
42006-2018161291161247229445.3533579AN CookENG)
\n", 1271 | "
" 1272 | ], 1273 | "text/plain": [ 1274 | " Span Match Innings NotOut Runs Highest_score Average Centuries \\\n", 1275 | "0 1989-2013 200 329 33 15921 248* 53.78 51 \n", 1276 | "1 1995-2012 168 287 29 13378 257 51.85 41 \n", 1277 | "2 1995-2013 166 280 40 13289 224 55.37 45 \n", 1278 | "3 1996-2012 164 286 32 13288 270 52.31 36 \n", 1279 | "4 2006-2018 161 291 16 12472 294 45.35 33 \n", 1280 | "\n", 1281 | " Half_centuries Ducks 0 1 \n", 1282 | "0 68 14 SR Tendulkar  INDIA) \n", 1283 | "1 62 17 RT Ponting  AUS) \n", 1284 | "2 58 16 JH Kallis  ICC/SA) \n", 1285 | "3 63 8 R Dravid  ICC/INDIA) \n", 1286 | "4 57 9 AN Cook  ENG) " 1287 | ] 1288 | }, 1289 | "metadata": {}, 1290 | "output_type": "display_data" 1291 | } 1292 | ], 1293 | "source": [ 1294 | "# line 1\n", 1295 | "# df = df.drop('Player', axis=1)\n", 1296 | "\n", 1297 | "# line 2\n", 1298 | "df.drop('Player', axis=1, inplace=True)\n", 1299 | "\n", 1300 | "# line 1 and line 2 both are same\n", 1301 | "\n", 1302 | "display(df.head())" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": 285, 1308 | "metadata": {}, 1309 | "outputs": [ 1310 | { 1311 | "data": { 1312 | "text/html": [ 1313 | "
\n", 1314 | "\n", 1327 | "\n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | "
SpanMatchInningsNotOutRunsHighest_scoreAverageCenturiesHalf_centuriesDucksPlayerCountry
01989-20132003293315921248*53.78516814SR TendulkarINDIA)
11995-2012168287291337825751.85416217RT PontingAUS)
21995-2013166280401328922455.37455816JH KallisICC/SA)
31996-2012164286321328827052.3136638R DravidICC/INDIA)
42006-2018161291161247229445.3533579AN CookENG)
\n", 1423 | "
" 1424 | ], 1425 | "text/plain": [ 1426 | " Span Match Innings NotOut Runs Highest_score Average Centuries \\\n", 1427 | "0 1989-2013 200 329 33 15921 248* 53.78 51 \n", 1428 | "1 1995-2012 168 287 29 13378 257 51.85 41 \n", 1429 | "2 1995-2013 166 280 40 13289 224 55.37 45 \n", 1430 | "3 1996-2012 164 286 32 13288 270 52.31 36 \n", 1431 | "4 2006-2018 161 291 16 12472 294 45.35 33 \n", 1432 | "\n", 1433 | " Half_centuries Ducks Player Country \n", 1434 | "0 68 14 SR Tendulkar  INDIA) \n", 1435 | "1 62 17 RT Ponting  AUS) \n", 1436 | "2 58 16 JH Kallis  ICC/SA) \n", 1437 | "3 63 8 R Dravid  ICC/INDIA) \n", 1438 | "4 57 9 AN Cook  ENG) " 1439 | ] 1440 | }, 1441 | "metadata": {}, 1442 | "output_type": "display_data" 1443 | } 1444 | ], 1445 | "source": [ 1446 | "df = df.rename(columns={0: 'Player',\n", 1447 | " 1: 'Country'})\n", 1448 | "\n", 1449 | "display(df.head())" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": {}, 1455 | "source": [ 1456 | "#### How to replace/remove a value from a pandas column?" 1457 | ] 1458 | }, 1459 | { 1460 | "cell_type": "code", 1461 | "execution_count": 286, 1462 | "metadata": {}, 1463 | "outputs": [ 1464 | { 1465 | "data": { 1466 | "text/html": [ 1467 | "
\n", 1468 | "\n", 1481 | "\n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | "
SpanMatchInningsNotOutRunsHighest_scoreAverageCenturiesHalf_centuriesDucksPlayerCountry
01989-20132003293315921248*53.78516814SR TendulkarINDIA
11995-2012168287291337825751.85416217RT PontingAUS
21995-2013166280401328922455.37455816JH KallisICC/SA
31996-2012164286321328827052.3136638R DravidICC/INDIA
42006-2018161291161247229445.3533579AN CookENG
\n", 1577 | "
" 1578 | ], 1579 | "text/plain": [ 1580 | " Span Match Innings NotOut Runs Highest_score Average Centuries \\\n", 1581 | "0 1989-2013 200 329 33 15921 248* 53.78 51 \n", 1582 | "1 1995-2012 168 287 29 13378 257 51.85 41 \n", 1583 | "2 1995-2013 166 280 40 13289 224 55.37 45 \n", 1584 | "3 1996-2012 164 286 32 13288 270 52.31 36 \n", 1585 | "4 2006-2018 161 291 16 12472 294 45.35 33 \n", 1586 | "\n", 1587 | " Half_centuries Ducks Player Country \n", 1588 | "0 68 14 SR Tendulkar  INDIA \n", 1589 | "1 62 17 RT Ponting  AUS \n", 1590 | "2 58 16 JH Kallis  ICC/SA \n", 1591 | "3 63 8 R Dravid  ICC/INDIA \n", 1592 | "4 57 9 AN Cook  ENG " 1593 | ] 1594 | }, 1595 | "metadata": {}, 1596 | "output_type": "display_data" 1597 | } 1598 | ], 1599 | "source": [ 1600 | "df['Country'] = df['Country'].str.replace(\")\", \"\")\n", 1601 | "\n", 1602 | "display(df.head())" 1603 | ] 1604 | }, 1605 | { 1606 | "cell_type": "code", 1607 | "execution_count": 287, 1608 | "metadata": {}, 1609 | "outputs": [ 1610 | { 1611 | "name": "stdout", 1612 | "output_type": "stream", 1613 | "text": [ 1614 | "Index(['Span', 'Match', 'Innings', 'NotOut', 'Runs', 'Highest_score',\n", 1615 | " 'Average', 'Centuries', 'Half_centuries', 'Ducks', 'Player', 'Country'],\n", 1616 | " dtype='object')\n" 1617 | ] 1618 | } 1619 | ], 1620 | "source": [ 1621 | "print(df.columns)\n", 1622 | "\n", 1623 | "new_col_sequence = ['Player', 'Country', 'Span', 'Match', 'Innings', 'NotOut', 'Runs', 'Highest_score',\n", 1624 | " 'Average', 'Centuries', 'Half_centuries', 'Ducks']" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 288, 1630 | "metadata": {}, 1631 | "outputs": [ 1632 | { 1633 | "data": { 1634 | "text/html": [ 1635 | "
\n", 1636 | "\n", 1649 | "\n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | "
PlayerCountrySpanMatchInningsNotOutRunsHighest_scoreAverageCenturiesHalf_centuriesDucks
0SR TendulkarINDIA1989-20132003293315921248*53.78516814
1RT PontingAUS1995-2012168287291337825751.85416217
2JH KallisICC/SA1995-2013166280401328922455.37455816
3R DravidICC/INDIA1996-2012164286321328827052.3136638
4AN CookENG2006-2018161291161247229445.3533579
\n", 1745 | "
" 1746 | ], 1747 | "text/plain": [ 1748 | " Player Country Span Match Innings NotOut Runs \\\n", 1749 | "0 SR Tendulkar  INDIA 1989-2013 200 329 33 15921 \n", 1750 | "1 RT Ponting  AUS 1995-2012 168 287 29 13378 \n", 1751 | "2 JH Kallis  ICC/SA 1995-2013 166 280 40 13289 \n", 1752 | "3 R Dravid  ICC/INDIA 1996-2012 164 286 32 13288 \n", 1753 | "4 AN Cook  ENG 2006-2018 161 291 16 12472 \n", 1754 | "\n", 1755 | " Highest_score Average Centuries Half_centuries Ducks \n", 1756 | "0 248* 53.78 51 68 14 \n", 1757 | "1 257 51.85 41 62 17 \n", 1758 | "2 224 55.37 45 58 16 \n", 1759 | "3 270 52.31 36 63 8 \n", 1760 | "4 294 45.35 33 57 9 " 1761 | ] 1762 | }, 1763 | "metadata": {}, 1764 | "output_type": "display_data" 1765 | } 1766 | ], 1767 | "source": [ 1768 | "df = df[new_col_sequence]\n", 1769 | "\n", 1770 | "display(df.head())" 1771 | ] 1772 | }, 1773 | { 1774 | "cell_type": "code", 1775 | "execution_count": null, 1776 | "metadata": {}, 1777 | "outputs": [], 1778 | "source": [] 1779 | } 1780 | ], 1781 | "metadata": { 1782 | "kernelspec": { 1783 | "display_name": "Python 3", 1784 | "language": "python", 1785 | "name": "python3" 1786 | }, 1787 | "language_info": { 1788 | "codemirror_mode": { 1789 | "name": "ipython", 1790 | "version": 3 1791 | }, 1792 | "file_extension": ".py", 1793 | "mimetype": "text/x-python", 1794 | "name": "python", 1795 | "nbconvert_exporter": "python", 1796 | "pygments_lexer": "ipython3", 1797 | "version": "3.8.5" 1798 | } 1799 | }, 1800 | "nbformat": 4, 1801 | "nbformat_minor": 4 1802 | } 1803 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Analysis with Python 2 | 3 | #### Lecture 01: Importing Data with Pandas 4 | - challenges of reading a .csv file 5 | - How to deal with UnicodeDecodeError? 
6 | - reading a csv file by changing the engine 7 | - choose columns by name before reading a csv file 8 | - choose columns by number before reading a csv file 9 | - reading only the first n number of rows 10 | 11 | #### Lecture 02: Data Preprocessing with Pandas 12 | - reading a .txt (text) or an excel (.xlsx) file 13 | - dealing with the UnicodeDecodeError 14 | - renaming column names 15 | - creating a new DataFrame 16 | - concatenation of two dataframes 17 | - column splitting 18 | - creating a new column in a dataframe 19 | - replacing/removing a value from a pandas column 20 | - removing a column from the dataframe 21 | 22 | #### Lecture 03: HW review session 23 | 24 | #### Lecture 04: Data Preprocessing with Pandas 25 | - How to extract new information from a column? 26 | - How to create a column based on a condition or function? 27 | - Removing a string from a column 28 | - Checking the unique values for each column 29 | - performing calculation in dataframe columns 30 | - dataframe sorting 31 | - dataframe slicing 32 | 33 | #### Lecture 05: Data Cleaning - Handling Missing Values 34 | - performing data cleaning 35 | - data visualization of missing values 36 | - string to datetime conversion 37 | - removing missing values 38 | - replacing missing values by: 1. mean, 2. median, 3. constant, 4. interpolation, 5. forward imputation, 6. 
backward imputation 39 | 40 | #### Lecture 6: Data Joining/Merging using Pandas 41 | - inner join, outer join, left join, right join 42 | 43 | #### Lecture 7: Data Aggregation/grouping and Pivot table using Pandas 44 | - Data filtering 45 | - Data preprocessing 46 | - Data Aggregation/grouping 47 | - Pivot table 48 | - Data Visualization: Barplot 49 | 50 | #### Lecture 8: Data Correlation and Categorical Variable Encoding 51 | - Data Correlation 52 | - Heatmap 53 | - Dealing with categorical variables 54 | - Label encoding 55 | - One-hot encoding 56 | - Categorical variable creation from the numeric variable 57 | -------------------------------------------------------------------------------- /friends.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/friends.xlsx -------------------------------------------------------------------------------- /gre.csv: -------------------------------------------------------------------------------- 1 | Date,verbal_score,quant_score 2 | 09/01/2021,0,1 3 | 09/02/2021,1,2 4 | 09/03/2021,2,3 5 | 09/04/2021,3, 6 | 09/05/2021,4, 7 | 09/06/2021,, 8 | 09/07/2021,,7 9 | 09/08/2021,7,8 10 | 09/09/2021,8,9 11 | 09/10/2021,9,10 -------------------------------------------------------------------------------- /most_runs_in_test_cricket.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/most_runs_in_test_cricket.csv -------------------------------------------------------------------------------- /most_runs_in_test_cricket.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/most_runs_in_test_cricket.txt 
-------------------------------------------------------------------------------- /test_cricket.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SKawsar/Data_Analysis_with_Python/1ca7520aafd8b860647d114fc42a096dda2de071/test_cricket.xlsx -------------------------------------------------------------------------------- /wickets.csv: -------------------------------------------------------------------------------- 1 | Player,Span,Mat,Inns,Balls,Runs,Wkts,BBI,BBM,Ave,Econ,SR,5,10 2 | M Muralitharan (ICC/SL),1992-2010,133,230,44039,18180,800,9/51,16/220,22.72,2.47,55,67,22 3 | SK Warne (AUS),1992-2007,145,273,40705,17995,708,8/71,12/128,25.41,2.65,57.4,37,10 4 | JM Anderson (ENG),2003-2021,164*,304,35079,16575,623,7/42,11/71,26.6,2.83,56.3,30,3 5 | A Kumble (INDIA),1990-2008,132,236,40850,18355,619,10/74,14/149,29.65,2.69,65.9,35,8 6 | GD McGrath (AUS),1993-2007,124,243,29248,12186,563,8/24,10/27,21.64,2.49,51.9,29,3 7 | SCJ Broad (ENG),2007-2021,149,274,29863,14590,524,8/15,11/121,27.84,2.93,56.9,18,3 8 | CA Walsh (WI),1984-2001,132,242,30019,12688,519,7/37,13/55,24.44,2.53,57.8,22,3 9 | DW Steyn (SA),2004-2019,93,171,18608,10077,439,7/51,11/60,22.95,3.24,42.3,26,5 10 | N Kapil Dev (INDIA),1978-1994,131,227,27740,12867,434,9/83,11/146,29.64,2.78,63.9,23,2 11 | HMRKB Herath (SL),1999-2018,93,170,25993,12157,433,9/127,14/184,28.07,2.8,60,34,9 12 | Sir RJ Hadlee (NZ),1973-1990,86,150,21918,9611,431,9/52,15/123,22.29,2.63,50.8,36,9 13 | SM Pollock (SA),1995-2008,108,202,24353,9733,421,7/87,10/147,23.11,2.39,57.8,16,1 14 | Harbhajan Singh (INDIA),1998-2015,103,190,28580,13537,417,8/84,15/217,32.46,2.84,68.5,25,5 15 | Wasim Akram (PAK),1985-2002,104,181,22627,9779,414,7/119,11/110,23.62,2.59,54.6,25,5 16 | R Ashwin (INDIA),2011-2021,79,148,21670,10144,413,7/59,13/140,24.56,2.8,52.4,30,7 17 | CEL Ambrose (WI),1988-2000,98,179,22103,8501,405,8/45,11/84,20.99,2.3,54.5,22,3 18 | NM Lyon 
(AUS),2011-2021,100,191,25690,12816,399,8/50,13/154,32.12,2.99,64.3,18,3 19 | M Ntini (SA),1998-2009,101,190,20834,11242,390,7/37,13/132,28.82,3.23,53.4,18,4 20 | IT Botham (ENG),1977-1992,102,168,21815,10878,383,8/34,13/106,28.4,2.99,56.9,27,4 21 | MD Marshall (WI),1978-1991,81,151,17584,7876,376,7/22,11/89,20.94,2.68,46.7,22,4 22 | Waqar Younis (PAK),1989-2003,87,154,16224,8788,373,7/76,13/135,23.56,3.25,43.4,22,5 23 | Imran Khan (PAK),1971-1992,88,142,19458,8258,362,8/58,14/116,22.81,2.54,53.7,23,6 24 | DL Vettori (ICC/NZ),1997-2014,113,187,28814,12441,362,7/87,12/149,34.36,2.59,79.5,20,3 25 | DK Lillee (AUS),1971-1984,70,132,18467,8493,355,7/83,11/123,23.92,2.75,52,23,7 26 | WPUJC Vaas (SL),1994-2009,111,194,23438,10501,355,7/71,14/191,29.58,2.68,66,12,2 27 | AA Donald (SA),1992-2002,72,129,15519,7344,330,8/71,12/139,22.25,2.83,47,20,3 28 | RGD Willis (ENG),1971-1984,90,165,17357,8190,325,8/43,9/92,25.2,2.83,53.4,16,0 29 | TG Southee (NZ),2008-2021,79,148,17886,8862,314,7/64,10/108,28.22,2.97,56.9,12,1 30 | MG Johnson (AUS),2007-2015,73,140,16001,8891,313,8/61,12/127,28.4,3.33,51.1,12,3 31 | Z Khan (INDIA),2000-2014,92,165,18785,10247,311,7/87,10/149,32.94,3.27,60.4,11,1 32 | B Lee (AUS),1999-2008,76,150,16531,9554,310,5/30,9/171,30.81,3.46,53.3,10,0 33 | M Morkel (SA),2006-2018,86,160,16498,8550,309,6/23,9/110,27.66,3.1,53.3,8,0 34 | LR Gibbs (WI),1958-1976,79,148,27115,8989,309,8/38,11/157,29.09,1.98,87.7,18,2 35 | FS Trueman (ENG),1952-1965,67,127,15178,6625,307,8/31,12/119,21.57,2.61,49.4,17,3 36 | I Sharma (INDIA),2007-2021,103*,183,18692,9849,306,7/74,10/108,32.18,3.16,61,11,1 37 | DL Underwood (ENG),1966-1982,86,151,21862,7674,297,8/51,13/71,25.83,2.1,73.6,17,6 38 | TA Boult (NZ),2011-2021,73,139,16271,8080,292,6/30,10/80,27.67,2.97,55.7,8,1 39 | JH Kallis (ICC/SA),1995-2013,166,272,20232,9535,292,6/54,9/92,32.65,2.82,69.2,5,0 40 | CJ McDermott (AUS),1984-1996,71,124,16586,8332,291,8/97,11/157,28.63,3.01,56.9,14,2 41 | BS Bedi 
(INDIA),1966-1979,67,118,21364,7637,266,7/98,10/194,28.71,2.14,80.3,14,1 42 | Danish Kaneria (PAK),2000-2010,61,112,17697,9082,261,7/77,12/94,34.79,3.07,67.8,15,2 43 | J Garner (WI),1977-1987,58,111,13169,5433,259,6/56,9/108,20.97,2.47,50.8,7,0 44 | JN Gillespie (AUS),1996-2006,71,137,14234,6770,259,7/37,9/80,26.13,2.85,54.9,8,0 45 | MA Starc (AUS),2011-2021,61,117,12575,7031,255,6/50,11/94,27.57,3.35,49.3,13,2 46 | GP Swann (ENG),2008-2013,60,109,15349,7642,255,6/65,10/132,29.96,2.98,60.1,17,3 47 | JB Statham (ENG),1951-1965,70,129,16056,6261,252,7/39,11/97,24.84,2.33,63.7,9,1 48 | MA Holding (WI),1975-1987,60,113,12680,5898,249,8/92,14/149,23.68,2.79,50.9,13,2 49 | R Benaud (AUS),1952-1964,63,116,19108,6704,248,7/72,11/105,27.03,2.1,77,16,1 50 | MJ Hoggard (ENG),2000-2008,67,122,13909,7564,248,7/61,12/205,30.5,3.26,56,7,1 51 | GD McKenzie (AUS),1961-1971,60,113,17681,7328,246,8/71,10/91,29.78,2.48,71.8,16,3 52 | BS Chandrasekhar (INDIA),1964-1979,58,97,15963,7199,242,8/79,12/104,29.74,2.7,65.9,16,2 53 | AV Bedser (ENG),1946-1955,51,92,15918,5876,236,7/44,14/99,24.89,2.21,67.4,15,5 54 | J Srinath (INDIA),1991-2002,67,121,15104,7196,236,8/86,13/132,30.49,2.85,64,10,1 55 | Abdul Qadir (PAK),1977-1990,67,111,17126,7742,236,9/56,13/101,32.8,2.71,72.5,15,5 56 | Yasir Shah (PAK),2014-2021,46*,84,13607,7248,235,8/41,14/184,30.84,3.19,57.9,16,3 57 | GS Sobers (WI),1954-1974,93,159,21599,7999,235,6/73,8/80,34.03,2.22,91.9,6,0 58 | AR Caddick (ENG),1993-2003,62,105,13558,6999,234,7/46,10/215,29.91,3.09,57.9,13,1 59 | CS Martin (NZ),2000-2013,71,126,14026,7878,233,6/26,11/180,33.81,3.37,60.1,10,1 60 | N Wagner (NZ),2012-2021,54,102,11991,6046,229,7/39,9/73,26.4,3.02,52.3,9,0 61 | D Gough (ENG),1994-2003,58,95,11821,6503,229,6/42,9/92,28.39,3.3,51.6,9,0 62 | RR Lindwall (AUS),1946-1960,61,113,13650,5251,228,7/38,9/70,23.03,2.3,59.8,12,0 63 | SJ Harmison (ENG/ICC),2002-2009,63,115,13375,7192,226,7/12,11/76,31.82,3.22,59.1,8,1 64 | A Flintoff 
(ENG/ICC),1998-2009,79,137,14951,7410,226,5/58,8/156,32.78,2.97,66.1,3,0 65 | KAJ Roach (WI),2009-2021,66*,117,11924,6141,225,6/48,10/146,27.29,3.09,52.9,9,1 66 | VD Philander (SA),2011-2020,64,119,11391,5000,224,6/21,10/102,22.32,2.63,50.8,13,2 67 | RA Jadeja (INDIA),2012-2021,54*,101,13325,5446,221,7/48,10/154,24.64,2.45,60.2,9,1 68 | PM Siddle (AUS),2008-2019,67,126,13907,6777,221,6/54,9/104,30.66,2.92,62.9,8,0 69 | CL Cairns (NZ),1989-2004,62,104,11698,6410,218,7/27,10/100,29.4,3.28,53.6,13,1 70 | CV Grimmett (AUS),1925-1936,37,67,14513,5231,216,7/40,14/199,24.21,2.16,67.1,21,7 71 | HH Streak (ZIM),1993-2005,65,102,13559,6079,216,6/73,9/72,28.14,2.69,62.7,7,0 72 | Shakib Al Hasan (BDESH),2007-2021,58,98,13415,6679,215,7/36,10/124,31.06,2.98,62.3,18,2 73 | K Rabada (SA),2015-2021,47,86,8785,4846,213,7/112,13/144,22.75,3.3,41.2,10,4 74 | JR Hazlewood (AUS),2014-2021,55,103,11887,5438,212,6/67,9/115,25.65,2.74,56,9,0 75 | MG Hughes (AUS),1985-1994,53,97,12285,6017,212,8/87,13/217,28.38,2.93,57.9,7,1 76 | SCG MacGill (AUS),1998-2008,44,85,11237,6038,208,8/108,12/107,29.02,3.22,54,12,2 77 | Saqlain Mushtaq (PAK),1995-2004,49,86,14070,6206,208,8/164,10/155,29.83,2.64,67.6,13,3 78 | AME Roberts (WI),1974-1983,47,90,11135,5174,202,7/54,12/121,25.61,2.78,55.1,11,2 79 | JA Snow (ENG),1965-1976,49,93,12021,5387,202,7/40,10/142,26.66,2.68,59.5,8,1 80 | JR Thomson (AUS),1972-1985,51,90,10535,5601,200,6/46,9/105,28,3.18,52.6,8,0 --------------------------------------------------------------------------------