├── 1. Importing Data with Pandas.ipynb ├── 2. Data Preprocessing with Pandas.ipynb ├── 3. Feature Extraction.ipynb ├── 5. Handling Missing values.ipynb ├── 6. Data Joining.ipynb ├── 7. Pivot table.ipynb ├── 8. Categorical Variable Encoding.ipynb ├── ODI_cricket.xlsx ├── README.md ├── batsman_most_runs_ODI.csv ├── bmw.csv ├── bowler_most_wickets_ODI.csv └── gre.csv /1. Importing Data with Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "Y66h391odn_x" 17 | }, 18 | "source": [ 19 | "# Lecture 01: Importing Data with Pandas\n", 20 | "\n", 21 | "Instructor: **Md Shahidullah Kawsar**\n", 22 | "
Data Scientist, IDARE, Houston, TX, USA\n", 23 | "\n", 24 | "#### Objectives:\n", 25 | "- Challenges of reading a CSV or Excel file\n", 26 | "- Choose columns by name before reading a CSV file\n", 27 | "- Choose columns by number before reading a CSV file\n", 28 | "- Reading only the first n rows\n", 29 | "\n", 30 | "#### References:\n", 31 | "[1] Data Source: https://stats.espncricinfo.com/ci/content/records/83548.html\n", 32 | "https://stats.espncricinfo.com/ci/content/records/283193.html\n", 33 | "
[2] Reading CSV file in pandas:\n", 34 | "https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html\n", 35 | "
[3] Reading Excel file in pandas: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "EmvuNOxVdn_y" 42 | }, 43 | "source": [ 44 | "#### Import required libraries" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "bdrNcRF0dn_y" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "import pandas as pd" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "id": "824w1FcNdn_z", 63 | "outputId": "999b29a2-6067-46c5-f129-d53a2f0e962a" 64 | }, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/html": [ 69 | "
\n", 70 | "\n", 83 | "\n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | "
PlayerSpanMatInnsNORunsHSAveBFSR1005004s6s
0SR Tendulkar (INDIA)1989-20124634524118426200*44.832136886.234996202016195
1KC Sangakkara (Asia/ICC/SL)2000-2015404380411423416941.981804878.86259315138588
2RT Ponting (AUS/ICC)1995-2012375365391370416442.031704680.393082201231162
3ST Jayasuriya (Asia/SL)1989-2011445433181343018932.361472591.202868341500270
4DPMD Jayawardene (Asia/SL)1998-2015448418391265014433.371602078.96197728111976
\n", 197 | "
" 198 | ], 199 | "text/plain": [ 200 | " Player Span Mat Inns NO Runs HS Ave \\\n", 201 | "0 SR Tendulkar (INDIA) 1989-2012 463 452 41 18426 200* 44.83 \n", 202 | "1 KC Sangakkara (Asia/ICC/SL) 2000-2015 404 380 41 14234 169 41.98 \n", 203 | "2 RT Ponting (AUS/ICC) 1995-2012 375 365 39 13704 164 42.03 \n", 204 | "3 ST Jayasuriya (Asia/SL) 1989-2011 445 433 18 13430 189 32.36 \n", 205 | "4 DPMD Jayawardene (Asia/SL) 1998-2015 448 418 39 12650 144 33.37 \n", 206 | "\n", 207 | " BF SR 100 50 0 4s 6s \n", 208 | "0 21368 86.23 49 96 20 2016 195 \n", 209 | "1 18048 78.86 25 93 15 1385 88 \n", 210 | "2 17046 80.39 30 82 20 1231 162 \n", 211 | "3 14725 91.20 28 68 34 1500 270 \n", 212 | "4 16020 78.96 19 77 28 1119 76 " 213 | ] 214 | }, 215 | "metadata": {}, 216 | "output_type": "display_data" 217 | }, 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "(92, 15)\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "# method 1\n", 228 | "# reading a csv file \n", 229 | "df = pd.read_csv(\"batsman_most_runs_ODI.csv\")\n", 230 | "\n", 231 | "display(df.head())\n", 232 | "# print(df.tail())\n", 233 | "print(df.shape)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "id": "3kezCVUodn_0", 241 | "outputId": "bccb0c9e-e12c-4136-8d8b-c483f1b9c789" 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "number of rows = 92\n", 249 | "number of columns = 15\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "# number of rows\n", 255 | "print(\"number of rows = \", df.shape[0])\n", 256 | "\n", 257 | "# number of columns\n", 258 | "print(\"number of columns = \", df.shape[1])" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "id": "l31EuE1Hdn_0", 266 | "outputId": "b1ea11b1-fb12-40f6-8ce7-4d9e21a406fa" 267 | }, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | 
"output_type": "stream", 272 | "text": [ 273 | "\n", 274 | "RangeIndex: 92 entries, 0 to 91\n", 275 | "Data columns (total 15 columns):\n", 276 | " # Column Non-Null Count Dtype \n", 277 | "--- ------ -------------- ----- \n", 278 | " 0 Player 92 non-null object \n", 279 | " 1 Span 92 non-null object \n", 280 | " 2 Mat 92 non-null int64 \n", 281 | " 3 Inns 92 non-null int64 \n", 282 | " 4 NO 92 non-null int64 \n", 283 | " 5 Runs 92 non-null int64 \n", 284 | " 6 HS 92 non-null object \n", 285 | " 7 Ave 92 non-null float64\n", 286 | " 8 BF 92 non-null int64 \n", 287 | " 9 SR 92 non-null float64\n", 288 | " 10 100 92 non-null int64 \n", 289 | " 11 50 92 non-null int64 \n", 290 | " 12 0 92 non-null int64 \n", 291 | " 13 4s 92 non-null object \n", 292 | " 14 6s 92 non-null object \n", 293 | "dtypes: float64(2), int64(8), object(5)\n", 294 | "memory usage: 10.9+ KB\n", 295 | "None\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "# checking for missing values and data types of each column\n", 301 | "print(df.info())" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "id": "V8hS9RPfdn_0", 309 | "outputId": "26c9a813-0bde-4bbf-88c7-90590af27291" 310 | }, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/html": [ 315 | "
\n", 316 | "\n", 329 | "\n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | "
MatInnsNORunsAveBFSR100500
count92.00000092.00000092.00000092.00000092.00000092.00000092.00000092.00000092.00000092.000000
mean241.021739226.89130429.2173917617.88043539.1457619560.04347880.41423912.21739146.04347812.847826
std73.54896469.19481315.4135852490.7102296.2641663112.8559699.8707328.58035815.9159286.101585
min128.000000126.0000003.0000005047.00000023.5700005504.00000060.5700000.00000023.0000002.000000
25%189.500000179.00000016.0000005760.00000035.0175007422.25000074.1250006.00000035.0000008.000000
50%229.000000217.50000029.5000006855.00000038.7300009144.50000078.96000010.50000041.50000013.000000
75%274.750000256.50000040.0000008661.25000042.34000010957.00000087.16750016.25000052.25000016.000000
max463.000000452.00000084.00000018426.00000058.07000021368.000000117.00000049.00000096.00000034.000000
\n", 452 | "
" 453 | ], 454 | "text/plain": [ 455 | " Mat Inns NO Runs Ave \\\n", 456 | "count 92.000000 92.000000 92.000000 92.000000 92.000000 \n", 457 | "mean 241.021739 226.891304 29.217391 7617.880435 39.145761 \n", 458 | "std 73.548964 69.194813 15.413585 2490.710229 6.264166 \n", 459 | "min 128.000000 126.000000 3.000000 5047.000000 23.570000 \n", 460 | "25% 189.500000 179.000000 16.000000 5760.000000 35.017500 \n", 461 | "50% 229.000000 217.500000 29.500000 6855.000000 38.730000 \n", 462 | "75% 274.750000 256.500000 40.000000 8661.250000 42.340000 \n", 463 | "max 463.000000 452.000000 84.000000 18426.000000 58.070000 \n", 464 | "\n", 465 | " BF SR 100 50 0 \n", 466 | "count 92.000000 92.000000 92.000000 92.000000 92.000000 \n", 467 | "mean 9560.043478 80.414239 12.217391 46.043478 12.847826 \n", 468 | "std 3112.855969 9.870732 8.580358 15.915928 6.101585 \n", 469 | "min 5504.000000 60.570000 0.000000 23.000000 2.000000 \n", 470 | "25% 7422.250000 74.125000 6.000000 35.000000 8.000000 \n", 471 | "50% 9144.500000 78.960000 10.500000 41.500000 13.000000 \n", 472 | "75% 10957.000000 87.167500 16.250000 52.250000 16.000000 \n", 473 | "max 21368.000000 117.000000 49.000000 96.000000 34.000000 " 474 | ] 475 | }, 476 | "metadata": {}, 477 | "output_type": "display_data" 478 | } 479 | ], 480 | "source": [ 481 | "# checking data statistics\n", 482 | "display(df.describe())" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": { 489 | "id": "ioXEtL3ldn_0", 490 | "outputId": "5662e3b0-b91e-40a6-f0da-d0c38970c9cb" 491 | }, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS', 'Ave', 'BF', 'SR',\n", 498 | " '100', '50', '0', '4s', '6s'],\n", 499 | " dtype='object')\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "# column names\n", 505 | "print(df.columns)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | 
"execution_count": null, 511 | "metadata": { 512 | "id": "Y8k5_uuIdn_3" 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "" 517 | ] 518 | } 519 | ], 520 | "metadata": { 521 | "colab": { 522 | "name": "Lecture_01.ipynb", 523 | "provenance": [], 524 | "include_colab_link": true 525 | }, 526 | "kernelspec": { 527 | "display_name": "Python 3", 528 | "language": "python", 529 | "name": "python3" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | "version": "3.7.7" 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 0 546 | } 547 | -------------------------------------------------------------------------------- /2. Data Preprocessing with Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "Y66h391odn_x" 17 | }, 18 | "source": [ 19 | "# Lecture 02: Data Preprocessing with Pandas (Part 1)\n", 20 | "\n", 21 | "Instructor: **Md Shahidullah Kawsar**\n", 22 | "
Data Scientist, IDARE, Houston, TX, USA\n", 23 | "\n", 24 | "#### Objectives:\n", 25 | "- Challenges of reading a CSV or Excel file\n", 26 | "- Choose columns by name before reading a CSV file\n", 27 | "- Choose columns by number before reading a CSV file\n", 28 | "- Reading only the first n rows\n", 29 | "\n", 30 | "#### References:\n", 31 | "[1] Data Source: https://stats.espncricinfo.com/ci/content/records/83548.html\n", 32 | "https://stats.espncricinfo.com/ci/content/records/283193.html\n", 33 | "
[2] Reading CSV file in pandas:\n", 34 | "https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html\n", 35 | "
[3] Reading Excel file in pandas: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "EmvuNOxVdn_y" 42 | }, 43 | "source": [ 44 | "#### Import required libraries" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "bdrNcRF0dn_y" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "import pandas as pd" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "id": "rMU5Wi_idn_1" 62 | }, 63 | "source": [ 64 | "#### Choose columns by name to read a csv file" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "id": "0aA0-7Fzdn_1", 72 | "outputId": "0b873b89-2bcd-40a6-87ec-7f3b8fd2a35d" 73 | }, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/html": [ 78 | "
\n", 79 | "\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
PlayerMatRuns100
0SR Tendulkar (INDIA)4631842649
1KC Sangakkara (Asia/ICC/SL)4041423425
2RT Ponting (AUS/ICC)3751370430
3ST Jayasuriya (Asia/SL)4451343028
4DPMD Jayawardene (Asia/SL)4481265019
5V Kohli (INDIA)2601231143
6Inzamam-ul-Haq (Asia/PAK)3781173910
7JH Kallis (Afr/ICC/SA)3281157917
8SC Ganguly (Asia/INDIA)3111136322
9R Dravid (Asia/ICC/INDIA)3441088912
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " Player Mat Runs 100\n", 179 | "0 SR Tendulkar (INDIA) 463 18426 49\n", 180 | "1 KC Sangakkara (Asia/ICC/SL) 404 14234 25\n", 181 | "2 RT Ponting (AUS/ICC) 375 13704 30\n", 182 | "3 ST Jayasuriya (Asia/SL) 445 13430 28\n", 183 | "4 DPMD Jayawardene (Asia/SL) 448 12650 19\n", 184 | "5 V Kohli (INDIA) 260 12311 43\n", 185 | "6 Inzamam-ul-Haq (Asia/PAK) 378 11739 10\n", 186 | "7 JH Kallis (Afr/ICC/SA) 328 11579 17\n", 187 | "8 SC Ganguly (Asia/INDIA) 311 11363 22\n", 188 | "9 R Dravid (Asia/ICC/INDIA) 344 10889 12" 189 | ] 190 | }, 191 | "metadata": {}, 192 | "output_type": "display_data" 193 | } 194 | ], 195 | "source": [ 196 | "col_names = ['Player', 'Mat', 'Runs', '100']\n", 197 | "df_usecols = pd.read_csv(\"batsman_most_runs_ODI.csv\", usecols=col_names)\n", 198 | "\n", 199 | "display(df_usecols.head(10))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "id": "Fs9HUH6Hdn_1" 206 | }, 207 | "source": [ 208 | "#### Choose columns by number to read a csv file" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "id": "u_86zNcHdn_2", 216 | "outputId": "eed640f3-5fe2-48a9-99e7-3311d9a0488e" 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
\n", 223 | "\n", 236 | "\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | "
PlayerMatRuns100
0SR Tendulkar (INDIA)4631842649
1KC Sangakkara (Asia/ICC/SL)4041423425
2RT Ponting (AUS/ICC)3751370430
3ST Jayasuriya (Asia/SL)4451343028
4DPMD Jayawardene (Asia/SL)4481265019
5V Kohli (INDIA)2601231143
6Inzamam-ul-Haq (Asia/PAK)3781173910
7JH Kallis (Afr/ICC/SA)3281157917
8SC Ganguly (Asia/INDIA)3111136322
9R Dravid (Asia/ICC/INDIA)3441088912
\n", 319 | "
" 320 | ], 321 | "text/plain": [ 322 | " Player Mat Runs 100\n", 323 | "0 SR Tendulkar (INDIA) 463 18426 49\n", 324 | "1 KC Sangakkara (Asia/ICC/SL) 404 14234 25\n", 325 | "2 RT Ponting (AUS/ICC) 375 13704 30\n", 326 | "3 ST Jayasuriya (Asia/SL) 445 13430 28\n", 327 | "4 DPMD Jayawardene (Asia/SL) 448 12650 19\n", 328 | "5 V Kohli (INDIA) 260 12311 43\n", 329 | "6 Inzamam-ul-Haq (Asia/PAK) 378 11739 10\n", 330 | "7 JH Kallis (Afr/ICC/SA) 328 11579 17\n", 331 | "8 SC Ganguly (Asia/INDIA) 311 11363 22\n", 332 | "9 R Dravid (Asia/ICC/INDIA) 344 10889 12" 333 | ] 334 | }, 335 | "metadata": {}, 336 | "output_type": "display_data" 337 | }, 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "(92, 4)\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "col_nums = [0, 2, 5, 10]\n", 348 | "df_usecols_index = pd.read_csv(\"batsman_most_runs_ODI.csv\", usecols=col_nums)\n", 349 | "\n", 350 | "display(df_usecols_index.head(10))\n", 351 | "print(df_usecols_index.shape)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": { 357 | "id": "fHdNp7hbdn_2" 358 | }, 359 | "source": [ 360 | "#### Reading only the first n number of rows" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "id": "OLgOGcCEdn_2", 368 | "outputId": "f05abd0e-dc1c-4c5a-da97-7e9c8e522cac" 369 | }, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/html": [ 374 | "
\n", 375 | "\n", 388 | "\n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | "
PlayerSpanMatInnsNORunsHSAveBFSR1005004s6s
0SR Tendulkar (INDIA)1989-20124634524118426200*44.832136886.234996202016195
1KC Sangakkara (Asia/ICC/SL)2000-2015404380411423416941.981804878.86259315138588
2RT Ponting (AUS/ICC)1995-2012375365391370416442.031704680.393082201231162
3ST Jayasuriya (Asia/SL)1989-2011445433181343018932.361472591.202868341500270
4DPMD Jayawardene (Asia/SL)1998-2015448418391265014433.371602078.96197728111976
\n", 502 | "
" 503 | ], 504 | "text/plain": [ 505 | " Player Span Mat Inns NO Runs HS Ave \\\n", 506 | "0 SR Tendulkar (INDIA) 1989-2012 463 452 41 18426 200* 44.83 \n", 507 | "1 KC Sangakkara (Asia/ICC/SL) 2000-2015 404 380 41 14234 169 41.98 \n", 508 | "2 RT Ponting (AUS/ICC) 1995-2012 375 365 39 13704 164 42.03 \n", 509 | "3 ST Jayasuriya (Asia/SL) 1989-2011 445 433 18 13430 189 32.36 \n", 510 | "4 DPMD Jayawardene (Asia/SL) 1998-2015 448 418 39 12650 144 33.37 \n", 511 | "\n", 512 | " BF SR 100 50 0 4s 6s \n", 513 | "0 21368 86.23 49 96 20 2016 195 \n", 514 | "1 18048 78.86 25 93 15 1385 88 \n", 515 | "2 17046 80.39 30 82 20 1231 162 \n", 516 | "3 14725 91.20 28 68 34 1500 270 \n", 517 | "4 16020 78.96 19 77 28 1119 76 " 518 | ] 519 | }, 520 | "metadata": {}, 521 | "output_type": "display_data" 522 | }, 523 | { 524 | "name": "stdout", 525 | "output_type": "stream", 526 | "text": [ 527 | "(50, 15)\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "df = pd.read_csv(\"batsman_most_runs_ODI.csv\", nrows=50)\n", 533 | "\n", 534 | "display(df.head())\n", 535 | "print(df.shape)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": { 542 | "id": "SnWhEDbndn_2", 543 | "outputId": "70332403-4719-494c-bf84-72180291fe1f" 544 | }, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/html": [ 549 | "
\n", 550 | "\n", 563 | "\n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | "
PlayerSpanMatInnsNORunsHSAveBFSR1005004s6s
44MJ Guptill (NZ)2009-2021186183196927237*42.23789687.72163715702181
2RT Ponting (AUS/ICC)1995-2012375365391370416442.031704680.393082201231162
\n", 623 | "
" 624 | ], 625 | "text/plain": [ 626 | " Player Span Mat Inns NO Runs HS Ave BF \\\n", 627 | "44 MJ Guptill (NZ) 2009-2021 186 183 19 6927 237* 42.23 7896 \n", 628 | "2 RT Ponting (AUS/ICC) 1995-2012 375 365 39 13704 164 42.03 17046 \n", 629 | "\n", 630 | " SR 100 50 0 4s 6s \n", 631 | "44 87.72 16 37 15 702 181 \n", 632 | "2 80.39 30 82 20 1231 162 " 633 | ] 634 | }, 635 | "execution_count": 220, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "# showing randomly 2 different rows\n", 642 | "df.sample(2)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": { 648 | "id": "irul-Zt_dn_3" 649 | }, 650 | "source": [ 651 | "#### Reading Excel file" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": { 658 | "id": "AcNg73d-dn_3", 659 | "outputId": "09e29b8d-eff4-43a6-e42c-4a1397068304" 660 | }, 661 | "outputs": [ 662 | { 663 | "data": { 664 | "text/html": [ 665 | "
\n", 666 | "\n", 679 | "\n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | "
PlayerSpanMatInnsNORunsHSAveBFSR1005004s6s
0SR Tendulkar (INDIA)1989-20124634524118426200*44.832136886.234996202016195
1KC Sangakkara (Asia/ICC/SL)2000-2015404380411423416941.981804878.86259315138588
2RT Ponting (AUS/ICC)1995-2012375365391370416442.031704680.393082201231162
3ST Jayasuriya (Asia/SL)1989-2011445433181343018932.361472591.202868341500270
4DPMD Jayawardene (Asia/SL)1998-2015448418391265014433.371602078.96197728111976
\n", 793 | "
" 794 | ], 795 | "text/plain": [ 796 | " Player Span Mat Inns NO Runs HS Ave \\\n", 797 | "0 SR Tendulkar (INDIA) 1989-2012 463 452 41 18426 200* 44.83 \n", 798 | "1 KC Sangakkara (Asia/ICC/SL) 2000-2015 404 380 41 14234 169 41.98 \n", 799 | "2 RT Ponting (AUS/ICC) 1995-2012 375 365 39 13704 164 42.03 \n", 800 | "3 ST Jayasuriya (Asia/SL) 1989-2011 445 433 18 13430 189 32.36 \n", 801 | "4 DPMD Jayawardene (Asia/SL) 1998-2015 448 418 39 12650 144 33.37 \n", 802 | "\n", 803 | " BF SR 100 50 0 4s 6s \n", 804 | "0 21368 86.23 49 96 20 2016 195 \n", 805 | "1 18048 78.86 25 93 15 1385 88 \n", 806 | "2 17046 80.39 30 82 20 1231 162 \n", 807 | "3 14725 91.20 28 68 34 1500 270 \n", 808 | "4 16020 78.96 19 77 28 1119 76 " 809 | ] 810 | }, 811 | "metadata": {}, 812 | "output_type": "display_data" 813 | } 814 | ], 815 | "source": [ 816 | "df = pd.read_excel(\"ODI_cricket.xlsx\", sheet_name=\"batsman\", engine=\"openpyxl\")\n", 817 | "\n", 818 | "display(df.head())" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": { 825 | "id": "2XfuJLPudn_1" 826 | }, 827 | "outputs": [], 828 | "source": [ 829 | "# print(df.shape)\n", 830 | "# col_names = ['Player', 'Mat', 'Runs', 'SR']\n", 831 | "# # selecting columns after data importing\n", 832 | "# df = df[col_names]\n", 833 | "\n", 834 | "# print(df.shape)" 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": { 840 | "id": "Y8k5_uuIdn_3" 841 | }, 842 | "source": [ 843 | "#### How to rename the column names?" 
844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "id": "ZxXbqzobv8hq", 851 | "outputId": "ff6df881-baf0-4e60-a7dc-56c889fc1f1c" 852 | }, 853 | "outputs": [ 854 | { 855 | "name": "stdout", 856 | "output_type": "stream", 857 | "text": [ 858 | "Index(['Player', 'Span', 'Mat', 'Inns', 'NO', 'Runs', 'HS',\n", 859 | " 'Ave', 'BF', 'SR', 100, 50, 0, '4s',\n", 860 | " '6s'],\n", 861 | " dtype='object')\n" 862 | ] 863 | } 864 | ], 865 | "source": [ 866 | "print(df.columns)" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": null, 872 | "metadata": { 873 | "id": "pY76ppQbv8hq", 874 | "outputId": "e96f5f7a-95d7-4e55-9370-a793bd0fb99e" 875 | }, 876 | "outputs": [ 877 | { 878 | "data": { 879 | "text/html": [ 880 | "
\n", 881 | "\n", 894 | "\n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | "
PlayerSpanMatchInningsNotOutRunsHighest_scoreAverageBalls_FacedStrike_RateCenturiesHalf_centuriesDucksFoursSixes
0SR Tendulkar (INDIA)1989-20124634524118426200*44.832136886.234996202016195
1KC Sangakkara (Asia/ICC/SL)2000-2015404380411423416941.981804878.86259315138588
2RT Ponting (AUS/ICC)1995-2012375365391370416442.031704680.393082201231162
3ST Jayasuriya (Asia/SL)1989-2011445433181343018932.361472591.202868341500270
4DPMD Jayawardene (Asia/SL)1998-2015448418391265014433.371602078.96197728111976
\n", 1008 | "
" 1009 | ], 1010 | "text/plain": [ 1011 | " Player Span Match Innings NotOut Runs \\\n", 1012 | "0 SR Tendulkar (INDIA) 1989-2012 463 452 41 18426 \n", 1013 | "1 KC Sangakkara (Asia/ICC/SL) 2000-2015 404 380 41 14234 \n", 1014 | "2 RT Ponting (AUS/ICC) 1995-2012 375 365 39 13704 \n", 1015 | "3 ST Jayasuriya (Asia/SL) 1989-2011 445 433 18 13430 \n", 1016 | "4 DPMD Jayawardene (Asia/SL) 1998-2015 448 418 39 12650 \n", 1017 | "\n", 1018 | " Highest_score Average Balls_Faced Strike_Rate Centuries Half_centuries \\\n", 1019 | "0 200* 44.83 21368 86.23 49 96 \n", 1020 | "1 169 41.98 18048 78.86 25 93 \n", 1021 | "2 164 42.03 17046 80.39 30 82 \n", 1022 | "3 189 32.36 14725 91.20 28 68 \n", 1023 | "4 144 33.37 16020 78.96 19 77 \n", 1024 | "\n", 1025 | " Ducks Fours Sixes \n", 1026 | "0 20 2016 195 \n", 1027 | "1 15 1385 88 \n", 1028 | "2 20 1231 162 \n", 1029 | "3 34 1500 270 \n", 1030 | "4 28 1119 76 " 1031 | ] 1032 | }, 1033 | "metadata": {}, 1034 | "output_type": "display_data" 1035 | } 1036 | ], 1037 | "source": [ 1038 | "df = df.rename(columns={'Mat':'Match', \n", 1039 | " 'Inns':'Innings',\n", 1040 | " 'NO': 'NotOut',\n", 1041 | " 'HS': 'Highest_score',\n", 1042 | " 'Ave': 'Average',\n", 1043 | " 'BF': 'Balls_Faced',\n", 1044 | " 'SR': 'Strike_Rate',\n", 1045 | " 100: 'Centuries',\n", 1046 | " 50: 'Half_centuries',\n", 1047 | " 0: 'Ducks',\n", 1048 | " \"4s\": \"Fours\",\n", 1049 | " \"6s\": \"Sixes\"})\n", 1050 | "\n", 1051 | "display(df.head())" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "markdown", 1056 | "metadata": { 1057 | "id": "V8AUroJcv8hr" 1058 | }, 1059 | "source": [ 1060 | "#### How to split a column and create two new columns?" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": { 1067 | "id": "GllvKT30v8hr", 1068 | "outputId": "9e5444b7-23fe-4462-86b9-6dcf1fd84f95" 1069 | }, 1070 | "outputs": [ 1071 | { 1072 | "data": { 1073 | "text/html": [ 1074 | "
\n", 1075 | "\n", 1088 | "\n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | "
01
0SR TendulkarINDIA)
1KC SangakkaraAsia/ICC/SL)
2RT PontingAUS/ICC)
3ST JayasuriyaAsia/SL)
4DPMD JayawardeneAsia/SL)
5V KohliINDIA)
6Inzamam-ul-HaqAsia/PAK)
7JH KallisAfr/ICC/SA)
8SC GangulyAsia/INDIA)
9R DravidAsia/ICC/INDIA)
\n", 1149 | "
" 1150 | ], 1151 | "text/plain": [ 1152 | " 0 1\n", 1153 | "0 SR Tendulkar INDIA)\n", 1154 | "1 KC Sangakkara Asia/ICC/SL)\n", 1155 | "2 RT Ponting AUS/ICC)\n", 1156 | "3 ST Jayasuriya Asia/SL)\n", 1157 | "4 DPMD Jayawardene Asia/SL)\n", 1158 | "5 V Kohli INDIA)\n", 1159 | "6 Inzamam-ul-Haq Asia/PAK)\n", 1160 | "7 JH Kallis Afr/ICC/SA)\n", 1161 | "8 SC Ganguly Asia/INDIA)\n", 1162 | "9 R Dravid Asia/ICC/INDIA)" 1163 | ] 1164 | }, 1165 | "metadata": {}, 1166 | "output_type": "display_data" 1167 | } 1168 | ], 1169 | "source": [ 1170 | "df_player = df['Player'].str.split(\"(\", expand=True)\n", 1171 | "\n", 1172 | "display(df_player.head(10))" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": null, 1178 | "metadata": { 1179 | "id": "pIFx34ijv8hr", 1180 | "outputId": "fda321ee-a63a-42a6-cdb3-645920849483" 1181 | }, 1182 | "outputs": [ 1183 | { 1184 | "data": { 1185 | "text/html": [ 1186 | "
\n", 1187 | "\n", 1200 | "\n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | "
PlayerSpanMatchInningsNotOutRunsHighest_scoreAverageBalls_FacedStrike_RateCenturiesHalf_centuriesDucksFoursSixesPlayer_NameCountry
0SR Tendulkar (INDIA)1989-20124634524118426200*44.832136886.234996202016195SR TendulkarINDIA)
1KC Sangakkara (Asia/ICC/SL)2000-2015404380411423416941.981804878.86259315138588KC SangakkaraAsia/ICC/SL)
2RT Ponting (AUS/ICC)1995-2012375365391370416442.031704680.393082201231162RT PontingAUS/ICC)
3ST Jayasuriya (Asia/SL)1989-2011445433181343018932.361472591.202868341500270ST JayasuriyaAsia/SL)
4DPMD Jayawardene (Asia/SL)1998-2015448418391265014433.371602078.96197728111976DPMD JayawardeneAsia/SL)
\n", 1326 | "
" 1327 | ], 1328 | "text/plain": [ 1329 | " Player Span Match Innings NotOut Runs \\\n", 1330 | "0 SR Tendulkar (INDIA) 1989-2012 463 452 41 18426 \n", 1331 | "1 KC Sangakkara (Asia/ICC/SL) 2000-2015 404 380 41 14234 \n", 1332 | "2 RT Ponting (AUS/ICC) 1995-2012 375 365 39 13704 \n", 1333 | "3 ST Jayasuriya (Asia/SL) 1989-2011 445 433 18 13430 \n", 1334 | "4 DPMD Jayawardene (Asia/SL) 1998-2015 448 418 39 12650 \n", 1335 | "\n", 1336 | " Highest_score Average Balls_Faced Strike_Rate Centuries Half_centuries \\\n", 1337 | "0 200* 44.83 21368 86.23 49 96 \n", 1338 | "1 169 41.98 18048 78.86 25 93 \n", 1339 | "2 164 42.03 17046 80.39 30 82 \n", 1340 | "3 189 32.36 14725 91.20 28 68 \n", 1341 | "4 144 33.37 16020 78.96 19 77 \n", 1342 | "\n", 1343 | " Ducks Fours Sixes Player_Name Country \n", 1344 | "0 20 2016 195 SR Tendulkar INDIA) \n", 1345 | "1 15 1385 88 KC Sangakkara Asia/ICC/SL) \n", 1346 | "2 20 1231 162 RT Ponting AUS/ICC) \n", 1347 | "3 34 1500 270 ST Jayasuriya Asia/SL) \n", 1348 | "4 28 1119 76 DPMD Jayawardene Asia/SL) " 1349 | ] 1350 | }, 1351 | "metadata": {}, 1352 | "output_type": "display_data" 1353 | } 1354 | ], 1355 | "source": [ 1356 | "df[[\"Player_Name\", \"Country\"]] = df['Player'].str.split(\"(\", expand=True)\n", 1357 | "\n", 1358 | "display(df.head())" 1359 | ] 1360 | }, 1361 | { 1362 | "cell_type": "markdown", 1363 | "metadata": { 1364 | "id": "wa8kloXJv8hs" 1365 | }, 1366 | "source": [ 1367 | "#### How to remove a column?" 1368 | ] 1369 | }, 1370 | { 1371 | "cell_type": "code", 1372 | "execution_count": null, 1373 | "metadata": { 1374 | "id": "z9VLMvGOv8hs", 1375 | "outputId": "a6f57a70-56b4-4380-93c4-a85001925700" 1376 | }, 1377 | "outputs": [ 1378 | { 1379 | "data": { 1380 | "text/html": [ 1381 | "
\n", 1382 | "\n", 1395 | "\n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | "
SpanMatchInningsNotOutRunsHighest_scoreAverageBalls_FacedStrike_RateCenturiesHalf_centuriesDucksFoursSixesPlayer_NameCountry
01989-20124634524118426200*44.832136886.234996202016195SR TendulkarINDIA)
12000-2015404380411423416941.981804878.86259315138588KC SangakkaraAsia/ICC/SL)
21995-2012375365391370416442.031704680.393082201231162RT PontingAUS/ICC)
31989-2011445433181343018932.361472591.202868341500270ST JayasuriyaAsia/SL)
41998-2015448418391265014433.371602078.96197728111976DPMD JayawardeneAsia/SL)
\n", 1515 | "
" 1516 | ], 1517 | "text/plain": [ 1518 | " Span Match Innings NotOut Runs Highest_score Average \\\n", 1519 | "0 1989-2012 463 452 41 18426 200* 44.83 \n", 1520 | "1 2000-2015 404 380 41 14234 169 41.98 \n", 1521 | "2 1995-2012 375 365 39 13704 164 42.03 \n", 1522 | "3 1989-2011 445 433 18 13430 189 32.36 \n", 1523 | "4 1998-2015 448 418 39 12650 144 33.37 \n", 1524 | "\n", 1525 | " Balls_Faced Strike_Rate Centuries Half_centuries Ducks Fours Sixes \\\n", 1526 | "0 21368 86.23 49 96 20 2016 195 \n", 1527 | "1 18048 78.86 25 93 15 1385 88 \n", 1528 | "2 17046 80.39 30 82 20 1231 162 \n", 1529 | "3 14725 91.20 28 68 34 1500 270 \n", 1530 | "4 16020 78.96 19 77 28 1119 76 \n", 1531 | "\n", 1532 | " Player_Name Country \n", 1533 | "0 SR Tendulkar INDIA) \n", 1534 | "1 KC Sangakkara Asia/ICC/SL) \n", 1535 | "2 RT Ponting AUS/ICC) \n", 1536 | "3 ST Jayasuriya Asia/SL) \n", 1537 | "4 DPMD Jayawardene Asia/SL) " 1538 | ] 1539 | }, 1540 | "metadata": {}, 1541 | "output_type": "display_data" 1542 | } 1543 | ], 1544 | "source": [ 1545 | "# line 1\n", 1546 | "# df = df.drop('Player', axis=1)\n", 1547 | "\n", 1548 | "# line 2\n", 1549 | "df.drop('Player', axis=1, inplace=True)\n", 1550 | "\n", 1551 | "# line 1 and line 2 both are same\n", 1552 | "\n", 1553 | "display(df.head())" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "markdown", 1558 | "metadata": { 1559 | "id": "CcpvRLtEv8hs" 1560 | }, 1561 | "source": [ 1562 | "#### How to replace/remove a value from a pandas column?" 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "code", 1567 | "execution_count": null, 1568 | "metadata": { 1569 | "id": "bKMHBsQEv8hs", 1570 | "outputId": "478310ad-f494-4e29-b87c-1fd31714c510" 1571 | }, 1572 | "outputs": [ 1573 | { 1574 | "data": { 1575 | "text/html": [ 1576 | "
\n", 1577 | "\n", 1590 | "\n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | "
SpanMatchInningsNotOutRunsHighest_scoreAverageBalls_FacedStrike_RateCenturiesHalf_centuriesDucksFoursSixesPlayer_NameCountry
01989-20124634524118426200*44.832136886.234996202016195SR TendulkarINDIA
12000-2015404380411423416941.981804878.86259315138588KC SangakkaraAsia/ICC/SL
21995-2012375365391370416442.031704680.393082201231162RT PontingAUS/ICC
31989-2011445433181343018932.361472591.202868341500270ST JayasuriyaAsia/SL
41998-2015448418391265014433.371602078.96197728111976DPMD JayawardeneAsia/SL
\n", 1710 | "
" 1711 | ], 1712 | "text/plain": [ 1713 | " Span Match Innings NotOut Runs Highest_score Average \\\n", 1714 | "0 1989-2012 463 452 41 18426 200* 44.83 \n", 1715 | "1 2000-2015 404 380 41 14234 169 41.98 \n", 1716 | "2 1995-2012 375 365 39 13704 164 42.03 \n", 1717 | "3 1989-2011 445 433 18 13430 189 32.36 \n", 1718 | "4 1998-2015 448 418 39 12650 144 33.37 \n", 1719 | "\n", 1720 | " Balls_Faced Strike_Rate Centuries Half_centuries Ducks Fours Sixes \\\n", 1721 | "0 21368 86.23 49 96 20 2016 195 \n", 1722 | "1 18048 78.86 25 93 15 1385 88 \n", 1723 | "2 17046 80.39 30 82 20 1231 162 \n", 1724 | "3 14725 91.20 28 68 34 1500 270 \n", 1725 | "4 16020 78.96 19 77 28 1119 76 \n", 1726 | "\n", 1727 | " Player_Name Country \n", 1728 | "0 SR Tendulkar INDIA \n", 1729 | "1 KC Sangakkara Asia/ICC/SL \n", 1730 | "2 RT Ponting AUS/ICC \n", 1731 | "3 ST Jayasuriya Asia/SL \n", 1732 | "4 DPMD Jayawardene Asia/SL " 1733 | ] 1734 | }, 1735 | "metadata": {}, 1736 | "output_type": "display_data" 1737 | } 1738 | ], 1739 | "source": [ 1740 | "df['Country'] = df['Country'].str.replace(\")\", \"\")\n", 1741 | "\n", 1742 | "display(df.head())" 1743 | ] 1744 | }, 1745 | { 1746 | "cell_type": "code", 1747 | "execution_count": null, 1748 | "metadata": { 1749 | "id": "z6o5XnAcv8ht", 1750 | "outputId": "63a9a402-1f01-4158-d70c-0a84ed96ee05" 1751 | }, 1752 | "outputs": [ 1753 | { 1754 | "name": "stdout", 1755 | "output_type": "stream", 1756 | "text": [ 1757 | "Index(['Span', 'Match', 'Innings', 'NotOut', 'Runs', 'Highest_score',\n", 1758 | " 'Average', 'Balls_Faced', 'Strike_Rate', 'Centuries', 'Half_centuries',\n", 1759 | " 'Ducks', 'Fours', 'Sixes', 'Player_Name', 'Country'],\n", 1760 | " dtype='object')\n" 1761 | ] 1762 | } 1763 | ], 1764 | "source": [ 1765 | "print(df.columns)\n", 1766 | "\n", 1767 | "new_col_sequence = ['Player_Name', 'Country', 'Span', 'Match', 'Innings', 'NotOut', 'Runs', 'Highest_score',\n", 1768 | " 'Average', 'Balls_Faced', 'Strike_Rate', 'Centuries', 
'Half_centuries', 'Ducks', 'Fours', 'Sixes']" 1769 | ] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": null, 1774 | "metadata": { 1775 | "id": "afGc2B3Rv8ht", 1776 | "outputId": "153f69a2-46d8-4eaa-fde7-6bd4f41fd365" 1777 | }, 1778 | "outputs": [ 1779 | { 1780 | "data": { 1781 | "text/html": [ 1782 | "
\n", 1783 | "\n", 1796 | "\n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | " \n", 1889 | " \n", 1890 | " \n", 1891 | " \n", 1892 | " \n", 1893 | " \n", 1894 | " \n", 1895 | " \n", 1896 | " \n", 1897 | " \n", 1898 | " \n", 1899 | " \n", 1900 | " \n", 1901 | " \n", 1902 | " \n", 1903 | " \n", 1904 | " \n", 1905 | " \n", 1906 | " \n", 1907 | " \n", 1908 | " \n", 1909 | " \n", 1910 | " \n", 1911 | " \n", 1912 | " \n", 1913 | " \n", 1914 | " \n", 1915 | "
Player_NameCountrySpanMatchInningsNotOutRunsHighest_scoreAverageBalls_FacedStrike_RateCenturiesHalf_centuriesDucksFoursSixes
0SR TendulkarINDIA1989-20124634524118426200*44.832136886.234996202016195
1KC SangakkaraAsia/ICC/SL2000-2015404380411423416941.981804878.86259315138588
2RT PontingAUS/ICC1995-2012375365391370416442.031704680.393082201231162
3ST JayasuriyaAsia/SL1989-2011445433181343018932.361472591.202868341500270
4DPMD JayawardeneAsia/SL1998-2015448418391265014433.371602078.96197728111976
\n", 1916 | "
" 1917 | ], 1918 | "text/plain": [ 1919 | " Player_Name Country Span Match Innings NotOut Runs \\\n", 1920 | "0 SR Tendulkar INDIA 1989-2012 463 452 41 18426 \n", 1921 | "1 KC Sangakkara Asia/ICC/SL 2000-2015 404 380 41 14234 \n", 1922 | "2 RT Ponting AUS/ICC 1995-2012 375 365 39 13704 \n", 1923 | "3 ST Jayasuriya Asia/SL 1989-2011 445 433 18 13430 \n", 1924 | "4 DPMD Jayawardene Asia/SL 1998-2015 448 418 39 12650 \n", 1925 | "\n", 1926 | " Highest_score Average Balls_Faced Strike_Rate Centuries Half_centuries \\\n", 1927 | "0 200* 44.83 21368 86.23 49 96 \n", 1928 | "1 169 41.98 18048 78.86 25 93 \n", 1929 | "2 164 42.03 17046 80.39 30 82 \n", 1930 | "3 189 32.36 14725 91.20 28 68 \n", 1931 | "4 144 33.37 16020 78.96 19 77 \n", 1932 | "\n", 1933 | " Ducks Fours Sixes \n", 1934 | "0 20 2016 195 \n", 1935 | "1 15 1385 88 \n", 1936 | "2 20 1231 162 \n", 1937 | "3 34 1500 270 \n", 1938 | "4 28 1119 76 " 1939 | ] 1940 | }, 1941 | "metadata": {}, 1942 | "output_type": "display_data" 1943 | } 1944 | ], 1945 | "source": [ 1946 | "df = df[new_col_sequence]\n", 1947 | "\n", 1948 | "display(df.head())" 1949 | ] 1950 | }, 1951 | { 1952 | "cell_type": "code", 1953 | "execution_count": null, 1954 | "metadata": { 1955 | "id": "Nlyq488Zv8ht" 1956 | }, 1957 | "outputs": [], 1958 | "source": [ 1959 | "" 1960 | ] 1961 | } 1962 | ], 1963 | "metadata": { 1964 | "colab": { 1965 | "name": "Lecture_02.ipynb", 1966 | "provenance": [], 1967 | "include_colab_link": true 1968 | }, 1969 | "kernelspec": { 1970 | "display_name": "Python 3", 1971 | "language": "python", 1972 | "name": "python3" 1973 | }, 1974 | "language_info": { 1975 | "codemirror_mode": { 1976 | "name": "ipython", 1977 | "version": 3 1978 | }, 1979 | "file_extension": ".py", 1980 | "mimetype": "text/x-python", 1981 | "name": "python", 1982 | "nbconvert_exporter": "python", 1983 | "pygments_lexer": "ipython3", 1984 | "version": "3.7.7" 1985 | } 1986 | }, 1987 | "nbformat": 4, 1988 | "nbformat_minor": 0 1989 | } 1990 | 
-------------------------------------------------------------------------------- /8. Categorical Variable Encoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lecture 8: Categorical Variable Encoding\n", 8 | "\n", 9 | "Instructor: Md Shahidullah Kawsar\n", 10 | "
Data Scientist, IDARE, Houston, TX, USA\n", 11 | "\n", 12 | "#### Objectives:\n", 13 | "- Dealing with categorical variables\n", 14 | "- Label encoding\n", 15 | "- One-hot encoding\n", 16 | "- Categorical variable creation from the numeric variable\n", 17 | "\n", 18 | "#### References:\n", 19 | "
[1] One-Hot Encoding vs. Label Encoding using Scikit-Learn: https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/\n", 20 | "
[2] Label Encoding: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html\n", 21 | "
[3] One-hot encoding: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html\n", 22 | "
[4] Binning numeric values into categories with pandas.cut: https://pandas.pydata.org/docs/reference/api/pandas.cut.html" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 51, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "\n", 33 | "import seaborn as sns\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "\n", 36 | "from sklearn.preprocessing import LabelEncoder\n", 37 | "\n", 38 | "sns.set_context(\"talk\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "#### Load data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 52, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | "
modelyearpricetransmissionmileagefuelTypetaxmpgengineSize
05 Series201411200Automatic67068Diesel12557.62.0
16 Series201827000Automatic14827Petrol14542.82.0
25 Series201616000Automatic62794Diesel16051.43.0
31 Series201712750Automatic26676Diesel14572.41.5
47 Series201414500Automatic39554Diesel16050.43.0
55 Series201614900Automatic35309Diesel12560.12.0
65 Series201716000Automatic38538Diesel12560.12.0
72 Series201816250Manual10401Petrol14552.31.5
84 Series201714250Manual42668Diesel3062.82.0
95 Series201614250Automatic36099Diesel2068.92.0
\n", 208 | "
" 209 | ], 210 | "text/plain": [ 211 | " model year price transmission mileage fuelType tax mpg \\\n", 212 | "0 5 Series 2014 11200 Automatic 67068 Diesel 125 57.6 \n", 213 | "1 6 Series 2018 27000 Automatic 14827 Petrol 145 42.8 \n", 214 | "2 5 Series 2016 16000 Automatic 62794 Diesel 160 51.4 \n", 215 | "3 1 Series 2017 12750 Automatic 26676 Diesel 145 72.4 \n", 216 | "4 7 Series 2014 14500 Automatic 39554 Diesel 160 50.4 \n", 217 | "5 5 Series 2016 14900 Automatic 35309 Diesel 125 60.1 \n", 218 | "6 5 Series 2017 16000 Automatic 38538 Diesel 125 60.1 \n", 219 | "7 2 Series 2018 16250 Manual 10401 Petrol 145 52.3 \n", 220 | "8 4 Series 2017 14250 Manual 42668 Diesel 30 62.8 \n", 221 | "9 5 Series 2016 14250 Automatic 36099 Diesel 20 68.9 \n", 222 | "\n", 223 | " engineSize \n", 224 | "0 2.0 \n", 225 | "1 2.0 \n", 226 | "2 3.0 \n", 227 | "3 1.5 \n", 228 | "4 3.0 \n", 229 | "5 2.0 \n", 230 | "6 2.0 \n", 231 | "7 1.5 \n", 232 | "8 2.0 \n", 233 | "9 2.0 " 234 | ] 235 | }, 236 | "metadata": {}, 237 | "output_type": "display_data" 238 | } 239 | ], 240 | "source": [ 241 | "df = pd.read_csv(\"bmw.csv\")\n", 242 | "\n", 243 | "display(df.head(10))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "#### Dealing with categorical variables" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 53, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "[' 5 Series' ' 6 Series' ' 1 Series' ' 7 Series' ' 2 Series' ' 4 Series'\n", 263 | " ' X3' ' 3 Series' ' X5' ' X4' ' i3' ' X1' ' M4' ' X2' ' X6' ' 8 Series'\n", 264 | " ' Z4' ' X7' ' M5' ' i8' ' M2' ' M3' ' M6' ' Z3']\n", 265 | "24\n", 266 | "['Automatic' 'Manual' 'Semi-Auto']\n", 267 | "3\n", 268 | "['Diesel' 'Petrol' 'Other' 'Hybrid' 'Electric']\n", 269 | "5\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "print(df[\"model\"].unique())\n", 275 | 
"print(len(df[\"model\"].unique()))\n", 276 | "\n", 277 | "print(df[\"transmission\"].unique())\n", 278 | "print(len(df[\"transmission\"].unique()))\n", 279 | "\n", 280 | "print(df[\"fuelType\"].unique())\n", 281 | "print(len(df[\"fuelType\"].unique()))" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "#### Label Encoding" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 54, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "['Automatic' 'Manual' 'Semi-Auto']\n", 301 | "[' 1 Series' ' 2 Series' ' 3 Series' ' 4 Series' ' 5 Series' ' 6 Series'\n", 302 | " ' 7 Series' ' 8 Series' ' M2' ' M3' ' M4' ' M5' ' M6' ' X1' ' X2' ' X3'\n", 303 | " ' X4' ' X5' ' X6' ' X7' ' Z3' ' Z4' ' i3' ' i8']\n", 304 | "['Diesel' 'Electric' 'Hybrid' 'Other' 'Petrol']\n" 305 | ] 306 | }, 307 | { 308 | "data": { 309 | "text/html": [ 310 | "
\n", 311 | "\n", 324 | "\n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " 
\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | "
modelyearpricetransmissionmileagefuelTypetaxmpgengineSizetransmission_model_fuelType_
58525 Series201726000Semi-Auto43533Diesel15053.33.0240
82041 Series201712415Manual17724Diesel14583.11.5100
5318X5201952950Semi-Auto3309Petrol14527.23.02174
80424 Series201719600Automatic19293Diesel14565.72.0030
32861 Series201618990Semi-Auto11538Petrol20039.83.0204
12882 Series201920276Automatic4013Petrol14553.31.5014
23294 Series201820991Manual11455Petrol15046.32.0134
8830X1201617400Manual28999Diesel12558.92.01130
42572 Series201921680Semi-Auto8994Petrol14547.92.0214
9153Z4201615991Manual25921Petrol20541.52.01214
\n", 495 | "
" 496 | ], 497 | "text/plain": [ 498 | " model year price transmission mileage fuelType tax mpg \\\n", 499 | "5852 5 Series 2017 26000 Semi-Auto 43533 Diesel 150 53.3 \n", 500 | "8204 1 Series 2017 12415 Manual 17724 Diesel 145 83.1 \n", 501 | "5318 X5 2019 52950 Semi-Auto 3309 Petrol 145 27.2 \n", 502 | "8042 4 Series 2017 19600 Automatic 19293 Diesel 145 65.7 \n", 503 | "3286 1 Series 2016 18990 Semi-Auto 11538 Petrol 200 39.8 \n", 504 | "1288 2 Series 2019 20276 Automatic 4013 Petrol 145 53.3 \n", 505 | "2329 4 Series 2018 20991 Manual 11455 Petrol 150 46.3 \n", 506 | "8830 X1 2016 17400 Manual 28999 Diesel 125 58.9 \n", 507 | "4257 2 Series 2019 21680 Semi-Auto 8994 Petrol 145 47.9 \n", 508 | "9153 Z4 2016 15991 Manual 25921 Petrol 205 41.5 \n", 509 | "\n", 510 | " engineSize transmission_ model_ fuelType_ \n", 511 | "5852 3.0 2 4 0 \n", 512 | "8204 1.5 1 0 0 \n", 513 | "5318 3.0 2 17 4 \n", 514 | "8042 2.0 0 3 0 \n", 515 | "3286 3.0 2 0 4 \n", 516 | "1288 1.5 0 1 4 \n", 517 | "2329 2.0 1 3 4 \n", 518 | "8830 2.0 1 13 0 \n", 519 | "4257 2.0 2 1 4 \n", 520 | "9153 2.0 1 21 4 " 521 | ] 522 | }, 523 | "metadata": {}, 524 | "output_type": "display_data" 525 | } 526 | ], 527 | "source": [ 528 | "LE = LabelEncoder()\n", 529 | "df[\"transmission_\"] = LE.fit_transform(df[\"transmission\"])\n", 530 | "print(LE.classes_)\n", 531 | "# print(df[\"transmission_\"].unique())\n", 532 | "\n", 533 | "df[\"model_\"] = LE.fit_transform(df[\"model\"])\n", 534 | "print(LE.classes_)\n", 535 | "# print(df[\"model_\"].unique())\n", 536 | "\n", 537 | "df[\"fuelType_\"] = LE.fit_transform(df[\"fuelType\"])\n", 538 | "print(LE.classes_)\n", 539 | "# print(df[\"fuelType_\"].unique())\n", 540 | "\n", 541 | "display(df.sample(10))" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 55, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "\n", 554 | "RangeIndex: 10781 entries, 0 to 10780\n", 
555 | "Data columns (total 12 columns):\n", 556 | " # Column Non-Null Count Dtype \n", 557 | "--- ------ -------------- ----- \n", 558 | " 0 model 10781 non-null object \n", 559 | " 1 year 10781 non-null int64 \n", 560 | " 2 price 10781 non-null int64 \n", 561 | " 3 transmission 10781 non-null object \n", 562 | " 4 mileage 10781 non-null int64 \n", 563 | " 5 fuelType 10781 non-null object \n", 564 | " 6 tax 10781 non-null int64 \n", 565 | " 7 mpg 10781 non-null float64\n", 566 | " 8 engineSize 10781 non-null float64\n", 567 | " 9 transmission_ 10781 non-null int64 \n", 568 | " 10 model_ 10781 non-null int64 \n", 569 | " 11 fuelType_ 10781 non-null int64 \n", 570 | "dtypes: float64(2), int64(7), object(3)\n", 571 | "memory usage: 1010.8+ KB\n" 572 | ] 573 | } 574 | ], 575 | "source": [ 576 | "df.info()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 56, 582 | "metadata": {}, 583 | "outputs": [ 584 | { 585 | "data": { 586 | "text/plain": [ 587 | "Semi-Auto 4666\n", 588 | "Automatic 3588\n", 589 | "Manual 2527\n", 590 | "Name: transmission, dtype: int64" 591 | ] 592 | }, 593 | "execution_count": 56, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "df[\"transmission\"].value_counts()" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 57, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/plain": [ 610 | "Diesel 7027\n", 611 | "Petrol 3417\n", 612 | "Hybrid 298\n", 613 | "Other 36\n", 614 | "Electric 3\n", 615 | "Name: fuelType, dtype: int64" 616 | ] 617 | }, 618 | "execution_count": 57, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "df[\"fuelType\"].value_counts()" 625 | ] 626 | }, 627 | { 628 | "cell_type": "markdown", 629 | "metadata": {}, 630 | "source": [ 631 | "#### One-hot Encoding" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 58, 637 | 
"metadata": {}, 638 | "outputs": [ 639 | { 640 | "data": { 641 | "text/html": [ 642 | "
\n", 643 | "\n", 656 | "\n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | "
AutomaticSemi-Auto
598901
289101
933210
710210
365000
426600
140910
50501
911210
634501
\n", 717 | "
" 718 | ], 719 | "text/plain": [ 720 | " Automatic Semi-Auto\n", 721 | "5989 0 1\n", 722 | "2891 0 1\n", 723 | "9332 1 0\n", 724 | "7102 1 0\n", 725 | "3650 0 0\n", 726 | "4266 0 0\n", 727 | "1409 1 0\n", 728 | "505 0 1\n", 729 | "9112 1 0\n", 730 | "6345 0 1" 731 | ] 732 | }, 733 | "metadata": {}, 734 | "output_type": "display_data" 735 | } 736 | ], 737 | "source": [ 738 | "df_transmission = pd.get_dummies(df[\"transmission\"])\n", 739 | "df_transmission = df_transmission.drop(\"Manual\", axis=1)\n", 740 | "display(df_transmission.sample(10))" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 59, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "data": { 750 | "text/html": [ 751 | "
\n", 752 | "\n", 765 | "\n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | "
DieselElectricHybridOtherPetrol
91710000
411500001
350610000
466010000
602700001
921800001
724900001
793910000
475510000
213900001
\n", 859 | "
" 860 | ], 861 | "text/plain": [ 862 | " Diesel Electric Hybrid Other Petrol\n", 863 | "917 1 0 0 0 0\n", 864 | "4115 0 0 0 0 1\n", 865 | "3506 1 0 0 0 0\n", 866 | "4660 1 0 0 0 0\n", 867 | "6027 0 0 0 0 1\n", 868 | "9218 0 0 0 0 1\n", 869 | "7249 0 0 0 0 1\n", 870 | "7939 1 0 0 0 0\n", 871 | "4755 1 0 0 0 0\n", 872 | "2139 0 0 0 0 1" 873 | ] 874 | }, 875 | "metadata": {}, 876 | "output_type": "display_data" 877 | } 878 | ], 879 | "source": [ 880 | "df_fuelType = pd.get_dummies(df[\"fuelType\"])\n", 881 | "display(df_fuelType.sample(10))" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 60, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "data": { 891 | "text/html": [ 892 | "
\n", 893 | "\n", 906 | "\n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 
| " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | "
1 Series2 Series3 Series4 Series5 Series6 Series7 Series8 SeriesM2M3...X2X3X4X5X6X7Z3Z4i3i8
106330000100000...0000000000
55920100000000...0000000000
107730000000000...0000000000
54490010000000...0000000000
84810100000000...0000000000
87460010000000...0000000000
20610001000000...0000000000
57641000000000...0000000000
93170010000000...0000000000
17610010000000...0000000000
\n", 1176 | "

10 rows × 24 columns

\n", 1177 | "
" 1178 | ], 1179 | "text/plain": [ 1180 | " 1 Series 2 Series 3 Series 4 Series 5 Series 6 Series \\\n", 1181 | "10633 0 0 0 0 1 0 \n", 1182 | "5592 0 1 0 0 0 0 \n", 1183 | "10773 0 0 0 0 0 0 \n", 1184 | "5449 0 0 1 0 0 0 \n", 1185 | "8481 0 1 0 0 0 0 \n", 1186 | "8746 0 0 1 0 0 0 \n", 1187 | "2061 0 0 0 1 0 0 \n", 1188 | "5764 1 0 0 0 0 0 \n", 1189 | "9317 0 0 1 0 0 0 \n", 1190 | "1761 0 0 1 0 0 0 \n", 1191 | "\n", 1192 | " 7 Series 8 Series M2 M3 ... X2 X3 X4 X5 X6 X7 Z3 \\\n", 1193 | "10633 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1194 | "5592 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1195 | "10773 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1196 | "5449 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1197 | "8481 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1198 | "8746 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1199 | "2061 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1200 | "5764 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1201 | "9317 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1202 | "1761 0 0 0 0 ... 0 0 0 0 0 0 0 \n", 1203 | "\n", 1204 | " Z4 i3 i8 \n", 1205 | "10633 0 0 0 \n", 1206 | "5592 0 0 0 \n", 1207 | "10773 0 0 0 \n", 1208 | "5449 0 0 0 \n", 1209 | "8481 0 0 0 \n", 1210 | "8746 0 0 0 \n", 1211 | "2061 0 0 0 \n", 1212 | "5764 0 0 0 \n", 1213 | "9317 0 0 0 \n", 1214 | "1761 0 0 0 \n", 1215 | "\n", 1216 | "[10 rows x 24 columns]" 1217 | ] 1218 | }, 1219 | "metadata": {}, 1220 | "output_type": "display_data" 1221 | } 1222 | ], 1223 | "source": [ 1224 | "df_model = pd.get_dummies(df[\"model\"])\n", 1225 | "display(df_model.sample(10))" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": 61, 1231 | "metadata": {}, 1232 | "outputs": [ 1233 | { 1234 | "name": "stdout", 1235 | "output_type": "stream", 1236 | "text": [ 1237 | "(10781, 12)\n" 1238 | ] 1239 | }, 1240 | { 1241 | "data": { 1242 | "text/html": [ 1243 | "
\n", 1244 | "\n", 1257 | "\n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " 
\n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | "
yearpricemileagetaxmpgengineSizetransmission_model_fuelType_model_ 1 Series...model_ i3model_ i8transmission_Automatictransmission_Manualtransmission_Semi-AutofuelType_DieselfuelType_ElectricfuelType_HybridfuelType_OtherfuelType_Petrol
6462019299981017114541.52.02240...0000100001
98182016179992719316050.43.00400...0010010000
87232017130002410412553.31.51041...0001000001
3323201950990472914537.73.021700...0000110000
2641201930326339214541.52.02240...0000100001
2205201510999900182070.62.00200...0010010000
74042007349513000020044.82.01240...0001000001
980420125295953603062.82.01001...0001010000
68782017194601107614556.53.00200...0010010000
103952017156002901914565.72.00001...0010010000
\n", 1527 | "

10 rows × 41 columns

\n", 1528 | "
" 1529 | ], 1530 | "text/plain": [ 1531 | " year price mileage tax mpg engineSize transmission_ model_ \\\n", 1532 | "646 2019 29998 10171 145 41.5 2.0 2 2 \n", 1533 | "9818 2016 17999 27193 160 50.4 3.0 0 4 \n", 1534 | "8723 2017 13000 24104 125 53.3 1.5 1 0 \n", 1535 | "3323 2019 50990 4729 145 37.7 3.0 2 17 \n", 1536 | "2641 2019 30326 3392 145 41.5 2.0 2 2 \n", 1537 | "2205 2015 10999 90018 20 70.6 2.0 0 2 \n", 1538 | "7404 2007 3495 130000 200 44.8 2.0 1 2 \n", 1539 | "9804 2012 5295 95360 30 62.8 2.0 1 0 \n", 1540 | "6878 2017 19460 11076 145 56.5 3.0 0 2 \n", 1541 | "10395 2017 15600 29019 145 65.7 2.0 0 0 \n", 1542 | "\n", 1543 | " fuelType_ model_ 1 Series ... model_ i3 model_ i8 \\\n", 1544 | "646 4 0 ... 0 0 \n", 1545 | "9818 0 0 ... 0 0 \n", 1546 | "8723 4 1 ... 0 0 \n", 1547 | "3323 0 0 ... 0 0 \n", 1548 | "2641 4 0 ... 0 0 \n", 1549 | "2205 0 0 ... 0 0 \n", 1550 | "7404 4 0 ... 0 0 \n", 1551 | "9804 0 1 ... 0 0 \n", 1552 | "6878 0 0 ... 0 0 \n", 1553 | "10395 0 1 ... 0 0 \n", 1554 | "\n", 1555 | " transmission_Automatic transmission_Manual transmission_Semi-Auto \\\n", 1556 | "646 0 0 1 \n", 1557 | "9818 1 0 0 \n", 1558 | "8723 0 1 0 \n", 1559 | "3323 0 0 1 \n", 1560 | "2641 0 0 1 \n", 1561 | "2205 1 0 0 \n", 1562 | "7404 0 1 0 \n", 1563 | "9804 0 1 0 \n", 1564 | "6878 1 0 0 \n", 1565 | "10395 1 0 0 \n", 1566 | "\n", 1567 | " fuelType_Diesel fuelType_Electric fuelType_Hybrid fuelType_Other \\\n", 1568 | "646 0 0 0 0 \n", 1569 | "9818 1 0 0 0 \n", 1570 | "8723 0 0 0 0 \n", 1571 | "3323 1 0 0 0 \n", 1572 | "2641 0 0 0 0 \n", 1573 | "2205 1 0 0 0 \n", 1574 | "7404 0 0 0 0 \n", 1575 | "9804 1 0 0 0 \n", 1576 | "6878 1 0 0 0 \n", 1577 | "10395 1 0 0 0 \n", 1578 | "\n", 1579 | " fuelType_Petrol \n", 1580 | "646 1 \n", 1581 | "9818 0 \n", 1582 | "8723 1 \n", 1583 | "3323 0 \n", 1584 | "2641 1 \n", 1585 | "2205 0 \n", 1586 | "7404 1 \n", 1587 | "9804 0 \n", 1588 | "6878 0 \n", 1589 | "10395 0 \n", 1590 | "\n", 1591 | "[10 rows x 41 columns]" 1592 | ] 1593 | 
}, 1594 | "metadata": {}, 1595 | "output_type": "display_data" 1596 | }, 1597 | { 1598 | "name": "stdout", 1599 | "output_type": "stream", 1600 | "text": [ 1601 | "(10781, 41)\n" 1602 | ] 1603 | } 1604 | ], 1605 | "source": [ 1606 | "print(df.shape)\n", 1607 | "df = pd.get_dummies(df)\n", 1608 | "# df = pd.get_dummies(df, drop_first=True)\n", 1609 | "\n", 1610 | "display(df.sample(10))\n", 1611 | "print(df.shape)" 1612 | ] 1613 | }, 1614 | { 1615 | "cell_type": "code", 1616 | "execution_count": 62, 1617 | "metadata": {}, 1618 | "outputs": [], 1619 | "source": [ 1620 | "# Linear Regression\n", 1621 | "# Decision Tree\n", 1622 | "# Random Forest\n", 1623 | "# XGBoost (eXtreme Gradient Boosting)\n", 1624 | "# y = m1*x1 + m2*x2 + c\n", 1625 | "\n", 1626 | "# 1000\n", 1627 | "# 80% training data = 800\n", 1628 | "# 20% test data = 200 # separate actual price\n", 1629 | "\n", 1630 | "# price = c1*model_1series + c2*model_2series + c3*year + c # ML training\n", 1631 | "# predicted price = c1*model_1series + c2*model_2series + c3*year + c # ML testing\n", 1632 | "\n", 1633 | "# error = compare(actual price, predicted price)\n", 1634 | "\n", 1635 | "# 10%\n", 1636 | "\n", 1637 | "# 14%" 1638 | ] 1639 | }, 1640 | { 1641 | "cell_type": "markdown", 1642 | "metadata": {}, 1643 | "source": [ 1644 | "# Summary of Data Preprocessing for ML with Python\n", 1645 | "What you have learned from this module:\n", 1646 | "\n", 1647 | "- Importing Data (csv, xlsx, txt etc.) 
with Pandas\n", 1648 | "- creating a new DataFrame\n", 1649 | "- column splitting\n", 1650 | "- creating a new column in a dataframe\n", 1651 | "- replacing/removing a value from a pandas column\n", 1652 | "- removing a column from the dataframe\n", 1653 | "- renaming column names\n", 1654 | "- extracting new information from a column\n", 1655 | "- creating a column based on a condition or function\n", 1656 | "- Removing a string from a column\n", 1657 | "- Checking the unique values for each column\n", 1658 | "- performing calculations on dataframe columns\n", 1659 | "- dataframe sorting\n", 1660 | "- dataframe slicing\n", 1661 | "- data cleaning\n", 1662 | "- data visualization of missing values\n", 1663 | "- string to datetime conversion\n", 1664 | "- removing missing values\n", 1665 | "- replacing missing values by: 1. mean, 2. median, 3. constant, 4. interpolation, 5. forward imputation, 6. backward imputation\n", 1666 | "- inner join, outer join, left join, right join\n", 1667 | "- Data filtering\n", 1668 | "- Data Aggregation/grouping \n", 1669 | "- Pivot table\n", 1670 | "- Data Visualization: Barplot\n", 1671 | "- Dealing with categorical variables: 1. Label encoding, 2. 
One-hot encoding" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": null, 1677 | "metadata": {}, 1678 | "outputs": [], 1679 | "source": [] 1680 | } 1681 | ], 1682 | "metadata": { 1683 | "kernelspec": { 1684 | "display_name": "Python 3 (ipykernel)", 1685 | "language": "python", 1686 | "name": "python3" 1687 | }, 1688 | "language_info": { 1689 | "codemirror_mode": { 1690 | "name": "ipython", 1691 | "version": 3 1692 | }, 1693 | "file_extension": ".py", 1694 | "mimetype": "text/x-python", 1695 | "name": "python", 1696 | "nbconvert_exporter": "python", 1697 | "pygments_lexer": "ipython3", 1698 | "version": "3.8.5" 1699 | } 1700 | }, 1701 | "nbformat": 4, 1702 | "nbformat_minor": 4 1703 | } 1704 | -------------------------------------------------------------------------------- /ODI_cricket.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SKawsar/Data-Preprocessing-for-ML/7918aaa43c3594ed9781d72fd438f60da9247eae/ODI_cricket.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | I taught Data Preprocessing with Python to 200+ students through an online platform. These are the course materials I built for my students (Mostly graduate-level students from the Non-CS background). 
2 | 3 | Video Lectures on YouTube: https://lnkd.in/gK5epasB 4 | 5 | Medium blogs for pandas: https://kawsar34.medium.com/list/learn-pandas-from-leetcode-a06903853aed 6 | 7 | # Data Preprocessing with Python 8 | 9 | #### Lecture 01: Importing Data with Pandas 10 | - Challenges of reading a CSV file 11 | - Understanding the data 12 | - Finding Data Statistics, data types and missing value information 13 | 14 | #### Lecture 02: Data Preprocessing with Pandas (Part 1) 15 | - Challenges of reading a CSV or Excel file 16 | - Choose columns by name before reading a csv file 17 | - Choose columns by number before reading a csv file 18 | - Reading only the first n number of rows 19 | 20 | #### Lecture 03: Data Preprocessing with Pandas (Part 2) 21 | - How to extract new information from a column? 22 | - How to create a column based on a condition or function? 23 | - Removing a string from a column 24 | - Checking the unique values for each column 25 | - performing calculations on dataframe columns 26 | - dataframe sorting 27 | 28 | #### Lecture 04: HW review 29 | 30 | #### Lecture 05: Handling Missing values 31 | - performing data cleaning 32 | - data visualization of missing values 33 | - string to datetime conversion 34 | - removing missing values 35 | - replacing missing values by: 1. mean, 2. median, 3. constant, 4. interpolation, 5. forward imputation, 6. 
backward imputation 36 | 37 | #### Lecture 06: Data Joining using Pandas 38 | - inner join, outer join, left join, right join 39 | 40 | #### Lecture 07: Data Aggregation/grouping and Pivot table using Pandas 41 | - Data filtering 42 | - Data preprocessing 43 | - Data Aggregation/grouping 44 | - Pivot table 45 | - Data Visualization: Barplot 46 | 47 | #### Lecture 08: Data Correlation and Categorical Variable Encoding 48 | - Dealing with categorical variables 49 | - Label encoding 50 | - One-hot encoding 51 | -------------------------------------------------------------------------------- /batsman_most_runs_ODI.csv: -------------------------------------------------------------------------------- 1 | Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s 2 | SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21368,86.23,49,96,20,2016,195 3 | KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,1385,88 4 | RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,1231,162 5 | ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,1500,270 6 | DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28,1119,76 7 | V Kohli (INDIA),2008-2022,260,251,39,12311,183,58.07,13249,92.92,43,64,15,1153,125 8 | Inzamam-ul-Haq (Asia/PAK),1991-2007,378,350,53,11739,137*,39.52,15812,74.24,10,83,20,971,144 9 | JH Kallis (Afr/ICC/SA),1996-2014,328,314,53,11579,139,44.36,15885,72.89,17,86,17,911,137 10 | SC Ganguly (Asia/INDIA),1992-2007,311,300,23,11363,183,41.02,15416,73.7,22,72,16,1122,190 11 | R Dravid (Asia/ICC/INDIA),1996-2011,344,318,40,10889,153,39.16,15285,71.23,12,83,13,950,42 12 | MS Dhoni (Asia/INDIA),2004-2019,350,297,84,10773,183*,50.57,12303,87.56,10,73,10,826,229 13 | CH Gayle (ICC/WI),1999-2019,301,294,17,10480,215,37.83,12019,87.19,25,54,25,1128,331 14 | BC Lara (ICC/WI),1990-2007,299,289,32,10405,169,40.48,13086,79.51,19,63,16,1042,133 15 | TM Dilshan 
(SL),1999-2016,330,303,41,10290,161*,39.27,11933,86.23,22,47,11,1111,55 16 | Mohammad Yousuf (Asia/PAK),1998-2010,288,273,40,9720,141*,41.71,12942,75.1,15,64,15,785,90 17 | AC Gilchrist (AUS/ICC),1996-2008,287,279,11,9619,172,35.89,9922,96.94,16,55,19,1162,149 18 | AB de Villiers (Afr/SA),2005-2018,228,218,39,9577,176,53.5,9473,101.09,25,53,7,840,204 19 | M Azharuddin (INDIA),1985-2000,334,308,54,9378,153*,36.92,12669,74.02,7,58,9,622+,77+ 20 | PA de Silva (SL),1984-2003,308,296,30,9284,145,34.9,11443,81.13,11,64,17,712+,102+ 21 | RG Sharma (INDIA),2007-2022,230,223,32,9283,264,48.6,10428,89.01,29,44,13,845,245 22 | Saeed Anwar (PAK),1989-2003,247,244,19,8824,194,39.21,10938,80.67,20,43,15,938,97 23 | S Chanderpaul (WI),1994-2011,268,251,40,8778,150,41.6,12408,70.74,11,59,6,722,85 24 | Yuvraj Singh (Asia/INDIA),2000-2017,304,278,40,8701,150,36.55,9924,87.67,14,52,18,908,155 25 | DL Haynes (WI),1978-1994,238,237,28,8648,152*,41.37,13707,63.09,17,57,13,768+,53+ 26 | LRPL Taylor (NZ),2006-2021,233,217,39,8581,181*,48.2,10287,83.41,21,51,9,712,146 27 | MS Atapattu (SL),1990-2007,268,259,32,8529,132*,37.57,12594,67.72,11,59,13,734,15 28 | ME Waugh (AUS),1988-2002,244,236,20,8500,173,39.35,11053,76.9,18,50,16,651,57 29 | V Sehwag (Asia/ICC/INDIA),1999-2013,251,245,9,8273,219,35.05,7929,104.33,15,38,14,1132,136 30 | HM Amla (SA),2008-2019,181,178,14,8113,159,49.46,9178,88.39,27,39,4,822,53 31 | HH Gibbs (SA),1996-2010,248,240,16,8094,175,36.13,9721,83.26,21,37,22,930,128 32 | Shahid Afridi (Asia/ICC/PAK),1996-2015,398,369,27,8064,124,23.57,6892,117,6,39,30,730,351 33 | SP Fleming (ICC/NZ),1994-2007,280,269,21,8037,134*,32.4,11242,71.49,8,49,17,823,63 34 | MJ Clarke (AUS),2003-2015,245,223,44,7981,130,44.58,10104,78.98,8,58,10,665,53 35 | Tamim Iqbal (BAN),2007-2022,225,223,10,7826,158,36.74,9963,78.55,14,52,19,853,99 36 | EJG Morgan (ENG/IRE),2006-2021,246,228,34,7701,148,39.69,8439,91.25,14,47,16,654,220 37 | SR Waugh 
(AUS),1986-2002,325,288,58,7569,120*,32.9,9971,75.91,3,45,15,530,68 38 | Shoaib Malik (PAK),1999-2019,287,258,40,7534,143,34.55,9199,81.9,9,44,15,603,113 39 | A Ranatunga (SL),1982-1999,269,255,47,7456,131*,35.84,9571,77.9,4,49,18,523+,64+ 40 | Javed Miandad (PAK),1975-1996,233,218,41,7381,119*,41.7,11014,67.01,8,50,8,445+,44+ 41 | Younis Khan (PAK),2000-2015,265,255,23,7249,144,31.24,9628,75.29,7,48,22,578,56 42 | Saleem Malik (PAK),1982-1999,283,256,38,7170,102,32.88,9383,76.41,5,47,19,514+,34+ 43 | NJ Astle (NZ),1995-2007,223,217,14,7090,145*,34.92,9760,72.64,16,41,19,720,86 44 | GC Smith (Afr/SA),2002-2013,197,194,10,6989,141,37.98,8648,80.81,10,47,8,788,44 45 | WU Tharanga (Asia/SL),2005-2019,235,223,17,6951,174*,33.74,9155,75.92,15,37,17,798,52 46 | MJ Guptill (NZ),2009-2021,186,183,19,6927,237*,42.23,7896,87.72,16,37,15,702,181 47 | MG Bevan (AUS),1994-2004,232,196,67,6912,108*,53.58,9320,74.16,6,46,5,450,21 48 | G Kirsten (SA),1993-2003,185,185,19,6798,188*,40.95,9436,72.04,13,45,11,659,20 49 | A Flower (ZIM),1992-2003,213,208,16,6786,145,35.34,9097,74.59,4,55,13,532+,26+ 50 | Shakib Al Hasan (BAN),2006-2022,221,209,30,6755,134*,37.73,8212,82.25,9,50,12,611,46 51 | IVA Richards (WI),1975-1991,187,167,24,6721,189*,47,7451,90.2,11,45,7,600+,126+ 52 | Mushfiqur Rahim (BAN),2006-2022,233,218,36,6697,144,36.79,8481,78.96,8,41,10,527,85 53 | BRM Taylor (ZIM),2004-2021,205,203,15,6684,145*,35.55,8721,76.64,11,39,15,599,106 54 | Mohammad Hafeez (PAK),2003-2019,218,216,15,6614,140*,32.9,8633,76.61,11,38,19,664,110 55 | GW Flower (ZIM),1992-2010,221,214,18,6571,142*,33.52,9723,67.58,6,40,18,557+,37+ 56 | Ijaz Ahmed (PAK),1986-2000,250,232,29,6564,139*,32.33,8174,80.3,10,37,14,531+,87+ 57 | AR Border (AUS),1979-1994,273,252,39,6524,127*,30.62,9134,71.42,3,39,11,500,43 58 | S Dhawan (INDIA),2010-2022,149,146,8,6284,143,45.53,6730,93.37,17,35,5,783,74 59 | RB Richardson (WI),1983-1996,224,217,30,6248,122,33.41,9801,63.74,5,44,8,541+,54+ 60 | KS Williamson 
(NZ),2010-2020,151,144,14,6173,148,47.48,7551,81.75,13,39,5,563,49 61 | ML Hayden (AUS/ICC),1993-2008,161,155,15,6133,181*,43.8,7767,78.96,10,36,9,636,87 62 | JE Root (ENG),2013-2021,152,142,23,6109,133*,51.33,7034,86.84,16,35,5,491,44 63 | BB McCullum (NZ),2002-2016,260,228,28,6083,166,30.41,6312,96.37,5,32,20,577,200 64 | DM Jones (AUS),1984-1994,164,161,25,6068,145,44.61,8362,72.56,7,46,6,380+,64+ 65 | DC Boon (AUS),1984-1995,181,177,16,5964,122,37.04,9157,65.13,5,37,6,494,16 66 | JN Rhodes (SA),1992-2003,245,220,51,5935,121,35.11,7336,80.9,2,33,12,392,47 67 | Ramiz Raja (PAK),1985-1997,198,197,15,5841,119*,32.09,9226,63.31,9,31,15,469+,14+ 68 | AD Mathews (SL),2008-2021,218,188,48,5835,139*,41.67,7004,83.3,3,40,15,461,89 69 | RR Sarwan (WI),2000-2013,181,169,33,5804,120*,42.67,7663,75.74,5,38,8,480,58 70 | CL Hooper (WI),1987-2003,227,206,43,5761,113*,35.34,7517,76.63,7,29,7,409,65 71 | SR Watson (AUS),2002-2015,190,169,27,5757,185*,40.54,6365,90.44,9,33,12,570,131 72 | Q de Kock (SA),2013-2022,129,129,6,5658,178,46,5895,95.97,17,28,4,646,88 73 | H Masakadza (ZIM),2001-2019,209,208,4,5658,178*,27.73,7728,73.21,5,34,15,585,86 74 | SK Raina (INDIA),2005-2018,226,194,35,5615,116*,35.31,6005,93.5,5,36,14,476,120 75 | MN Samuels (WI),2000-2018,207,196,26,5606,133*,32.97,7463,75.11,10,30,11,526,118 76 | WJ Cronje (SA),1992-2000,188,175,31,5565,112,38.64,7277,76.47,2,39,8,366,94 77 | F du Plessis (SA),2011-2019,143,136,20,5507,185,47.47,6215,88.6,12,35,3,495,66 78 | DA Warner (AUS),2009-2020,128,126,6,5455,179,45.45,5710,95.53,18,23,2,571,85 79 | MEK Hussey (AUS),2004-2012,185,157,44,5442,109*,48.15,6243,87.16,3,39,3,383,80 80 | IR Bell (ENG),2004-2015,161,157,14,5416,141,37.87,7019,77.16,4,35,6,525,32 81 | A Jadeja (INDIA),1992-2000,196,179,36,5359,119,37.47,7678,69.79,6,30,10,366,85 82 | DR Martyn (AUS),1992-2006,208,182,51,5346,144*,40.8,6877,77.73,5,37,10,441,22 83 | G Gambhir (INDIA),2003-2013,147,143,11,5238,150*,39.68,6144,85.25,11,34,11,561,17 84 | AJ Finch 
(AUS),2013-2020,132,128,3,5232,153*,41.85,5917,88.42,17,29,11,518,126 85 | ADR Campbell (ZIM),1992-2003,188,184,14,5185,131*,30.5,7834,66.18,7,30,11,427+,44+ 86 | RS Mahanama (SL),1986-1999,213,198,23,5162,119*,29.49,8521,60.57,4,35,15,331+,17+ 87 | CG Greenidge (WI),1975-1991,128,127,13,5134,133*,45.03,7908,64.92,11,31,3,470+,81+ 88 | Misbah-ul-Haq (PAK),2002-2015,162,149,31,5122,96*,43.4,6945,73.75,0,42,6,342,83 89 | JP Duminy (SA),2004-2019,199,179,40,5117,150*,36.81,6052,84.55,4,27,7,347,75 90 | PD Collingwood (ENG),2001-2011,197,181,37,5092,120*,35.36,6614,76.98,5,26,7,365,74 91 | A Symonds (AUS),1998-2009,198,161,33,5088,156,39.75,5504,92.44,6,30,15,449,103 92 | Abdul Razzaq (Asia/PAK),1996-2011,265,228,57,5080,112,29.7,6252,81.25,3,23,14,382,124 93 | PR Stirling (IRE),2008-2022,136,133,3,5047,177,38.82,5840,86.42,12,26,10,541,114 -------------------------------------------------------------------------------- /bowler_most_wickets_ODI.csv: -------------------------------------------------------------------------------- 1 | Player,Span,Mat,Inns,Balls,Runs,Wkts,Ave,Econ,SR,4,5 2 | M Muralitharan (Asia/ICC/SL),1993-2011,350,341,18811,12326,534,23.08,3.93,35.2,15,10 3 | Wasim Akram (PAK),1984-2003,356,351,18186,11812,502,23.52,3.89,36.2,17,6 4 | Waqar Younis (PAK),1989-2003,262,258,12698,9919,416,23.84,4.68,30.5,14,13 5 | WPUJC Vaas (Asia/SL),1994-2008,322,320,15775,11014,400,27.53,4.18,39.4,9,4 6 | Shahid Afridi (Asia/ICC/PAK),1996-2015,398,372,17670,13632,395,34.51,4.62,44.7,4,9 7 | SM Pollock (Afr/ICC/SA),1996-2008,303,297,15712,9631,393,24.5,3.67,39.9,12,5 8 | GD McGrath (AUS/ICC),1993-2007,250,248,12970,8391,381,22.02,3.88,34,9,7 9 | B Lee (AUS),2000-2012,221,217,11185,8877,380,23.36,4.76,29.4,14,9 10 | SL Malinga (SL),2004-2019,226,220,10936,9760,338,28.87,5.35,32.3,11,8 11 | A Kumble (Asia/INDIA),1990-2007,271,265,14496,10412,337,30.89,4.3,43,8,2 12 | ST Jayasuriya (Asia/SL),1989-2011,445,368,14874,11871,323,36.75,4.78,46,8,4 13 | J Srinath 
(INDIA),1991-2003,229,227,11935,8847,315,28.08,4.44,37.8,7,3 14 | DL Vettori (ICC/NZ),1997-2015,295,277,14060,9674,305,31.71,4.12,46,8,2 15 | SK Warne (AUS/ICC),1993-2005,194,191,10642,7541,293,25.73,4.25,36.3,12,1 16 | Saqlain Mushtaq (PAK),1995-2003,169,165,8770,6275,288,21.78,4.29,30.4,11,6 17 | AB Agarkar (INDIA),1998-2007,191,188,9484,8021,288,27.85,5.07,32.9,10,2 18 | Shakib Al Hasan (BAN),2006-2022,221,218,11351,8401,285,29.47,4.44,39.8,9,3 19 | Z Khan (Asia/INDIA),2000-2012,200,197,10097,8301,282,29.43,4.93,35.8,7,1 20 | JH Kallis (Afr/ICC/SA),1996-2014,328,283,10750,8680,273,31.79,4.84,39.3,2,2 21 | AA Donald (SA),1991-2003,164,162,8561,5926,272,21.78,4.15,31.4,11,2 22 | Mashrafe Mortaza (Asia/BAN),2001-2020,220,220,10922,8893,270,32.93,4.88,40.4,7,1 23 | JM Anderson (ENG),2002-2015,194,191,9584,7861,269,29.22,4.92,35.6,11,2 24 | Abdul Razzaq (Asia/PAK),1996-2011,265,254,10941,8564,269,31.83,4.69,40.6,8,3 25 | Harbhajan Singh (Asia/INDIA),1998-2015,236,227,12479,8973,269,33.35,4.31,46.3,2,3 26 | M Ntini (ICC/SA),1998-2009,173,171,8687,6559,266,24.65,4.53,32.6,8,4 27 | N Kapil Dev (INDIA),1978-1994,225,221,11202,6945,253,27.45,3.71,44.2,3,1 28 | Shoaib Akhtar (Asia/ICC/PAK),1998-2011,163,162,7764,6169,247,24.97,4.76,31.4,6,4 29 | KD Mills (NZ),2001-2015,170,169,8230,6485,240,27.02,4.72,34.2,8,1 30 | MG Johnson (AUS),2005-2015,153,150,7489,6038,239,25.26,4.83,31.3,9,3 31 | HH Streak (Afr/ZIM),1993-2005,189,185,9468,7129,239,29.82,4.51,39.6,7,1 32 | D Gough (ENG/ICC),1994-2006,159,156,8470,6209,235,26.42,4.39,36,10,2 33 | CA Walsh (WI),1985-2000,205,204,10822,6918,227,30.47,3.83,47.6,6,1 34 | CEL Ambrose (WI),1988-2000,176,175,9353,5429,225,24.12,3.48,41.5,6,4 35 | Abdur Razzak (BAN),2004-2014,153,152,7965,6065,207,29.29,4.56,38.4,5,4 36 | CJ McDermott (AUS),1985-1996,138,138,7461,5018,203,24.71,4.03,36.7,4,1 37 | CZ Harris (NZ),1990-2004,250,232,10667,7613,203,37.5,4.28,52.5,2,1 38 | CL Cairns (ICC/NZ),1991-2006,215,186,8168,6594,201,32.8,4.84,40.6,3,1 39 | 
DJ Bravo (WI),2004-2014,164,150,6511,5874,199,29.51,5.41,32.7,6,1 40 | KMDN Kulasekara (SL),2003-2017,184,181,8263,6751,199,33.92,4.9,41.5,4,1 41 | DW Steyn (Afr/SA),2005-2019,125,124,6256,5087,196,25.95,4.87,31.9,4,3 42 | BKV Prasad (INDIA),1994-2001,161,160,8129,6332,196,32.3,4.67,41.4,3,1 43 | MA Starc (AUS),2010-2021,99,99,5099,4379,195,22.45,5.15,26.1,11,8 44 | SR Waugh (AUS),1986-2002,325,207,8883,6761,195,34.67,4.56,45.5,3,0 45 | CL Hooper (WI),1987-2003,227,203,9573,6958,193,36.05,4.36,49.6,3,0 46 | L Klusener (SA),1996-2004,171,164,7336,5751,192,29.95,4.7,38.2,1,6 47 | TG Southee (NZ),2008-2020,143,141,7195,6558,190,34.51,5.46,37.8,4,3 48 | M Morkel (Afr/SA),2007-2018,117,114,5760,4761,188,25.32,4.95,30.6,7,2 49 | RA Jadeja (INDIA),2009-2020,168,164,8557,7024,188,37.36,4.92,45.5,7,1 50 | CRD Fernando (Asia/SL),2001-2012,147,141,6507,5648,187,30.2,5.2,34.7,3,1 51 | Saeed Ajmal (PAK),2008-2015,113,112,6000,4182,184,22.72,4.18,32.6,6,2 52 | Imran Khan (PAK),1974-1992,175,153,7461,4844,182,26.61,3.89,40.9,3,1 53 | Aaqib Javed (PAK),1988-1998,163,159,8012,5721,182,31.43,4.28,44,2,4 54 | Umar Gul (PAK),2003-2016,130,128,6064,5253,179,29.34,5.19,33.8,4,2 55 | SCJ Broad (ENG),2006-2016,121,121,6109,5364,178,30.13,5.26,34.3,9,1 56 | NLTC Perera (SL),2009-2021,166,157,5900,5740,175,32.8,5.83,33.7,5,4 57 | NW Bracken (AUS),2001-2009,116,116,5759,4240,174,24.36,4.41,33,5,2 58 | Imran Tahir (SA),2011-2019,107,104,5541,4297,173,24.83,4.65,32,7,3 59 | JDP Oram (NZ),2001-2012,160,154,6911,5047,173,29.17,4.38,39.9,3,2 60 | IK Pathan (INDIA),2004-2012,120,118,5855,5142,173,29.72,5.26,33.8,5,2 61 | A Flintoff (ENG/ICC),1999-2009,141,119,5624,4121,169,24.38,4.39,33.2,6,2 62 | TA Boult (NZ),2012-2021,93,93,5117,4261,169,25.21,4.99,30.2,8,5 63 | SR Watson (AUS),2002-2015,190,163,6466,5342,168,31.79,4.95,38.4,3,0 64 | CH Gayle (ICC/WI),1999-2019,301,199,7424,5926,167,35.48,4.78,44.4,3,1 65 | Mushtaq Ahmed (PAK),1989-2003,144,142,7543,5361,161,33.29,4.26,46.8,3,1 66 | AU Rashid 
(ENG),2009-2021,112,106,5573,5251,159,33.02,5.65,35,7,2 67 | RJ Hadlee (NZ),1973-1990,115,112,6182,3407,158,21.56,3.3,39.1,1,5 68 | Shoaib Malik (PAK),1999-2019,287,217,7958,6192,158,39.18,4.66,50.3,1,0 69 | MD Marshall (WI),1980-1992,136,134,7175,4233,157,26.96,3.53,45.7,6,0 70 | M Prabhakar (INDIA),1984-1996,130,127,6360,4534,157,28.87,4.27,40.5,4,2 71 | A Nehra (Asia/INDIA),2001-2011,120,120,5751,4981,157,31.72,5.19,36.6,5,2 72 | GB Hogg (AUS),1996-2008,123,113,5564,4188,156,26.84,4.51,35.6,3,2 73 | CR Woakes (ENG),2011-2021,106,102,5016,4567,155,29.46,5.46,32.3,10,3 74 | SR Tendulkar (INDIA),1989-2012,463,270,8054,6850,154,44.48,5.1,52.2,4,2 75 | BAW Mendis (SL),2008-2015,87,84,4154,3324,152,21.86,4.8,27.3,7,3 76 | Rashid Khan (AFG),2015-2022,80,76,4074,2821,151,18.68,4.15,26.9,5,4 77 | UDU Chandana (SL),1994-2007,147,136,6142,4818,151,31.9,4.7,40.6,4,1 78 | R Ashwin (INDIA),2010-2022,113,111,6141,5058,151,33.49,4.94,40.6,1,0 79 | -------------------------------------------------------------------------------- /gre.csv: -------------------------------------------------------------------------------- 1 | Date,verbal_score,quant_score 2 | 09/01/2021,0,1 3 | 09/02/2021,1,2 4 | 09/03/2021,2,3 5 | 09/04/2021,3, 6 | 09/05/2021,4, 7 | 09/06/2021,, 8 | 09/07/2021,,7 9 | 09/08/2021,7,8 10 | 09/09/2021,8,9 11 | 09/10/2021,9,10 --------------------------------------------------------------------------------