├── ComputerSales.csv ├── Financial Sample.xlsx ├── Pandas Tutorial.ipynb ├── README.md └── icecreamsales.csv /ComputerSales.csv: -------------------------------------------------------------------------------- 1 | Sale ID,Contact,Sex,Age,State,Product ID,Product Type,Sale Price,Profit,Lead,Month,Year 2 | 1,Paul Thomas,M,43,OH,M01-F0024,Desktop,479.99,143.39,Website,January,2018 3 | 2,Margo Simms,F,37,WV,GT13-0024,Desktop,1249.99,230.89,Flyer 4,January,2018 4 | 3,Sam Stine,M,26,PA,I3670,Desktop,649.99,118.64,Website,February,2018 5 | 4,Moe Eggert,M,35,PA,I3593,Laptop,399.99,72.09,Website,March,2018 6 | 5,Jessica Elk,F,55,PA,15M-ED,Laptop,699.99,98.09,Flyer 4,March,2018 7 | 6,Sally Struthers,F,45,PA,GT13-0024,Desktop,1249.99,230.89,Flyer 2,April,2018 8 | 7,Michelle Samms,F,46,OH,GA401IV,Laptop,1349.99,180.34,Email,May,2018 9 | 8,Mick Roberts,M,23,OH,MY2J2LL,Tablet,999.99,146.69,Website,July,2018 10 | 9,Ed Klondike,M,52,OH,81TC00,Laptop,649.99,122.34,Email,July,2018 11 | 10,Phil Jones,M,56,WV,M01-F0024,Desktop,479.99,143.39,Flyer 2,August,2018 12 | 11,Rick James,M,49,PA,GA401IV,Laptop,1349.99,180.34,Flyer 3,November,2018 13 | 12,Sue Etna,F,54,OH,GT13-0024,Desktop,1249.99,230.89,Flyer 2,November,2018 14 | 13,Jason Case,M,57,PA,81TC00,Laptop,649.99,122.34,Email,November,2018 15 | 14,Doug Johnson,M,51,PA,I3670,Desktop,649.99,118.64,Website,December,2018 16 | 15,Andy Sands,M,56,OH,MY2J2LL,Tablet,999.99,146.69,Flyer 1,December,2018 17 | 16,Kim Collins,F,49,PA,I3593,Laptop,399.99,72.09,Flyer 2,January,2019 18 | 17,Edna Sanders,F,46,OH,15M-ED,Laptop,699.99,98.09,Email,February,2019 19 | 18,Michelle Samms,F,46,NY,MY2J2LL,Tablet,999.99,146.69,Website,March,2019 20 | 19,Mick Roberts,M,23,PA,I3593,Laptop,399.99,72.09,Flyer 4,March,2019 21 | 20,Sally Struthers,F,45,NY,81TC00,Laptop,649.99,122.34,Website,April,2019 22 | 21,Jason Case,M,57,PA,M01-F0024,Desktop,479.99,143.39,Flyer 4,May,2019 23 | 22,Doug Johnson,M,51,PA,GA401IV,Laptop,1349.99,180.34,Website,August,2019 24 | 23,Paul 
Thomas,M,43,OH,81TC00,Laptop,649.99,122.34,Website,August,2019 25 | 24,Margo Simms,F,37,WV,Q526FA,Laptop,1049.99,143.09,Flyer 4,November,2019 26 | 25,Michelle Samms,F,46,NY,I3670,Desktop,649.99,118.64,Flyer 2,November,2019 27 | 26,Mick Roberts,M,23,PA,Q526FA,Laptop,1049.99,143.09,Email,November,2019 28 | 27,Ed Klondike,M,52,OH,Q526FA,Laptop,1049.99,143.09,Website,December,2019 29 | 28,Moe Eggert,M,35,PA,15M-ED,Laptop,699.99,98.09,Email,December,2019 30 | 29,Jessica Elk,F,55,PA,GA401IV,Laptop,1349.99,180.34,Flyer 2,December,2019 31 | 30,Phil Jones,M,56,WV,M01-F0024,Desktop,479.99,143.39,Flyer 2,January,2020 32 | 31,Rick James,M,49,PA,GA401IV,Laptop,1349.99,180.34,Flyer 1,January,2020 33 | 32,Sue Etna,F,54,OH,GT13-0024,Desktop,1249.99,230.89,Flyer 2,February,2020 34 | 33,Kim Collins,F,49,PA,I3593,Laptop,399.99,72.09,Flyer 2,March,2020 35 | 34,Edna Sanders,F,46,OH,15M-ED,Laptop,699.99,98.09,Email,March,2020 36 | 35,Michelle Samms,F,46,NY,MY2J2LL,Tablet,999.99,146.69,Website,April,2020 37 | 36,Sally Struthers,F,45,NY,81TC00,Laptop,649.99,122.34,Website,April,2020 38 | 37,Jason Case,M,57,PA,M01-F0024,Desktop,479.99,143.39,Flyer 4,April,2020 39 | 38,Doug Johnson,M,51,PA,GA401IV,Laptop,1349.99,180.34,Website,May,2020 40 | 39,Moe Eggert,M,35,PA,I3593,Laptop,399.99,72.09,Website,May,2020 -------------------------------------------------------------------------------- /Financial Sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/derekbanas/pandas-tutorial/9338573fd6b203a985121d3dc6bc0f03101f5530/Financial Sample.xlsx -------------------------------------------------------------------------------- /Pandas Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Pandas Tutorial" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": 
[ 14 | "Pandas provides numerous tools to work with tabular data like you'd find in spreadsheets or databases. It is widely used for data preparation, cleaning, and analysis. It can work with a wide variety of data and provides many visualization options. It is built on top of NumPy." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Series" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 24, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'rand_nums'" 33 | ] 34 | }, 35 | "execution_count": 24, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "import numpy as np\n", 42 | "import pandas as pd\n", 43 | "\n", 44 | "# Pandas uses something called a dataframe. It is a \n", 45 | "# 2D data structure that can hold multiple data types.\n", 46 | "# Columns have labels.\n", 47 | "\n", 48 | "# Series are built on top of NumPy arrays. \n", 49 | "# Create a series by first creating a list\n", 50 | "list_1 = ['a', 'b', 'c', 'd']\n", 51 | "# I can define that I want the series indexes to be the\n", 52 | "# provided labels\n", 53 | "labels = [1, 2, 3, 4]\n", 54 | "ser_1 = pd.Series(data=list_1, index=labels)\n", 55 | "\n", 56 | "# You can also add a NumPy array\n", 57 | "arr_1 = np.array([1, 2, 3, 4])\n", 58 | "ser_2 = pd.Series(arr_1)\n", 59 | "\n", 60 | "# You can quickly add labels and values with a dictionary\n", 61 | "dict_1 = {\"f_name\": \"Derek\", \n", 62 | " \"l_name\": \"Banas\", \n", 63 | " \"age\": 44}\n", 64 | "ser_3 = pd.Series(dict_1)\n", 65 | "\n", 66 | "# Get data by label\n", 67 | "ser_3[\"f_name\"]\n", 68 | "\n", 69 | "# You can get the datatype\n", 70 | "ser_2.dtype\n", 71 | "\n", 72 | "# You can perform math operations on series\n", 73 | "ser_2 + ser_2\n", 74 | "ser_2 - ser_2\n", 75 | "ser_2 * ser_2\n", 76 | "ser_2 / ser_2\n", 77 | "\n", 78 | "# You can pass them into NumPy methods\n", 79 | "# See NumPy 
tutorial for more math methods\n", 80 | "np.exp(ser_2)\n", 81 | "\n", 82 | "# The difference between Series and ndarray is that operations\n", 83 | "# align by labels\n", 84 | "# Create a series from a dictionary\n", 85 | "ser_4 = pd.Series({4: 5, 5: 6, 6: 7, 7: 8})\n", 86 | "# If labels don't align you will get NaN\n", 87 | "ser_2 + ser_4\n", 88 | "\n", 89 | "# You can assign names to series\n", 90 | "ser_4 = pd.Series({8: 9, 9: 10}, name='rand_nums')\n", 91 | "ser_4.name\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### DataFrames" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "DataFrames are the most commonly used data structure with Pandas. They are made up of multiple series that share the same index / label. They can contain multiple data types. They can be created from dicts, series, lists or other dataframes. " 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "### Creating DataFrames" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 25, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "(2, 3)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "from numpy import random\n", 130 | "\n", 131 | "# Create random 2x3 matrix with values from 10 up to (but not including) 50\n", 132 | "arr_2 = np.random.randint(10, 50, size=(2, 3))\n", 133 | "\n", 134 | "# Create DF with data, row labels & column labels\n", 135 | "df_1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])\n", 136 | "\n", 137 | "# Create a DF from multiple series in a dict\n", 138 | "# If series are of different lengths, the missing entries are NaN\n", 139 | "dict_3 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),\n", 140 | " 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}\n", 141 | "df_2 = pd.DataFrame(dict_3)\n", 142 | "df_2\n", 143 | "\n", 
144 | "# from_dict accepts column labels and lists of values\n", 145 | "pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]))\n", 146 | "\n", 147 | "# You can assign the keys as row labels and set column labels separately\n", 148 | "# with orient='index'\n", 149 | "pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]),\n", 150 | " orient='index', columns=['one','two','three'])\n", 151 | "\n", 152 | "# Get number of rows and columns as tuple\n", 153 | "print(df_1.shape)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Editing & Retrieving Data" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 26, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | " D E\n", 173 | "A 23 23\n", 174 | "B 34 49\n" 175 | ] 176 | }, 177 | { 178 | "data": { 179 | "text/html": [ 180 | "
\n", 181 | "\n", 194 | "\n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | "
A
01.0
19.0
23.0
34.0
\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " A\n", 224 | "0 1.0\n", 225 | "1 9.0\n", 226 | "2 3.0\n", 227 | "3 4.0" 228 | ] 229 | }, 230 | "execution_count": 26, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "# Grab a column\n", 237 | "df_1['C']\n", 238 | "# Get multiple columns\n", 239 | "df_1[['C', 'E']]\n", 240 | "\n", 241 | "# Grab a row as a series\n", 242 | "df_1.loc['A']\n", 243 | "# Grab row by index position\n", 244 | "df_1.iloc[1]\n", 245 | "\n", 246 | "# Grab cell with Row & Column\n", 247 | "df_1.loc['A', 'C']\n", 248 | "# Grab multiple cells by defining rows wanted & the\n", 249 | "# columns from those rows\n", 250 | "print(df_1.loc[['A', 'B'], ['D', 'E']])\n", 251 | "\n", 252 | "# Make new column\n", 253 | "df_1['Total'] = df_1['C'] + df_1['D'] + df_1['E']\n", 254 | "df_1\n", 255 | "\n", 256 | "# You can perform multiple calculations\n", 257 | "df_2['mult'] = df_2['one'] * df_2['two']\n", 258 | "df_2\n", 259 | "\n", 260 | "# Make a new row by appending\n", 261 | "dict_2 = {'C': 44, 'D': 45, 'E': 46}\n", 262 | "new_row = pd.Series(dict_2, name='F')\n", 263 | "df_1 = df_1.append(new_row)\n", 264 | "\n", 265 | "# Delete column and set inplace to True which is required\n", 266 | "# because Pandas tries to help you not delete data\n", 267 | "# by accident\n", 268 | "df_1.drop('Total', axis=1, inplace=True)\n", 269 | "df_1\n", 270 | "# Delete a row\n", 271 | "df_1.drop('B', axis=0, inplace=True)\n", 272 | "df_1\n", 273 | "\n", 274 | "# Create a new column and make it the index\n", 275 | "df_1['Sex'] = ['Men', 'Women']\n", 276 | "df_1.set_index('Sex', inplace=True)\n", 277 | "\n", 278 | "# You can reset index values to numbers\n", 279 | "#df_1.reset_index(inplace=True)\n", 280 | "df_1\n", 281 | "\n", 282 | "# Assign can be used to create a column while leaving the\n", 283 | "# original DF untouched\n", 284 | "df_2.assign(div=df_2['one'] / df_2['two'])\n", 285 | "\n", 286 | "# You can pass in a function as 
well\n", 287 | "df_2.assign(div=lambda x: (x['one'] / x['two']))\n", 288 | "\n", 289 | "# Combine DataFrames while keeping df_3 data unless\n", 290 | "# there is a NaN value\n", 291 | "df_3 = pd.DataFrame({'A': [1., np.nan, 3., np.nan]})\n", 292 | "df_4 = pd.DataFrame({'A': [8., 9., 2., 4.]})\n", 293 | "df_3.combine_first(df_4)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Conditional Selection" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 27, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | " C D E\n", 313 | "A 19 38 16\n", 314 | "B 17 14 13\n", 315 | "Greater than 40\n", 316 | " C D E\n", 317 | "A False False False\n", 318 | "B False False False\n", 319 | "Greater than 45\n", 320 | " C D E\n", 321 | "A False False False\n", 322 | "B False False False\n", 323 | "Series([], Name: C, dtype: int64)\n", 324 | "\n", 325 | "Empty DataFrame\n", 326 | "Columns: [C, D]\n", 327 | "Index: []\n", 328 | "\n", 329 | " X Y Z\n", 330 | "A 1 2 3\n", 331 | "B 4 5 6\n", 332 | "C 7 8 9 \n", 333 | "\n" 334 | ] 335 | }, 336 | { 337 | "data": { 338 | "text/html": [ 339 | "
\n", 340 | "\n", 353 | "\n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | "
XYZ
B456
\n", 371 | "
" 372 | ], 373 | "text/plain": [ 374 | " X Y Z\n", 375 | "B 4 5 6" 376 | ] 377 | }, 378 | "execution_count": 27, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "arr_2 = np.random.randint(10, 50, size=(2, 3))\n", 385 | "df_1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])\n", 386 | "print(df_1)\n", 387 | "\n", 388 | "# You can use conditional operators to retrieve a table\n", 389 | "# based on the condition\n", 390 | "print(\"Greater than 40\\n\", df_1 > 40.0)\n", 391 | "\n", 392 | "# You can use comparison operator functions as well, like\n", 393 | "# gt, lt, ge, le, eq, ne\n", 394 | "print(\"Greater than 45\\n\", df_1.gt(45.0))\n", 395 | "\n", 396 | "# You can place conditions in brackets as well\n", 397 | "bool_1 = df_1 >= 45.0\n", 398 | "df_1[bool_1]\n", 399 | "\n", 400 | "# Get bools for a column\n", 401 | "df_1['E'] > 40\n", 402 | "\n", 403 | "# Return a row if cell value in column matches a condition\n", 404 | "df_1[df_1['E']>30]\n", 405 | "\n", 406 | "# You can focus on a column based on resulting dataframe\n", 407 | "df_2 = df_1[df_1['E']>30]\n", 408 | "df_2['C']\n", 409 | "\n", 410 | "# You can stack these commands\n", 411 | "print(df_1[df_1['E']>20]['C'])\n", 412 | "print()\n", 413 | "\n", 414 | "# You can also grab multiple columns\n", 415 | "print(df_1[df_1['E']>20][['C', 'D']])\n", 416 | "print()\n", 417 | "\n", 418 | "# You can use multiple conditions\n", 419 | "arr_3 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n", 420 | "df_2 = pd.DataFrame(arr_3, ['A', 'B', 'C'], ['X', 'Y', 'Z'])\n", 421 | "print(df_2, \"\\n\")\n", 422 | "# You can use or | to combine conditions as well\n", 423 | "df_2[(df_2['X']>3) & (df_2['X']<7)]\n", 424 | "\n" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "### File Input / Output " 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "Pandas can work with the following 
types of data : CSV, Plain Text, JSON, XML, PDF, SQL, HTML, XLSX, DOCX, ZIP, Images Hierarchical Data Format, MP3, and MP4." 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 28, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | " student_id first_name last_name email street \\\n", 451 | "0 1 Dale Cooper dcooper@aol.com 123 Main St \n", 452 | "1 2 Harry Truman htruman@aol.com 202 South St \n", 453 | "2 3 Shelly Johnson sjohnson@aol.com 9 Pond Rd \n", 454 | "3 4 Bobby Briggs bbriggs@aol.com 14 12th St \n", 455 | "4 5 Donna Hayward dhayward@aol.com 120 16th St \n", 456 | "5 6 Audrey Horne ahorne@aol.com 342 19th St \n", 457 | "6 7 James Hurley jhurley@aol.com 2578 Cliff St \n", 458 | "7 8 Lucy Moran lmoran@aol.com 178 Dover St \n", 459 | "8 9 Tommy Hill thill@aol.com 672 High Plains \n", 460 | "9 10 Andy Brennan abrennan@aol.com 281 4th St \n", 461 | "10 13 Frank Silva fsilva@aol.com 666 Hell St \n", 462 | "11 14 Frank Silva fsilva@aol.com 666 Hell St \n", 463 | "12 15 Frank Silva fsilva@aol.com 666 Hell St \n", 464 | "13 16 Frank Silva fsilva@aol.com 666 Hell St \n", 465 | "\n", 466 | " city state zip phone birth_date sex \\\n", 467 | "0 Yakima WA 98901 792-223-8901 1959-02-22 M \n", 468 | "1 Vancouver WA 98660 792-223-9810 1946-01-24 M \n", 469 | "2 Sparks NV 89431 792-223-6734 1970-12-12 F \n", 470 | "3 San Diego CA 92101 792-223-6178 1967-05-24 M \n", 471 | "4 Davenport IA 52801 792-223-2001 1970-03-24 F \n", 472 | "5 Detroit MI 48222 792-223-2001 1965-02-01 F \n", 473 | "6 Queens NY 11427 792-223-1890 1967-01-02 M \n", 474 | "7 Hollywood CA 90078 792-223-9678 1954-11-27 F \n", 475 | "8 Tucson AZ 85701 792-223-1115 1951-12-21 M \n", 476 | "9 Jacksonville NC 28540 792-223-8902 1960-12-27 M \n", 477 | "10 Yakima WA 98901 792-223-8966 1959-02-22 M \n", 478 | "11 Yakima WA 98901 792-223-8966 1959-02-22 M \n", 479 | "12 Yakima WA 98901 792-223-8966 1959-02-22 M \n", 
480 | "13 Yakima WA 98901 792-223-8966 1959-02-22 M \n", 481 | "\n", 482 | " date_entered lunch_cost \n", 483 | "0 2019-12-10 13:09:03 3.5 \n", 484 | "1 2019-12-10 13:19:12 3.5 \n", 485 | "2 2019-12-10 13:19:12 3.5 \n", 486 | "3 2019-12-10 13:19:12 3.5 \n", 487 | "4 2019-12-10 13:19:12 3.5 \n", 488 | "5 2019-12-10 13:19:12 3.5 \n", 489 | "6 2019-12-10 13:19:12 3.5 \n", 490 | "7 2019-12-10 13:19:12 3.5 \n", 491 | "8 2019-12-10 13:19:12 3.5 \n", 492 | "9 2019-12-10 13:19:12 3.5 \n", 493 | "10 2020-08-09 13:42:56 3.5 \n", 494 | "11 2020-08-11 09:54:40 3.5 \n", 495 | "12 2020-08-12 16:43:43 3.5 \n", 496 | "13 2020-08-12 16:54:12 3.5 \n" 497 | ] 498 | }, 499 | { 500 | "data": { 501 | "text/plain": [ 502 | "0 OH\n", 503 | "1 WV\n", 504 | "2 PA\n", 505 | "3 PA\n", 506 | "4 PA\n", 507 | "5 PA\n", 508 | "6 OH\n", 509 | "7 OH\n", 510 | "8 OH\n", 511 | "9 WV\n", 512 | "10 PA\n", 513 | "11 OH\n", 514 | "12 PA\n", 515 | "13 PA\n", 516 | "14 OH\n", 517 | "15 PA\n", 518 | "16 OH\n", 519 | "17 NY\n", 520 | "18 PA\n", 521 | "19 NY\n", 522 | "20 PA\n", 523 | "21 PA\n", 524 | "22 OH\n", 525 | "23 WV\n", 526 | "24 NY\n", 527 | "25 PA\n", 528 | "26 OH\n", 529 | "27 PA\n", 530 | "28 PA\n", 531 | "29 WV\n", 532 | "30 PA\n", 533 | "31 OH\n", 534 | "32 PA\n", 535 | "33 OH\n", 536 | "34 NY\n", 537 | "35 NY\n", 538 | "36 PA\n", 539 | "37 PA\n", 540 | "38 PA\n", 541 | "Name: State, dtype: object" 542 | ] 543 | }, 544 | "execution_count": 28, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "import pymysql\n", 551 | "\n", 552 | "# Read a CSV file\n", 553 | "# Type pd.read_ [TAB] to see the file types you can read\n", 554 | "cs_df = pd.read_csv('ComputerSales.csv')\n", 555 | "\n", 556 | "# Save a CSV file, but don't save the index as a column\n", 557 | "cs_df.to_csv('ComputerSalesBU.csv', index=False)\n", 558 | "\n", 559 | "# You can read data from Excel, but not formulas and macros\n", 560 | "pd.read_excel('Financial Sample.xlsx',0)\n", 561 | 
"\n", 562 | "# Write to Excel\n", 563 | "cs_df.to_excel('ComputerSales.xlsx')\n", 564 | "\n", 565 | "# Check if written\n", 566 | "pd.read_excel('ComputerSales.xlsx',0)\n", 567 | "\n", 568 | "# Read from MySQL Database\n", 569 | "try:\n", 570 | " db_connection = pymysql.connect(db='students', user='studentadmin', passwd='TurtleDove', host='localhost', port=3306)\n", 571 | "\n", 572 | " stud_df = pd.read_sql('SELECT * FROM students', con=db_connection)\n", 573 | " # print(stud_df)\n", 574 | "except Exception as e:\n", 575 | " print(\"Exception : {}\".format(e))\n", 576 | "finally:\n", 577 | " db_connection.close()\n", 578 | " \n", 579 | "\n", 580 | "# Write to table \n", 581 | "try:\n", 582 | " db_connection = pymysql.connect(db='students', user='studentadmin', passwd='TurtleDove', host='localhost', port=3306)\n", 583 | " # Used to issue queries\n", 584 | " cursor = db_connection.cursor()\n", 585 | " # Query to enter new student\n", 586 | " insert_stmt = \"INSERT INTO students VALUES(NULL, 'Frank', 'Silva', 'fsilva@aol.com', '666 Hell St', 'Yakima', 'WA', 98901, '792-223-8966', '1959-2-22', 'M', NOW(), 3.50)\"\n", 587 | " # Execute query\n", 588 | " cursor.execute(insert_stmt)\n", 589 | " # Commit changes to DB\n", 590 | " db_connection.commit()\n", 591 | " stud_df = pd.read_sql('SELECT * FROM students', con=db_connection)\n", 592 | " print(stud_df)\n", 593 | "except Exception as e:\n", 594 | " print(\"Exception : {}\".format(e))\n", 595 | "finally:\n", 596 | " db_connection.close()\n", 597 | "\n", 598 | "# Just get 1 column of data \n", 599 | "cs_df_st = pd.read_csv('ComputerSales.csv', usecols=[\"State\"], squeeze=True)\n", 600 | "cs_df_st\n" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "### Basics & Math" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 29, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 
| " one two\n", 620 | "a 1.0 1.0\n", 621 | "b 2.0 2.0\n", 622 | "c 3.0 3.0\n", 623 | "d 0.0 4.0\n" 624 | ] 625 | }, 626 | { 627 | "data": { 628 | "text/html": [ 629 | "
\n", 630 | "\n", 643 | "\n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | "
onetwo
aFalseFalse
bFalseFalse
cFalseFalse
dTrueFalse
\n", 674 | "
" 675 | ], 676 | "text/plain": [ 677 | " one two\n", 678 | "a False False\n", 679 | "b False False\n", 680 | "c False False\n", 681 | "d True False" 682 | ] 683 | }, 684 | "execution_count": 29, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "# Display 1st 5 rows\n", 691 | "cs_df.head()\n", 692 | "# Display last 5 rows\n", 693 | "cs_df.tail()\n", 694 | "# Get 1st 2\n", 695 | "cs_df[:2]\n", 696 | "# Get 1st through 5 with a 2 step\n", 697 | "cs_df[:5:2]\n", 698 | "\n", 699 | "# Get indexes\n", 700 | "cs_df.index.array\n", 701 | "# Get NumPy array\n", 702 | "cs_df.to_numpy()\n", 703 | "# Get array from series\n", 704 | "ser_1.array\n", 705 | "\n", 706 | "dict_3 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),\n", 707 | " 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}\n", 708 | "df_2 = pd.DataFrame(dict_3)\n", 709 | "\n", 710 | "# You can replace NaN values with 0 or anything else\n", 711 | "print(df_2.fillna(0))\n", 712 | "# Get values in row 2\n", 713 | "row = df_2.iloc[1]\n", 714 | "# Add items in row 2 to all rows including row 2\n", 715 | "# You can do the same with sub, mul, and div\n", 716 | "df_2.add(row, axis='columns')\n", 717 | "\n", 718 | "# Get column 2\n", 719 | "col = df_2['two']\n", 720 | "# Subtract from other columns\n", 721 | "df_2.sub(col, axis=0)\n", 722 | "\n", 723 | "# Check if empty\n", 724 | "df_2.empty\n", 725 | "\n", 726 | "# Transform executes a function on a dataframe\n", 727 | "df_5 = pd.DataFrame({'A': range(3), 'B': range(1, 4)})\n", 728 | "df_5.transform(lambda x: x+1)\n", 729 | "df_5.transform(lambda x: x**2)\n", 730 | "df_5.transform(lambda x: np.sqrt(x))\n", 731 | "# You can transform using multiple functions\n", 732 | "df_5.transform([lambda x: x**2, lambda x: x**3])\n", 733 | "# Passing a dictionary allows you to perform different calculations\n", 734 | "# on different columns\n", 735 | "df_5.transform({'A': lambda x: x**2, 'B': lambda x: x**3})\n", 
736 | "\n", 737 | "# map performs a function on a series\n", 738 | "df_5['A'].map(lambda x: x**2)\n", 739 | "\n", 740 | "# applymap does the same on a dataframe\n", 741 | "df_5.applymap(lambda x: x**2)\n", 742 | "\n", 743 | "# Get unique values in column 2 of DF\n", 744 | "df_2['two'].unique()\n", 745 | "\n", 746 | "# Get number of uniques\n", 747 | "df_2['two'].nunique()\n", 748 | "\n", 749 | "# Get the number of times each value showed in column 2\n", 750 | "df_2['two'].value_counts()\n", 751 | "\n", 752 | "# Get column names\n", 753 | "df_2.columns\n", 754 | "\n", 755 | "# Get index info\n", 756 | "df_2.index\n", 757 | "\n", 758 | "# Return a DF that lists null values as True\n", 759 | "df_2.isnull()" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | "source": [ 766 | "### Group Data" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 30, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "data": { 776 | "text/html": [ 777 | "
\n", 778 | "\n", 795 | "\n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | "
Sales
countmeanstdmin25%50%75%max
Store
12.022.05.65685418.020.022.024.026.0
22.017.07.07106812.014.517.019.522.0
\n", 849 | "
" 850 | ], 851 | "text/plain": [ 852 | " Sales \n", 853 | " count mean std min 25% 50% 75% max\n", 854 | "Store \n", 855 | "1 2.0 22.0 5.656854 18.0 20.0 22.0 24.0 26.0\n", 856 | "2 2.0 17.0 7.071068 12.0 14.5 17.0 19.5 22.0" 857 | ] 858 | }, 859 | "execution_count": 30, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "# Groupby allows you to group rows based on a column and perform a function\n", 866 | "# that combines those values (Aggregate Function)\n", 867 | "dict_5 = {'Store': [1,2,1,2], 'Flavor': ['Choc', 'Van', 'Straw', 'Choc'], \n", 868 | " 'Sales': [26, 12, 18, 22]}\n", 869 | "\n", 870 | "df_11 = pd.DataFrame(dict_5)\n", 871 | "\n", 872 | "# Group data by the store number\n", 873 | "by_store = df_11.groupby('Store')\n", 874 | "# Get mean sales by store\n", 875 | "by_store.mean()\n", 876 | "\n", 877 | "# Get sales total just for store 1\n", 878 | "by_store.sum().loc[1]\n", 879 | "\n", 880 | "# You can use multiple functions or get a bunch at once\n", 881 | "by_store.describe()" 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "metadata": {}, 887 | "source": [ 888 | "### Concatenate Merge & Join Data" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": 31, 894 | "metadata": {}, 895 | "outputs": [ 896 | { 897 | "data": { 898 | "text/html": [ 899 | "
\n", 900 | "\n", 913 | "\n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
ABCD
11.04.07.010.0
22.05.0NaNNaN
33.06.0NaNNaN
4NaNNaN8.011.0
5NaNNaN9.012.0
\n", 961 | "
" 962 | ], 963 | "text/plain": [ 964 | " A B C D\n", 965 | "1 1.0 4.0 7.0 10.0\n", 966 | "2 2.0 5.0 NaN NaN\n", 967 | "3 3.0 6.0 NaN NaN\n", 968 | "4 NaN NaN 8.0 11.0\n", 969 | "5 NaN NaN 9.0 12.0" 970 | ] 971 | }, 972 | "execution_count": 31, 973 | "metadata": {}, 974 | "output_type": "execute_result" 975 | } 976 | ], 977 | "source": [ 978 | "# You can concatenate DFs in the order DFs are provided\n", 979 | "df_12 = pd.DataFrame({'A': [1,2,3],\n", 980 | " 'B': [4,5,6]},\n", 981 | " index=[1,2,3])\n", 982 | "df_13 = pd.DataFrame({'A': [7,8,9],\n", 983 | " 'B': [10,11,12]},\n", 984 | " index=[4,5,6])\n", 985 | "pd.concat([df_12, df_13])\n", 986 | "\n", 987 | "# Merge 2 DFs using their shared key column\n", 988 | "df_12 = pd.DataFrame({'A': [1,2,3],\n", 989 | " 'B': [4,5,6],\n", 990 | " 'key': [1,2,3]})\n", 991 | "df_13 = pd.DataFrame({'A': [7,8,9],\n", 992 | " 'B': [10,11,12],\n", 993 | " 'key': [1,2,3]})\n", 994 | "# inner merges at the intersection of keys\n", 995 | "pd.merge(df_12, df_13, how='inner', on='key')\n", 996 | "# how='left' or 'right' : Use keys from left or right frame\n", 997 | "# how='outer' : Use union of keys\n", 998 | "\n", 999 | "# You can join DFs with different indexes and instead of using \n", 1000 | "# keys use a column\n", 1001 | "df_12 = pd.DataFrame({'A': [1,2,3],\n", 1002 | " 'B': [4,5,6]},\n", 1003 | " index=[1,2,3])\n", 1004 | "df_13 = pd.DataFrame({'C': [7,8,9],\n", 1005 | " 'D': [10,11,12]},\n", 1006 | " index=[1,4,5])\n", 1007 | "df_12.join(df_13, how='outer')" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "markdown", 1012 | "metadata": {}, 1013 | "source": [ 1014 | "### Statistics" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": 33, 1020 | "metadata": {}, 1021 | "outputs": [ 1022 | { 1023 | "name": "stdout", 1024 | "output_type": "stream", 1025 | "text": [ 1026 | " one two\n", 1027 | "a 1.0 1.0\n", 1028 | "b 2.0 2.0\n", 1029 | "c 3.0 3.0\n", 1030 | "d NaN 4.0\n" 1031 | ] 1032 | }, 1033 | { 1034 | 
"data": { 1035 | "text/html": [ 1036 | "
\n", 1037 | "\n", 1050 | "\n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | "
onetwo
mean2.02.500000
std1.01.290994
\n", 1071 | "
" 1072 | ], 1073 | "text/plain": [ 1074 | " one two\n", 1075 | "mean 2.0 2.500000\n", 1076 | "std 1.0 1.290994" 1077 | ] 1078 | }, 1079 | "execution_count": 33, 1080 | "metadata": {}, 1081 | "output_type": "execute_result" 1082 | } 1083 | ], 1084 | "source": [ 1085 | "# Get ice cream sales data\n", 1086 | "ics_df = pd.read_csv('icecreamsales.csv')\n", 1087 | "ics_df\n", 1088 | "\n", 1089 | "# Get total count of both columns\n", 1090 | "ics_df.count()\n", 1091 | "\n", 1092 | "# skipna skips null / NaN values\n", 1093 | "ics_df.sum(skipna=True)\n", 1094 | "# Get mean for named column\n", 1095 | "ics_df[\"Sales\"].mean()\n", 1096 | "ics_df[\"Sales\"].median()\n", 1097 | "ics_df[\"Sales\"].mode()\n", 1098 | "ics_df[\"Sales\"].min()\n", 1099 | "ics_df[\"Sales\"].max()\n", 1100 | "ics_df[\"Sales\"].prod() # Product of values\n", 1101 | "ics_df[\"Sales\"].std() # Standard deviation\n", 1102 | "ics_df[\"Sales\"].var() # Variance\n", 1103 | "ics_df[\"Sales\"].sem() # Standard error\n", 1104 | "# Negative : Left long tail, Positive : Right long tail\n", 1105 | "ics_df[\"Sales\"].skew()\n", 1106 | "# Kurtosis (pandas returns excess kurtosis) : < 0 fewer outliers, 0 Normal Distribution,\n", 1107 | "# > 0 more outliers\n", 1108 | "ics_df[\"Sales\"].kurt()\n", 1109 | "ics_df[\"Sales\"].quantile(.5)\n", 1110 | "ics_df[\"Sales\"].cumsum()\n", 1111 | "ics_df[\"Sales\"].cumprod()\n", 1112 | "ics_df[\"Sales\"].cummax()\n", 1113 | "ics_df[\"Sales\"].cummin()\n", 1114 | "\n", 1115 | "# Multiple stats at once\n", 1116 | "ics_df.describe()\n", 1117 | "\n", 1118 | "ser_dice = pd.Series(data=[2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, \n", 1119 | " 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8,\n", 1120 | " 8, 8, 9, 9, 9, 9, 10, 10, 10, 11, 11, 12])\n", 1121 | "# Count for each value in series\n", 1122 | "ser_dice.value_counts()\n", 1123 | "\n", 1124 | "# You can perform calculations on multiple columns using\n", 1125 | "# aggregate\n", 1126 | "print(df_2)\n", 1127 | "df_2.agg(np.mean)\n", 1128 | "\n", 1129 | "# You can do this with 
multiple functions\n", 1130 | "df_2.agg(['mean', 'std'])\n", 1131 | "\n" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "markdown", 1136 | "metadata": {}, 1137 | "source": [ 1138 | "### Iteration" 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "code", 1143 | "execution_count": 37, 1144 | "metadata": {}, 1145 | "outputs": [ 1146 | { 1147 | "name": "stdout", 1148 | "output_type": "stream", 1149 | "text": [ 1150 | "0\n", 1151 | "1\n", 1152 | "2\n", 1153 | "3\n", 1154 | "4\n", 1155 | "\n", 1156 | " C D E\n", 1157 | "B 22 40 23\n", 1158 | "C 44 42 45\n", 1159 | "C\n", 1160 | "B 22\n", 1161 | "C 44\n", 1162 | "Name: C, dtype: int64\n", 1163 | "D\n", 1164 | "B 40\n", 1165 | "C 42\n", 1166 | "Name: D, dtype: int64\n", 1167 | "E\n", 1168 | "B 23\n", 1169 | "C 45\n", 1170 | "Name: E, dtype: int64\n", 1171 | "\n", 1172 | "B\n", 1173 | "C 22\n", 1174 | "D 40\n", 1175 | "E 23\n", 1176 | "Name: B, dtype: int64\n", 1177 | "C\n", 1178 | "C 44\n", 1179 | "D 42\n", 1180 | "E 45\n", 1181 | "Name: C, dtype: int64\n", 1182 | "\n", 1183 | "Pandas(Index='B', C=22, D=40, E=23)\n", 1184 | "Pandas(Index='C', C=44, D=42, E=45)\n" 1185 | ] 1186 | } 1187 | ], 1188 | "source": [ 1189 | "# Iterating over series\n", 1190 | "ser_7 = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])\n", 1191 | "for col in ser_7:\n", 1192 | " print(col)\n", 1193 | " \n", 1194 | "print()\n", 1195 | "# Iterating over DFs\n", 1196 | "arr_4 = np.random.randint(10, 50, size=(2, 3))\n", 1197 | "df_8 = pd.DataFrame(arr_4, ['B', 'C'], ['C', 'D', 'E'])\n", 1198 | "print(df_8)\n", 1199 | "\n", 1200 | "# items allows you to iterate through key value pairs to make\n", 1201 | "# calculations 1 column at a time\n", 1202 | "for label, ser in df_8.items():\n", 1203 | " print(label)\n", 1204 | " print(ser)\n", 1205 | " \n", 1206 | "print()\n", 1207 | "# You can also iterate through rows\n", 1208 | "for index, row in df_8.iterrows():\n", 1209 | " print(f\"{index}\\n{row}\")\n", 1210 | "print()\n", 1211 | "\n", 1212 | "# Get a 
tuple that contains row data\n", 1213 | "for row in df_8.itertuples():\n", 1214 | " print(row)" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "markdown", 1219 | "metadata": {}, 1220 | "source": [ 1221 | "### Sorting" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "code", 1226 | "execution_count": 38, 1227 | "metadata": {}, 1228 | "outputs": [ 1229 | { 1230 | "data": { 1231 | "text/html": [ 1232 | "
\n", 1233 | "\n", 1246 | "\n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | "
CDE
B224023
C444245
\n", 1270 | "
" 1271 | ], 1272 | "text/plain": [ 1273 | " C D E\n", 1274 | "B 22 40 23\n", 1275 | "C 44 42 45" 1276 | ] 1277 | }, 1278 | "execution_count": 38, 1279 | "metadata": {}, 1280 | "output_type": "execute_result" 1281 | } 1282 | ], 1283 | "source": [ 1284 | "df_8\n", 1285 | "\n", 1286 | "# Sorting by index will return the same results if indexes\n", 1287 | "# are in order, to reverse indexes mark ascending as False\n", 1288 | "df_8.sort_index(ascending=False)\n", 1289 | "\n", 1290 | "# Sort by value for column D (Use the same function for series)\n", 1291 | "df_8.sort_values(by='D')\n" 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "markdown", 1296 | "metadata": {}, 1297 | "source": [ 1298 | "### Passing Data to Functions" 1299 | ] 1300 | }, 1301 | { 1302 | "cell_type": "code", 1303 | "execution_count": 39, 1304 | "metadata": {}, 1305 | "outputs": [ 1306 | { 1307 | "name": "stdout", 1308 | "output_type": "stream", 1309 | "text": [ 1310 | "Total Profit : 5459.010000000001\n" 1311 | ] 1312 | }, 1313 | { 1314 | "data": { 1315 | "text/html": [ 1316 | "
\n", 1317 | "\n", 1330 | "\n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | "
Sale IDContactSexAgeStateProduct IDProduct TypeSale PriceProfitLeadMonthYearFirst NameLast NameAge Group
01Paul ThomasM43OHM01-F0024Desktop479.99143.39WebsiteJanuary2018PaulThomas30-50
12Margo SimmsF37WVGT13-0024Desktop1249.99230.89Flyer 4January2018MargoSimms30-50
23Sam StineM26PAI3670Desktop649.99118.64WebsiteFebruary2018SamStine<30
34Moe EggertM35PAI3593Laptop399.9972.09WebsiteMarch2018MoeEggert30-50
45Jessica ElkF55PA15M-EDLaptop699.9998.09Flyer 4March2018JessicaElk>50
\n", 1444 | "
" 1445 | ], 1446 | "text/plain": [ 1447 | " Sale ID Contact Sex Age State Product ID Product Type Sale Price \\\n", 1448 | "0 1 Paul Thomas M 43 OH M01-F0024 Desktop 479.99 \n", 1449 | "1 2 Margo Simms F 37 WV GT13-0024 Desktop 1249.99 \n", 1450 | "2 3 Sam Stine M 26 PA I3670 Desktop 649.99 \n", 1451 | "3 4 Moe Eggert M 35 PA I3593 Laptop 399.99 \n", 1452 | "4 5 Jessica Elk F 55 PA 15M-ED Laptop 699.99 \n", 1453 | "\n", 1454 | " Profit Lead Month Year First Name Last Name Age Group \n", 1455 | "0 143.39 Website January 2018 Paul Thomas 30-50 \n", 1456 | "1 230.89 Flyer 4 January 2018 Margo Simms 30-50 \n", 1457 | "2 118.64 Website February 2018 Sam Stine <30 \n", 1458 | "3 72.09 Website March 2018 Moe Eggert 30-50 \n", 1459 | "4 98.09 Flyer 4 March 2018 Jessica Elk >50 " 1460 | ] 1461 | }, 1462 | "execution_count": 39, 1463 | "metadata": {}, 1464 | "output_type": "execute_result" 1465 | } 1466 | ], 1467 | "source": [ 1468 | "import sys\n", 1469 | "\n", 1470 | "# You can pass DataFrames and Series into functions\n", 1471 | "def get_profit_total(df):\n", 1472 | " prof_ser = df['Profit']\n", 1473 | " print(f\"Total Profit : {prof_ser.sum()}\")\n", 1474 | "\n", 1475 | "get_profit_total(cs_df)\n", 1476 | "\n", 1477 | "# Receives a DataFrame, splits the contact into new columns\n", 1478 | "# being first and last name\n", 1479 | "def split_name(df):\n", 1480 | " def get_names(full_name):\n", 1481 | " # Split contact at space\n", 1482 | " f_name, l_name = full_name.split()\n", 1483 | " # Create a series with first & last names in columns\n", 1484 | " # with those labels\n", 1485 | " return pd.Series(\n", 1486 | " (f_name, l_name),\n", 1487 | " index=['First Name', 'Last Name']\n", 1488 | " )\n", 1489 | " # apply() executes the function on all names in Contact column\n", 1490 | " names = df['Contact'].apply(get_names)\n", 1491 | " df[names.columns] = names\n", 1492 | " return df\n", 1493 | "\n", 1494 | "# Run function and display top 5 results\n", 1495 | 
"split_name(cs_df).head()\n", 1496 | "\n", 1497 | "# Will assign people to different age groups based on age\n", 1498 | "def create_age_groups(df):\n", 1499 | " # Must have 1 more bins than labels\n", 1500 | " bins = [0, 30, 50, sys.maxsize]\n", 1501 | " # Group labels\n", 1502 | " labels = ['<30', '30-50', '>50']\n", 1503 | " \n", 1504 | " # cut puts values into certain groups based on intervals\n", 1505 | " # The group assigned to <30 has an age between 0 and 30\n", 1506 | " # between 30 & 50 is assigned 30-50 and so on\n", 1507 | " age_group = pd.cut(df['Age'], bins=bins, labels=labels)\n", 1508 | " # Create new column and return new dataframe info\n", 1509 | " df['Age Group'] = age_group\n", 1510 | " return df\n", 1511 | "\n", 1512 | "create_age_groups(cs_df)\n", 1513 | "\n", 1514 | "# You can use a pipe to pass a dataframe to multiple functions\n", 1515 | "cs_df.pipe(split_name).pipe(create_age_groups).head()\n", 1516 | "\n" 1517 | ] 1518 | }, 1519 | { 1520 | "cell_type": "markdown", 1521 | "metadata": {}, 1522 | "source": [ 1523 | "### Aligning, Reindexing and Renaming Labels" 1524 | ] 1525 | }, 1526 | { 1527 | "cell_type": "code", 1528 | "execution_count": 41, 1529 | "metadata": {}, 1530 | "outputs": [ 1531 | { 1532 | "name": "stdout", 1533 | "output_type": "stream", 1534 | "text": [ 1535 | "a 0\n", 1536 | "b 1\n", 1537 | "c 2\n", 1538 | "d 3\n", 1539 | "dtype: int64\n", 1540 | "b 1\n", 1541 | "c 2\n", 1542 | "d 3\n", 1543 | "e 4\n", 1544 | "dtype: int64\n" 1545 | ] 1546 | }, 1547 | { 1548 | "data": { 1549 | "text/html": [ 1550 | "
\n", 1551 | "\n", 1564 | "\n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | "
MenWomenPets
1362338
2223216
\n", 1588 | "
" 1589 | ], 1590 | "text/plain": [ 1591 | " Men Women Pets\n", 1592 | "1 36 23 38\n", 1593 | "2 22 32 16" 1594 | ] 1595 | }, 1596 | "execution_count": 41, 1597 | "metadata": {}, 1598 | "output_type": "execute_result" 1599 | } 1600 | ], 1601 | "source": [ 1602 | "ser_6 = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])\n", 1603 | "sl_1 = ser_6[:4]\n", 1604 | "sl_2 = ser_6[1:]\n", 1605 | "print(sl_1)\n", 1606 | "print(sl_2)\n", 1607 | "# Align both series by the union of their indexes\n", 1608 | "sl_1.align(sl_2)\n", 1609 | "# Align by calling series\n", 1610 | "sl_1.align(sl_2, join='left')\n", 1611 | "# Use passed series indexes\n", 1612 | "sl_1.align(sl_2, join='right')\n", 1613 | "# Get where indexes intersect\n", 1614 | "sl_1.align(sl_2, join='inner')\n", 1615 | "\n", 1616 | "# You can use align with DFs as well\n", 1617 | "arr_3 = np.random.randint(10, 50, size=(2, 3))\n", 1618 | "df_6 = pd.DataFrame(arr_3, ['A', 'B'], ['C', 'D', 'E'])\n", 1619 | "arr_3 = np.random.randint(10, 50, size=(2, 3))\n", 1620 | "df_7 = pd.DataFrame(arr_3, ['B', 'C'], ['C', 'D', 'E'])\n", 1621 | "df_6\n", 1622 | "df_6.align(df_7)\n", 1623 | "\n", 1624 | "# reindex allows you to align data by index\n", 1625 | "ser_6.reindex(['c','b','a'])\n", 1626 | "\n", 1627 | "# Do the same with DFs\n", 1628 | "df_6.reindex(['B','A'])\n", 1629 | "\n", 1630 | "# Drop is very similar to reindex except it receives labels\n", 1631 | "# you don't want to include\n", 1632 | "df_6.drop(['A'], axis=0)\n", 1633 | "df_6.drop(['D'], axis=1)\n", 1634 | "\n", 1635 | "# You can rename labels\n", 1636 | "df_6.rename(columns={'C': 'Men', 'D': 'Women', 'E': 'Pets'},\n", 1637 | " index={'A': 1, 'B': 2})" 1638 | ] 1639 | }, 1640 | { 1641 | "cell_type": "markdown", 1642 | "metadata": {}, 1643 | "source": [ 1644 | "### MultiIndex" 1645 | ] 1646 | }, 1647 | { 1648 | "cell_type": "code", 1649 | "execution_count": 42, 1650 | "metadata": {}, 1651 | "outputs": [ 1652 | { 1653 | "name": "stdout", 1654 | "output_type": 
"stream", 1655 | "text": [ 1656 | "[('Day 1', 1), ('Day 1', 2), ('Day 1', 3), ('Day 2', 1), ('Day 2', 2), ('Day 2', 3)]\n", 1657 | " M F\n", 1658 | "Day 1 1 682 514\n", 1659 | " 2 525 613\n", 1660 | " 3 542 576\n", 1661 | "Day 2 1 553 651\n", 1662 | " 2 676 677\n", 1663 | " 3 645 676\n" 1664 | ] 1665 | }, 1666 | { 1667 | "data": { 1668 | "text/html": [ 1669 | "
\n", 1670 | "\n", 1683 | "\n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | "
CFM
AB
Day 11NaN1.0
22.0NaN
3NaN3.0
Day 214.0NaN
2NaN5.0
36.0NaN
\n", 1733 | "
" 1734 | ], 1735 | "text/plain": [ 1736 | "C F M\n", 1737 | "A B \n", 1738 | "Day 1 1 NaN 1.0\n", 1739 | " 2 2.0 NaN\n", 1740 | " 3 NaN 3.0\n", 1741 | "Day 2 1 4.0 NaN\n", 1742 | " 2 NaN 5.0\n", 1743 | " 3 6.0 NaN" 1744 | ] 1745 | }, 1746 | "execution_count": 42, 1747 | "metadata": {}, 1748 | "output_type": "execute_result" 1749 | } 1750 | ], 1751 | "source": [ 1752 | "# Multi-level indexing allows you to store data on multiple\n", 1753 | "# dimensions\n", 1754 | "days = ['Day 1', 'Day 1', 'Day 1', 'Day 2', 'Day 2', 'Day 2']\n", 1755 | "meals = [1,2,3,1,2,3]\n", 1756 | "# zip pairs the days and meals arrays \n", 1757 | "# Then we create a list of those paired tuples\n", 1758 | "hier_index = list(zip(days, meals))\n", 1759 | "print(hier_index)\n", 1760 | "# Converts list of tuples into each row and column\n", 1761 | "hier_index = pd.MultiIndex.from_tuples(hier_index)\n", 1762 | "# Generate random array representing calories eaten per meal\n", 1763 | "arr_5 = np.random.randint(500, 700, size=(6, 2))\n", 1764 | "df_9 = pd.DataFrame(arr_5, hier_index, ['M', 'F'])\n", 1765 | "print(df_9)\n", 1766 | "\n", 1767 | "# Grab the day 1 DF\n", 1768 | "df_9.loc['Day 1']\n", 1769 | "\n", 1770 | "# Grab 1st row as a series\n", 1771 | "df_9.loc['Day 1'].loc[1]\n", 1772 | "\n", 1773 | "# Grab calories eaten by the female on day 2 for the 2nd meal\n", 1774 | "df_9.loc['Day 2'].loc[2]['F']\n", 1775 | "\n", 1776 | "# We can assign names to the Day and Meals Column\n", 1777 | "df_9.index.names = ['Day', 'Meal']\n", 1778 | "df_9\n", 1779 | "\n", 1780 | "# Get a cross section\n", 1781 | "# This gets me the Day 2 DF\n", 1782 | "df_9.xs('Day 2')\n", 1783 | "\n", 1784 | "# Get calories for the 1st meal for both days by saying what\n", 1785 | "# meal index you want and the Meal column name\n", 1786 | "df_9.xs(1, level='Meal')\n", 1787 | "\n", 1788 | "# Create a MultiIndex out of a DF using a pivot table\n", 1789 | "dict_6 = {'A':['Day 1', 'Day 1', 'Day 1', 'Day 2', 'Day 2', 'Day 2'],\n", 1790 
| " 'B': [1,2,3,1,2,3],\n", 1791 | " 'C': ['M', 'F', 'M', 'F', 'M', 'F'],\n", 1792 | " 'D': [1,2,3,4,5,6]}\n", 1793 | "df_14 = pd.DataFrame(dict_6)\n", 1794 | "# Designate the D column as the data\n", 1795 | "# Make A & B a multilevel index\n", 1796 | "# Column names come from column C\n", 1797 | "# You will have NaNs where data was missing\n", 1798 | "df_14.pivot_table(values='D', index=['A','B'], columns=['C'])" 1799 | ] 1800 | }, 1801 | { 1802 | "cell_type": "markdown", 1803 | "metadata": {}, 1804 | "source": [ 1805 | "### Handling Missing Data" 1806 | ] 1807 | }, 1808 | { 1809 | "cell_type": "code", 1810 | "execution_count": 43, 1811 | "metadata": {}, 1812 | "outputs": [ 1813 | { 1814 | "name": "stdout", 1815 | "output_type": "stream", 1816 | "text": [ 1817 | " A B C\n", 1818 | "0 1.0 4.0 7.0\n", 1819 | "1 2.0 NaN 8.0\n", 1820 | "2 NaN NaN 9.0\n" 1821 | ] 1822 | }, 1823 | { 1824 | "data": { 1825 | "text/html": [ 1826 | "
\n", 1827 | "\n", 1840 | "\n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | "
ABC
01.04.07.0
12.0NaN8.0
2NaNNaN9.0
\n", 1870 | "
" 1871 | ], 1872 | "text/plain": [ 1873 | " A B C\n", 1874 | "0 1.0 4.0 7.0\n", 1875 | "1 2.0 NaN 8.0\n", 1876 | "2 NaN NaN 9.0" 1877 | ] 1878 | }, 1879 | "execution_count": 43, 1880 | "metadata": {}, 1881 | "output_type": "execute_result" 1882 | } 1883 | ], 1884 | "source": [ 1885 | "dict_4 = {'A': [1,2,np.nan], 'B': [4, np.nan, np.nan], 'C': [7.,8.,9.]}\n", 1886 | "df_10 = pd.DataFrame(dict_4)\n", 1887 | "print(df_10)\n", 1888 | "\n", 1889 | "# Drop missing data from DF (Drops any row with missing values)\n", 1890 | "df_10.dropna()\n", 1891 | "\n", 1892 | "# Drop all columns with any missing data\n", 1893 | "df_10.dropna(axis=1)\n", 1894 | "\n", 1895 | "# Drop row unless it has at least 2 non-NaN values\n", 1896 | "df_10.dropna(thresh=2)\n", 1897 | "\n", 1898 | "# Fill NaN values with 0\n", 1899 | "df_10.fillna(value=0.0)\n", 1900 | "\n", 1901 | "# Fill A column with the mean of column\n", 1902 | "df_10['A'].fillna(value=df_10['A'].mean())\n", 1903 | "\n", 1904 | "# Fill with previous value\n", 1905 | "df_10.fillna(method='ffill')\n", 1906 | "\n", 1907 | "# Fill with next value (Only works if there is a next value)\n", 1908 | "df_10.fillna(method='bfill')" 1909 | ] 1910 | }, 1911 | { 1912 | "cell_type": "markdown", 1913 | "metadata": {}, 1914 | "source": [ 1915 | "### Experimenting with Data" 1916 | ] 1917 | }, 1918 | { 1919 | "cell_type": "code", 1920 | "execution_count": 44, 1921 | "metadata": {}, 1922 | "outputs": [ 1923 | { 1924 | "name": "stdout", 1925 | "output_type": "stream", 1926 | "text": [ 1927 | "Index(['Sale ID', 'Contact', 'Sex', 'Age', 'State', 'Product ID',\n", 1928 | " 'Product Type', 'Sale Price', 'Profit', 'Lead', 'Month', 'Year',\n", 1929 | " 'First Name', 'Last Name', 'Age Group'],\n", 1930 | " dtype='object')\n" 1931 | ] 1932 | }, 1933 | { 1934 | "data": { 1935 | "text/plain": [ 1936 | "False 35\n", 1937 | "True 4\n", 1938 | "Name: Profit, dtype: int64" 1939 | ] 1940 | }, 1941 | "execution_count": 44, 1942 | "metadata": {}, 1943 | 
"output_type": "execute_result" 1944 | } 1945 | ], 1946 | "source": [ 1947 | "cs_df.head() # Get 1st 5\n", 1948 | "print(cs_df.columns) # Get column names\n", 1949 | "cs_df['Profit'].mean() # Average profit per item\n", 1950 | "# Get the product with the highest profit\n", 1951 | "cs_df[['Product ID', 'Profit']].max(axis=0).head()\n", 1952 | "# Number of people who purchased from WV\n", 1953 | "cs_df[cs_df['State']=='WV']['State'].count()\n", 1954 | "# Number of purchases in 2019\n", 1955 | "len(cs_df[cs_df['Year']==2019].index)\n", 1956 | "# Get number of sales for each product type\n", 1957 | "cs_df['Product ID'].value_counts()\n", 1958 | "# Get list of customers that bought a specific product\n", 1959 | "cs_df[cs_df['Product ID']=='M01-F0024']['Contact']\n", 1960 | "# How many made a website purchase for a profit over $200\n", 1961 | "cs_df[(cs_df['Lead']=='Website') & (cs_df['Profit']>150)]['Lead'].count()\n", 1962 | "# Find out how many product profit amounts include .89 in cents\n", 1963 | "cs_df['Profit'].apply(lambda cents: str(cents).split('.')[1]=='89').value_counts()\n" 1964 | ] 1965 | }, 1966 | { 1967 | "cell_type": "markdown", 1968 | "metadata": {}, 1969 | "source": [ 1970 | "### Visualization" 1971 | ] 1972 | }, 1973 | { 1974 | "cell_type": "code", 1975 | "execution_count": 99, 1976 | "metadata": {}, 1977 | "outputs": [ 1978 | { 1979 | "data": { 1980 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAVMklEQVR4nO3df7BX9X3n8ec7IF4RDVHRpVwsuIMm6FwRiT83RkKdUqvCZpPKTmyA2LpO2CZxTVpJOqtjxsgkJuaHTSkmKka3VtREs25jLIqdZDZYSBgikIgpKdxIlZISRVER3vvH93D2Ihf4Er7ne7j3Ph8zzvecz/f8eJ+R+b7u+ZxzPicyE0mSAN5WdwGSpEOHoSBJKhkKkqSSoSBJKhkKkqTS4LoLOBjHHXdcjhkzpu4yJKlPWb58+b9l5ojevuvToTBmzBiWLVtWdxmS1KdExL/s7Tu7jyRJJUNBklQyFCRJpT59TUGSmrV9+3a6u7t57bXX6i6lbTo6Oujs7OSwww5reh1DQdKA0N3dzVFHHcWYMWOIiLrLqVxmsnnzZrq7uxk7dmzT69l9JGlAeO211zj22GMHRCAARATHHnvsAZ8ZGQqSBoyBEgi7/DbHayhIkkpeU5A0IN36+LMt3d41F52832WGDRvG1q1by/m77rqLZcuWcdtttzF//nyGDh3Khz/84V7XXbJkCUOGDOG8885rWc29MRQ04LT6x2BvmvmRkHa5+uqr9/n9kiVLGDZsmKGgQ0c7fkz9IdVAdcMNNzBs2DA++clP8tWvfpX58+czePBgxo8fz7x585g/fz6DBg3innvu4Wtf+xrvec97KqljQIeCP3KS2mnbtm1MmDChnP/1r3/NZZddtsdy8+bNY926dRx++OFs2bKF4cOHc/XVV5ehUaUBHQqS1E5HHHEEK1asKOd3XVN4q66uLj70oQ8xffp0pk+f3s4SvftIkg41jz76KHPmzGH58uWceeaZvPnmm23bt6EgSYeQnTt3smHDBiZPnsznP/95tmzZwtatWznqqKN4+eWXK9+/3UeSBqRD9Xrfjh07uOKKK/jNb35DZnLNNdcwfPhwLr30Uj7wgQ/w8MMPe6FZA0e7bheV6tDzGQWAWbNmMWvWLKBx99EuP/jBD/ZY9+STT2blypVVlgfYfSRJ6sFQkCSV7D6qmM9CSOpLKg2FiLgG+BMggZ8Cs4GhwN8BY4BfAn+Umf9eLD8XuBLYAXwsMx+rsj6pUk/eXP0+Js+tfh8aUCrrPoqIUcDHgEmZeRowCJgBXAcszsxxwOJinogYX3x/KjAV+HpEDKqqPknSnqq+pjAYOCIiBtM4Q3gemAYsLL5fCOx6XG8acF9mvp6Z64DngLMqrk+S1ENl3UeZ+auIuAVYD2wDvp+Z34+IEzJzY7HMxog4vlhlFPCjHpvoLtokqfVa3b3XZFded3c3c+bMYfXq1ezcuZNLLrmEL3zhCwwZMmS35S688EJuueUWJk2atFt7z+G2q1Bl99E7aPz1Pxb4HeDIiLhiX6v00pa9bPeqiFgWEcs2bdrUmmIlqQ0yk/e///1Mnz6dtWvX8uyzz7J161Y+85nP1F1aqcruo98D1mXmpszcDjwEnAe8EBEjAYrPF4vlu4HRPdbvpNHdtJvMXJCZkzJz0ogRIyosX5Ja64knnqCjo4PZs2cDMGjQIG699VbuuOMOXnnlFWbMmEFXVxeXX34527ZtK9e78847Ofnkk3nve9/LD3/4w7J90aJFnHbaaZx++ulccMEFLamxyruP1gPnRMRQGt1HU4BlwCvATGBe8flwsfwjwP+KiC/ROLMYBzxdYX2S1FarVq3izDPP3K3t6KOP5sQTT+SLX/wiQ4cOZeXKlaxcuZKJEycCsHHjRq6//nqWL1/O29/+diZPnswZZ5wBwI033shjjz3GqFGj2LJlS0tqrOxMITOXAg8AP6ZxO+rbgAU0wuCiiFgLXFTMk5mrgPu
B1cD3gDmZuaOq+iSp3TKTiD17yjOTp556iiuuaPSwd3V10dXVBcDSpUu58MILGTFiBEOGDOHyyy8v1zv//POZNWsWt99+Ozt2tObnstLnFDLzeuD6tzS/TuOsobflbwJuqrKm/sjxgqS+4dRTT+XBBx/cre2ll15iw4YNHH/88b0GBrDX9vnz57N06VIeffRRJkyYwIoVKzj22GMPqkaHuZCkNpkyZQqvvvoqd999N9AYEfXaa69l1qxZTJ06lXvvvReAZ555phz87uyzz2bJkiVs3ryZ7du3s2jRonJ7v/jFLzj77LO58cYbOe6449iwYcNB1+gwF5IGphqeBo8Ivv3tb/PRj36Uz372s+zcuZOLL76Yz33uc+zcuZPZs2fT1dXFhAkTOOusxmNaI0eO5IYbbuDcc89l5MiRTJw4sewq+tSnPsXatWvJTKZMmcLpp59+0DUaCpLURqNHj+a73/1ur9/dd999vbbPnj27vGOpp4ceeqiltYHdR5KkHgwFSVLJUJA0YGTuMUhCv/bbHK+hIGlA6OjoYPPmzQMmGDKTzZs309HRcUDreaFZ0oDQ2dlJd3c3A2nMtI6ODjo7Ow9oHUNB0oBw2GGHMXbs2LrLOOTZfSRJKhkKkqSSoSBJKhkKkqSSoSBJKhkKkqSSoSBJKhkKkqSSoSBJKvlEswacc9YvaM+OTjq41yJKdfBMQZJUMhQkSSVDQZJUMhQkSSVDQZJUMhQkSSVDQZJUMhQkSSVDQZJU8olmqS978ubq9zF5bvX70CHDMwVJUslQkCSVDAVJUslQkCSVDAVJUslQkCSVDAVJUslQkCSVDAVJUslQkCSVKg2FiBgeEQ9ExM8iYk1EnBsRx0TE4xGxtvh8R4/l50bEcxHx84j4/SprkyTtqeozha8A38vMdwKnA2uA64DFmTkOWFzMExHjgRnAqcBU4OsRMaji+iRJPVQWChFxNHAB8E2AzHwjM7cA04CFxWILgenF9DTgvsx8PTPXAc8BZ1VVnyRpT1WeKZwEbALujIifRMQ3IuJI4ITM3AhQfB5fLD8K2NBj/e6iTZLUJlWGwmBgIvDXmXkG8ApFV9FeRC9tucdCEVdFxLKIWLZp06bWVCpJAqoNhW6gOzOXFvMP0AiJFyJiJEDx+WKP5Uf3WL8TeP6tG83MBZk5KTMnjRgxorLiJWkgqiwUMvNfgQ0RcUrRNAVYDTwCzCzaZgIPF9OPADMi4vCIGAuMA56uqj5J0p6qfvPanwH3RsQQ4J+B2TSC6P6IuBJYD3wQIDNXRcT9NILjTWBOZu6ouD5JUg+VhkJmrgAm9fLVlL0sfxNwU5U1SZL2zieaJUklQ0GSVDIUJEklQ0GSVDIUJEklQ0GSVKr6OQVJfd2TN7dnP5Pntmc/2qemzhQi4rSqC5Ek1a/Z7qP5EfF0RHw0IoZXWpEkqTZNdR9l5n+KiHHAR4BlEfE0cGdmPl5pdRpwzlm/oO4SpAGt6QvNmbkW+EvgL4D3Al8tXrP5/qqKkyS1V7PXFLoi4lYar9N8H3BpZr6rmL61wvokSW3U7N1HtwG3A5/OzG27GjPz+Yj4y0oqkyS1XbOhcDGwbddQ1hHxNqAjM1/NzG9VVp0kqa2avabwD8ARPeaHFm2SpH6k2VDoyMytu2aK6aHVlCRJqkuzofBKREzcNRMRZwLb9rG8JKkPavaawieARRHxfDE/Eri8mpIkSXVp9uG1f4qIdwKnAAH8LDO3V1qZJKntDmRAvHcDY4p1zogIMvPuSqqSJNWiqVCIiG8B/xFYAewomhMwFCSpH2n2TGESMD4zs8piJEn1avbuo2eA/1BlIZKk+jV7pnAcsLoYHfX1XY2ZeVklVUmSatFsKNxQZRGSpENDs7ekPhURvwuMy8x/iIihwKBqS5MktVuzQ2f/KfAA8DdF0yjgO1UVJUmqR7MXmucA5wMvQfnCneOrKkqSVI9mQ+H1zHxj10xEDKbxnIIkqR9pNhSeiohPA0dExEX
AIuC71ZUlSapDs6FwHbAJ+Cnw34D/Q+N9zZKkfqTZu4920ngd5+3VliNJqlOzYx+to5drCJl5UssrkiTV5kDGPtqlA/ggcEzry5E0YD15c/X7mDy3+n30cU1dU8jMzT3++1Vmfhl4X8W1SZLarNnuo4k9Zt9G48zhqEoqkiTVptnuoy/2mH4T+CXwRy2vRpJUq2bvPppcdSE69J2zfkHdJUiqWLPdR/9jX99n5pdaU44kqU4HcvfRu4FHivlLgX8ENlRRlCSpHgfykp2JmfkyQETcACzKzD/Z34oRMQhYBvwqMy+JiGOAvwPGUFybyMx/L5adC1xJ4z3QH8vMxw7oaCRJB6XZYS5OBN7oMf8GjR/1ZnwcWNNj/jpgcWaOAxYX80TEeGAGcCowFfh6ESiSpDZpNhS+BTwdETdExPXAUuDu/a0UEZ3AHwLf6NE8DVhYTC8Epvdovy8zX8/MdcBzwFlN1idJaoFm7z66KSL+HnhP0TQ7M3/SxKpfBv6c3Z9pOCEzNxbb3RgRu97LMAr4UY/luou23UTEVcBVACeeeGIz5UuSmtTsmQLAUOClzPwK0B0RY/e1cERcAryYmcub3H700tbbeEsLMnNSZk4aMWJEk5uWJDWj2VtSr6dxB9IpwJ3AYcA9NN7GtjfnA5dFxMU0xks6OiLuAV6IiJHFWcJI4MVi+W5gdI/1O4HnD+RgJEkHp9kzhf8MXAa8ApCZz7OfYS4yc25mdmbmGBoXkJ/IzCto3NY6s1hsJvBwMf0IMCMiDi/OQsYBTx/AsUiSDlKzt6S+kZkZEQkQEUcexD7nAfdHxJXAehojrpKZqyLifmA1jaE05mTmjoPYjyTpADUbCvdHxN8AwyPiT4GPcAAv3MnMJcCSYnozMGUvy90E3NTsdiVJrbXfUIiIoPGw2TuBl2hcV/ifmfl4xbVJktpsv6FQdBt9JzPPBAwCSerHmr3Q/KOIeHellUiSatfsNYXJwNUR8UsadyAFjZOIrqoKkyS13z5DISJOzMz1wB+0qR5JUo32d6bwHRqjo/5LRDyYmf+lHUVJkuqxv2sKPYeeOKnKQiRJ9dtfKORepiVJ/dD+uo9Oj4iXaJwxHFFMw/+/0Hx0pdVJktpqn6GQmb7kRpIGkAMZOluS1M8ZCpKkkqEgSSoZCpKkkqEgSSoZCpKkkqEgSSoZCpKkkqEgSSoZCpKkUrMv2ZGkvu/Jm9uzn8lz27OfChgK/cA56xfUXYKkfsLuI0lSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSqbJQiIjREfFkRKyJiFUR8fGi/ZiIeDwi1haf7+ixztyIeC4ifh4Rv19VbZKk3lV5pvAmcG1mvgs4B5gTEeOB64DFmTkOWFzMU3w3AzgVmAp8PSIGVVifJOktKguFzNyYmT8upl8G1gCjgGnAwmKxhcD0YnoacF9mvp6Z64DngLOqqk+StKe2XFOIiDHAGcBS4ITM3AiN4ACOLxYbBWzosVp30fbWbV0VEcsiYtmmTZuqLFuSBpzKQyEihgEPAp/IzJf2tWgvbblHQ+aCzJyUmZNGjBjRqjIlSVQcChFxGI1AuDczHyqaX4iIkcX3I4EXi/ZuYHSP1TuB56usT5K0uyrvPgrgm8CazPxSj68eAWYW0zOBh3u0z4iIwyNiLDAOeLqq+iRJexpc4bbPB/4Y+GlErCjaPg3MA+6PiCuB9cAHATJzVUTcD6ymcefSnMzcUWF9kqS3qCwUMvMH9H6dAGDKXta5CbipqpokSfvmE82SpJKhIEkqGQqSpJKhIEkqGQqSpJKhIEkqVfmcgoBz1i+ouwRJapqhIEmt9uTN1e9j8txKNmv3kSSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZCh
IkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpdMiFQkRMjYifR8RzEXFd3fVI0kBySIVCRAwC/gr4A2A88F8jYny9VUnSwHFIhQJwFvBcZv5zZr4B3AdMq7kmSRowBtddwFuMAjb0mO8Gzu65QERcBVxVzG6NiJ+3qbaDdRzwb3UXUaH+fHweW9/Vj4/v0wdzbL+7ty8OtVCIXtpyt5nMBcCC9pTTOhGxLDMn1V1HVfrz8XlsfVd/Pr6qju1Q6z7qBkb3mO8Enq+pFkkacA61UPgnYFxEjI2IIcAM4JGaa5KkAeOQ6j7KzDcj4r8DjwGDgDsyc1XNZbVKn+vyOkD9+fg8tr6rPx9fJccWmbn/pSRJA8Kh1n0kSaqRoSBJKhkKFYuI0RHxZESsiYhVEfHxumtqtYgYFBE/iYj/XXctrRQRwyPigYj4WfH/79y6a2qliLim+Df5TET8bUR01F3Tbysi7oiIFyPimR5tx0TE4xGxtvh8R501Hoy9HN8Xin+bKyPi2xExvBX7MhSq9yZwbWa+CzgHmNMPh+74OLCm7iIq8BXge5n5TuB0+tExRsQo4GPApMw8jcaNHTPqreqg3AVMfUvbdcDizBwHLC7m+6q72PP4HgdOy8wu4Flgbit2ZChULDM3ZuaPi+mXafywjKq3qtaJiE7gD4Fv1F1LK0XE0cAFwDcBMvONzNxSb1UtNxg4IiIGA0Ppw88EZeY/Ar9+S/M0YGExvRCY3taiWqi348vM72fmm8Xsj2g813XQDIU2iogxwBnA0noraakvA38O7Ky7kBY7CdgE3Fl0jX0jIo6su6hWycxfAbcA64GNwG8y8/v1VtVyJ2TmRmj8cQYcX3M9VfoI8Pet2JCh0CYRMQx4EPhEZr5Udz2tEBGXAC9m5vK6a6nAYGAi8NeZeQbwCn27+2E3Rf/6NGAs8DvAkRFxRb1V6bcREZ+h0U19byu2Zyi0QUQcRiMQ7s3Mh+qup4XOBy6LiF/SGNH2fRFxT70ltUw30J2Zu87qHqAREv3F7wHrMnNTZm4HHgLOq7mmVnshIkYCFJ8v1lxPy0XETOAS4EPZoofODIWKRUTQ6Jdek5lfqrueVsrMuZnZmZljaFykfCIz+8Vfm5n5r8CGiDilaJoCrK6xpFZbD5wTEUOLf6NT6EcX0guPADOL6ZnAwzXW0nIRMRX4C+CyzHy1Vds1FKp3PvDHNP6KXlH8d3HdRakpfwbcGxErgQnA52qup2WKM6AHgB8DP6XxW9Bnh4SIiL8F/i9wSkR0R8SVwDzgoohYC1xUzPdJezm+24CjgMeL35X5LdmXw1xIknbxTEGSVDIUJEklQ0GSVDIUJEklQ0GSVDIUJEklQ0GSVPp/1qHH61K7WgoAAAAASUVORK5CYII=\n", 1981 | "text/plain": [ 1982 | "
" 1983 | ] 1984 | }, 1985 | "metadata": { 1986 | "needs_background": "light" 1987 | }, 1988 | "output_type": "display_data" 1989 | } 1990 | ], 1991 | "source": [ 1992 | "# Library usef to create advanced static, animated and\n", 1993 | "# interactive visualizations\n", 1994 | "import matplotlib.pyplot as plt\n", 1995 | "\n", 1996 | "# Displays matplotlib plots in the Notebook\n", 1997 | "%matplotlib inline\n", 1998 | "\n", 1999 | "# Histograms provide an approximation of the distribution of\n", 2000 | "# results. You create them by dividing the range of values into \n", 2001 | "# bins or buckets. Then you count how many of the results fall\n", 2002 | "# into each bin.\n", 2003 | "# Rolls 2 dice 5000 times and charts the frequency and \n", 2004 | "# a histogram\n", 2005 | "\n", 2006 | "# Even though the odds increase as you approach 7 and then\n", 2007 | "# decrease again (1 way to roll a 2 / 6 ways to roll a 7)\n", 2008 | "# over many rolls they are nearly equal.\n", 2009 | "df_dice = pd.DataFrame(\n", 2010 | " np.random.randint(1,7,5000),\n", 2011 | " columns = ['Hist'])\n", 2012 | "df_dice['Odds'] = df_dice['Hist'] + np.random.randint(1,7,5000)\n", 2013 | "# Alpha decreases the opacity in the chart\n", 2014 | "ax = df_dice.plot.hist(bins=12, alpha=0.5)\n", 2015 | "\n", 2016 | "# Basic plot using 1000 random values that create cumulative sums\n", 2017 | "# over an increasing date range\n", 2018 | "ser_5 = pd.Series(np.random.randn(1000),\n", 2019 | " index=pd.date_range('11/15/2017', periods=1000))\n", 2020 | "ser_5 = ser_5.cumsum()\n", 2021 | "# ser_5.plot()\n", 2022 | "\n", 2023 | "# Display 3 random plots\n", 2024 | "df_15 = pd.DataFrame(np.random.randn(1000, 3),\n", 2025 | " index=pd.date_range('11/15/2017', periods=1000),\n", 2026 | " columns=list('ABC'))\n", 2027 | "df_15 = df_15.cumsum()\n", 2028 | "# df_15.plot()\n", 2029 | "\n", 2030 | "# Make bar chart from 5 random values\n", 2031 | "# pd.DataFrame(np.random.randn(5)).plot.bar()\n", 2032 | "\n", 2033 | 
"# Make MultiBar Charts\n", 2034 | "vals = ['A', 'B', 'C', 'D']\n", 2035 | "df_15 = pd.DataFrame(np.random.rand(10,4), columns=vals)\n", 2036 | "# df_15.plot.bar()\n", 2037 | "\n", 2038 | "# Area plot \n", 2039 | "# Define x range and y values\n", 2040 | "x_rng = range(1,15)\n", 2041 | "y_vals = [1,5,4,7,6,9,5,7,10,14,10,12,9,8]\n", 2042 | "# Change fill color and opacity\n", 2043 | "# plt.fill_between(x_rng, y_vals, color=\"skyblue\", alpha=0.5)\n", 2044 | "# plt.show()\n", 2045 | "\n", 2046 | "# Area plot with multiple areas\n", 2047 | "# pd.DataFrame(np.random.rand(10,3), columns=['A','B','C']).plot.area()\n", 2048 | "\n", 2049 | "# Create a scatterplot with 100 random values\n", 2050 | "# pd.DataFrame(np.random.rand(100,2), \n", 2051 | "# columns=['A','B']).plot.scatter(x='A', y='B')\n", 2052 | "\n", 2053 | "# Multiple column scatter plots\n", 2054 | "df_15 = pd.DataFrame(np.random.rand(50,4), columns=['A','B','C','D'])\n", 2055 | "# ax = df_15.plot.scatter(x='A', y='B', color='DarkBlue', label='Grp 1')\n", 2056 | "# df_15.plot.scatter(x='C', y='D', color='Orange', label='Grp 2', ax=ax)\n", 2057 | "\n", 2058 | "# Pie Charts with 4 random values\n", 2059 | "# pd.Series(np.random.rand(4),\n", 2060 | "# index=['a','b','c','d'], \n", 2061 | "# name='Pie').plot.pie(figsize=(6,6))" 2062 | ] 2063 | } 2064 | ], 2065 | "metadata": { 2066 | "kernelspec": { 2067 | "display_name": "Python 3", 2068 | "language": "python", 2069 | "name": "python3" 2070 | }, 2071 | "language_info": { 2072 | "codemirror_mode": { 2073 | "name": "ipython", 2074 | "version": 3 2075 | }, 2076 | "file_extension": ".py", 2077 | "mimetype": "text/x-python", 2078 | "name": "python", 2079 | "nbconvert_exporter": "python", 2080 | "pygments_lexer": "ipython3", 2081 | "version": "3.7.7" 2082 | } 2083 | }, 2084 | "nbformat": 4, 2085 | "nbformat_minor": 4 2086 | } 2087 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # pandas-tutorial 2 | This is the cheat sheet Jupyter Notebook I made for my Pandas Learn in One Video Tutorial. I basically condensed the Pandas API down into this one cheat sheet with hundreds of examples. I hope you find it useful. 3 | -------------------------------------------------------------------------------- /icecreamsales.csv: -------------------------------------------------------------------------------- 1 | Temperature,Sales 2 | 37,292 3 | 40,228 4 | 49,324 5 | 61,376 6 | 72,440 7 | 79,496 8 | 83,536 9 | 81,556 10 | 75,496 11 | 64,412 12 | 53,324 13 | 40,320 --------------------------------------------------------------------------------