├── ComputerSales.csv
├── Financial Sample.xlsx
├── Pandas Tutorial.ipynb
├── README.md
└── icecreamsales.csv


/ComputerSales.csv:
--------------------------------------------------------------------------------
 1 | Sale ID,Contact,Sex,Age,State,Product ID,Product Type,Sale Price,Profit,Lead,Month,Year
 2 | 1,Paul Thomas,M,43,OH,M01-F0024,Desktop,479.99,143.39,Website,January,2018
 3 | 2,Margo Simms,F,37,WV,GT13-0024,Desktop,1249.99,230.89,Flyer 4,January,2018
 4 | 3,Sam Stine,M,26,PA,I3670,Desktop,649.99,118.64,Website,February,2018
 5 | 4,Moe Eggert,M,35,PA,I3593,Laptop,399.99,72.09,Website,March,2018
 6 | 5,Jessica Elk,F,55,PA,15M-ED,Laptop,699.99,98.09,Flyer 4,March,2018
 7 | 6,Sally Struthers,F,45,PA,GT13-0024,Desktop,1249.99,230.89,Flyer 2,April,2018
 8 | 7,Michelle Samms,F,46,OH,GA401IV,Laptop,1349.99,180.34,Email,May,2018
 9 | 8,Mick Roberts,M,23,OH,MY2J2LL,Tablet,999.99,146.69,Website,July,2018
10 | 9,Ed Klondike,M,52,OH,81TC00,Laptop,649.99,122.34,Email,July,2018
11 | 10,Phil Jones,M,56,WV,M01-F0024,Desktop,479.99,143.39,Flyer 2,August,2018
12 | 11,Rick James,M,49,PA,GA401IV,Laptop,1349.99,180.34,Flyer 3,November,2018
13 | 12,Sue Etna,F,54,OH,GT13-0024,Desktop,1249.99,230.89,Flyer 2,November,2018
14 | 13,Jason Case,M,57,PA,81TC00,Laptop,649.99,122.34,Email,November,2018
15 | 14,Doug Johnson,M,51,PA,I3670,Desktop,649.99,118.64,Website,December,2018
16 | 15,Andy Sands,M,56,OH,MY2J2LL,Tablet,999.99,146.69,Flyer 1,December,2018
17 | 16,Kim Collins,F,49,PA,I3593,Laptop,399.99,72.09,Flyer 2,January,2019
18 | 17,Edna Sanders,F,46,OH,15M-ED,Laptop,699.99,98.09,Email,February,2019
19 | 18,Michelle Samms,F,46,NY,MY2J2LL,Tablet,999.99,146.69,Website,March,2019
20 | 19,Mick Roberts,M,23,PA,I3593,Laptop,399.99,72.09,Flyer 4,March,2019
21 | 20,Sally Struthers,F,45,NY,81TC00,Laptop,649.99,122.34,Website,April,2019
22 | 21,Jason Case,M,57,PA,M01-F0024,Desktop,479.99,143.39,Flyer 4,May,2019
23 | 22,Doug Johnson,M,51,PA,GA401IV,Laptop,1349.99,180.34,Website,August,2019
24 | 23,Paul Thomas,M,43,OH,81TC00,Laptop,649.99,122.34,Website,August,2019
25 | 24,Margo Simms,F,37,WV,Q526FA,Laptop,1049.99,143.09,Flyer 4,November,2019
26 | 25,Michelle Samms,F,46,NY,I3670,Desktop,649.99,118.64,Flyer 2,November,2019
27 | 26,Mick Roberts,M,23,PA,Q526FA,Laptop,1049.99,143.09,Email,November,2019
28 | 27,Ed Klondike,M,52,OH,Q526FA,Laptop,1049.99,143.09,Website,December,2019
29 | 28,Moe Eggert,M,35,PA,15M-ED,Laptop,699.99,98.09,Email,December,2019
30 | 29,Jessica Elk,F,55,PA,GA401IV,Laptop,1349.99,180.34,Flyer 2,December,2019
31 | 30,Phil Jones,M,56,WV,M01-F0024,Desktop,479.99,143.39,Flyer 2,January,2020
32 | 31,Rick James,M,49,PA,GA401IV,Laptop,1349.99,180.34,Flyer 1,January,2020
33 | 32,Sue Etna,F,54,OH,GT13-0024,Desktop,1249.99,230.89,Flyer 2,February,2020
34 | 33,Kim Collins,F,49,PA,I3593,Laptop,399.99,72.09,Flyer 2,March,2020
35 | 34,Edna Sanders,F,46,OH,15M-ED,Laptop,699.99,98.09,Email,March,2020
36 | 35,Michelle Samms,F,46,NY,MY2J2LL,Tablet,999.99,146.69,Website,April,2020
37 | 36,Sally Struthers,F,45,NY,81TC00,Laptop,649.99,122.34,Website,April,2020
38 | 37,Jason Case,M,57,PA,M01-F0024,Desktop,479.99,143.39,Flyer 4,April,2020
39 | 38,Doug Johnson,M,51,PA,GA401IV,Laptop,1349.99,180.34,Website,May,2020
40 | 39,Moe Eggert,M,35,PA,I3593,Laptop,399.99,72.09,Website,May,2020


--------------------------------------------------------------------------------
/Financial Sample.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/derekbanas/pandas-tutorial/9338573fd6b203a985121d3dc6bc0f03101f5530/Financial Sample.xlsx


--------------------------------------------------------------------------------
/Pandas Tutorial.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "## Pandas Tutorial"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "Pandas provides numerous tools to work with tabular data like you'd find in spreadsheets or databases. It is widely used for data preparation, cleaning, and analysis. It can work with a wide variety of data and provides many visualization options. It is built on top of NumPy."
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "markdown",
  19 |    "metadata": {},
  20 |    "source": [
  21 |     "### Series"
  22 |    ]
  23 |   },
  24 |   {
  25 |    "cell_type": "code",
  26 |    "execution_count": 24,
  27 |    "metadata": {},
  28 |    "outputs": [
  29 |     {
  30 |      "data": {
  31 |       "text/plain": [
  32 |        "'rand_nums'"
  33 |       ]
  34 |      },
  35 |      "execution_count": 24,
  36 |      "metadata": {},
  37 |      "output_type": "execute_result"
  38 |     }
  39 |    ],
  40 |    "source": [
  41 |     "import numpy as np\n",
  42 |     "import pandas as pd\n",
  43 |     "\n",
  44 |     "# Pandas uses something called a dataframe. It is a \n",
  45 |     "# 2D data structure that can hold multiple data types.\n",
  46 |     "# Columns have labels.\n",
  47 |     "\n",
  48 |     "# Series are built on top of NumPy arrays. \n",
  49 |     "# Create a series by first creating a list\n",
  50 |     "list_1 = ['a', 'b', 'c', 'd']\n",
  51 |     "# I can define that I want the series indexes to be the\n",
  52 |     "# provided labels\n",
  53 |     "labels = [1, 2, 3, 4]\n",
  54 |     "ser_1 = pd.Series(data=list_1, index=labels)\n",
  55 |     "\n",
  56 |     "# You can also add a NumPy array\n",
  57 |     "arr_1 = np.array([1, 2, 3, 4])\n",
  58 |     "ser_2 = pd.Series(arr_1)\n",
  59 |     "\n",
  60 |     "# You can quickly add labels and values with a dictionary\n",
  61 |     "dict_1 = {\"f_name\": \"Derek\", \n",
  62 |     "              \"l_name\": \"Banas\", \n",
  63 |     "              \"age\": 44}\n",
  64 |     "ser_3 = pd.Series(dict_1)\n",
  65 |     "\n",
  66 |     "# Get data by label\n",
  67 |     "ser_3[\"f_name\"]\n",
  68 |     "\n",
  69 |     "# You can get the datatype\n",
  70 |     "ser_2.dtype\n",
  71 |     "\n",
  72 |     "# You can perform math operations on series\n",
  73 |     "ser_2 + ser_2\n",
  74 |     "ser_2 - ser_2\n",
  75 |     "ser_2 * ser_2\n",
  76 |     "ser_2 / ser_2\n",
  77 |     "\n",
  78 |     "# You can pass them into NumPy methods\n",
  79 |     "# See NumPy tutorial for more math methods\n",
  80 |     "np.exp(ser_2)\n",
  81 |     "\n",
  82 |     "# The difference between Series and ndarray is that operations\n",
  83 |     "# align by labels\n",
  84 |     "# Create a series from a dictionary\n",
  85 |     "ser_4 = pd.Series({4: 5, 5: 6, 6: 7, 7: 8})\n",
  86 |     "# If labels don't align you will get NaN\n",
  87 |     "ser_2 + ser_4\n",
  88 |     "\n",
  89 |     "# You can assign names to series\n",
  90 |     "ser_4 = pd.Series({8: 9, 9: 10}, name='rand_nums')\n",
  91 |     "ser_4.name\n"
  92 |    ]
  93 |   },
  94 |   {
  95 |    "cell_type": "markdown",
  96 |    "metadata": {},
  97 |    "source": [
  98 |     "### DataFrames"
  99 |    ]
 100 |   },
 101 |   {
 102 |    "cell_type": "markdown",
 103 |    "metadata": {},
 104 |    "source": [
 105 |     "DataFrames are the most commonly used data structure with Pandas. They are made up of multiple series that share the same index / label. They can contain multiple data types. They can be created from dicts, series, lists or other dataframes. "
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "markdown",
 110 |    "metadata": {},
 111 |    "source": [
 112 |     "### Creating DataFrames"
 113 |    ]
 114 |   },
 115 |   {
 116 |    "cell_type": "code",
 117 |    "execution_count": 25,
 118 |    "metadata": {},
 119 |    "outputs": [
 120 |     {
 121 |      "name": "stdout",
 122 |      "output_type": "stream",
 123 |      "text": [
 124 |       "(2, 3)\n"
 125 |      ]
 126 |     }
 127 |    ],
 128 |    "source": [
 129 |     "from numpy import random\n",
 130 |     "\n",
 131 |     "# Create random matrix 2x3 with values between 10 and 50\n",
 132 |     "arr_2 = np.random.randint(10, 50, size=(2, 3))\n",
 133 |     "\n",
 134 |     "# Create DF with data, row labels & column labels\n",
 135 |     "df_1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])\n",
 136 |     "\n",
 137 |     "# Create a DF from multiple series in a dict\n",
 138 |     "# If series are of different lengths, extra spaces are NaN\n",
 139 |     "dict_3 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),\n",
 140 |     "         'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}\n",
 141 |     "df_2 = pd.DataFrame(dict_3)\n",
 142 |     "df_2\n",
 143 |     "\n",
 144 |     "# from_dict accepts column labels and lists\n",
 145 |     "pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]))\n",
 146 |     "\n",
 147 |     "# You can assign the keys as row labels and column labels separately\n",
 148 |     "# with orient='index'\n",
 149 |     "pd.DataFrame.from_dict(dict([('A', [1,2,3]), ('B', [4,5,6])]),\n",
 150 |     "                      orient='index', columns=['one','two','three'])\n",
 151 |     "\n",
 152 |     "# Get number of rows and columns as tuple\n",
 153 |     "print(df_1.shape)"
 154 |    ]
 155 |   },
 156 |   {
 157 |    "cell_type": "markdown",
 158 |    "metadata": {},
 159 |    "source": [
 160 |     "### Editing & Retrieving Data"
 161 |    ]
 162 |   },
 163 |   {
 164 |    "cell_type": "code",
 165 |    "execution_count": 26,
 166 |    "metadata": {},
 167 |    "outputs": [
 168 |     {
 169 |      "name": "stdout",
 170 |      "output_type": "stream",
 171 |      "text": [
 172 |       "    D   E\n",
 173 |       "A  23  23\n",
 174 |       "B  34  49\n"
 175 |      ]
 176 |     },
 177 |     {
 178 |      "data": {
 179 |       "text/html": [
 180 |        "<div>\n",
 181 |        "<style scoped>\n",
 182 |        "    .dataframe tbody tr th:only-of-type {\n",
 183 |        "        vertical-align: middle;\n",
 184 |        "    }\n",
 185 |        "\n",
 186 |        "    .dataframe tbody tr th {\n",
 187 |        "        vertical-align: top;\n",
 188 |        "    }\n",
 189 |        "\n",
 190 |        "    .dataframe thead th {\n",
 191 |        "        text-align: right;\n",
 192 |        "    }\n",
 193 |        "</style>\n",
 194 |        "<table border=\"1\" class=\"dataframe\">\n",
 195 |        "  <thead>\n",
 196 |        "    <tr style=\"text-align: right;\">\n",
 197 |        "      <th></th>\n",
 198 |        "      <th>A</th>\n",
 199 |        "    </tr>\n",
 200 |        "  </thead>\n",
 201 |        "  <tbody>\n",
 202 |        "    <tr>\n",
 203 |        "      <th>0</th>\n",
 204 |        "      <td>1.0</td>\n",
 205 |        "    </tr>\n",
 206 |        "    <tr>\n",
 207 |        "      <th>1</th>\n",
 208 |        "      <td>9.0</td>\n",
 209 |        "    </tr>\n",
 210 |        "    <tr>\n",
 211 |        "      <th>2</th>\n",
 212 |        "      <td>3.0</td>\n",
 213 |        "    </tr>\n",
 214 |        "    <tr>\n",
 215 |        "      <th>3</th>\n",
 216 |        "      <td>4.0</td>\n",
 217 |        "    </tr>\n",
 218 |        "  </tbody>\n",
 219 |        "</table>\n",
 220 |        "</div>"
 221 |       ],
 222 |       "text/plain": [
 223 |        "     A\n",
 224 |        "0  1.0\n",
 225 |        "1  9.0\n",
 226 |        "2  3.0\n",
 227 |        "3  4.0"
 228 |       ]
 229 |      },
 230 |      "execution_count": 26,
 231 |      "metadata": {},
 232 |      "output_type": "execute_result"
 233 |     }
 234 |    ],
 235 |    "source": [
 236 |     "# Grab a column\n",
 237 |     "df_1['C']\n",
 238 |     "# Get multiple columns\n",
 239 |     "df_1[['C', 'E']]\n",
 240 |     "\n",
 241 |     "# Grab a row as a series\n",
 242 |     "df_1.loc['A']\n",
 243 |     "# Grab row by index position\n",
 244 |     "df_1.iloc[1]\n",
 245 |     "\n",
 246 |     "# Grab cell with Row & Column\n",
 247 |     "df_1.loc['A', 'C']\n",
 248 |     "# Grab multiple cells by defining rows wanted & the\n",
 249 |     "# columns from those rows\n",
 250 |     "print(df_1.loc[['A', 'B'], ['D', 'E']])\n",
 251 |     "\n",
 252 |     "# Make new column\n",
 253 |     "df_1['Total'] = df_1['C'] + df_1['D'] + df_1['E']\n",
 254 |     "df_1\n",
 255 |     "\n",
 256 |     "# You can perform multiple calculations\n",
 257 |     "df_2['mult'] = df_2['one'] * df_2['two']\n",
 258 |     "df_2\n",
 259 |     "\n",
 260 |     "# Make a new row by appending\n",
 261 |     "dict_2 = {'C': 44, 'D': 45, 'E': 46}\n",
 262 |     "new_row = pd.Series(dict_2, name='F')\n",
 263 |     "df_1 = df_1.append(new_row)\n",
 264 |     "\n",
 265 |     "# Delete column and set inplace to True which is required\n",
 266 |     "# because Pandas tries to help you not delete data\n",
 267 |     "# by accident\n",
 268 |     "df_1.drop('Total', axis=1, inplace=True)\n",
 269 |     "df_1\n",
 270 |     "# Delete a row\n",
 271 |     "df_1.drop('B', axis=0, inplace=True)\n",
 272 |     "df_1\n",
 273 |     "\n",
 274 |     "# Create a new column and make it the index\n",
 275 |     "df_1['Sex'] = ['Men', 'Women']\n",
 276 |     "df_1.set_index('Sex', inplace=True)\n",
 277 |     "\n",
 278 |     "# You can reset index values to numbers\n",
 279 |     "#df_1.reset_index(inplace=True)\n",
 280 |     "df_1\n",
 281 |     "\n",
 282 |     "# Assign can be used to create a column while leaving the\n",
 283 |     "# original DF untouched\n",
 284 |     "df_2.assign(div=df_2['one'] / df_2['two'])\n",
 285 |     "\n",
 286 |     "# You can pass in a function as well\n",
 287 |     "df_2.assign(div=lambda x: (x['one'] / x['two']))\n",
 288 |     "\n",
 289 |     "# Combine DataFrames while keeping df_3 data unless\n",
 290 |     "# there is a NaN value\n",
 291 |     "df_3 = pd.DataFrame({'A': [1., np.nan, 3., np.nan]})\n",
 292 |     "df_4 = pd.DataFrame({'A': [8., 9., 2., 4.]})\n",
 293 |     "df_3.combine_first(df_4)"
 294 |    ]
 295 |   },
 296 |   {
 297 |    "cell_type": "markdown",
 298 |    "metadata": {},
 299 |    "source": [
 300 |     "### Conditional Selection"
 301 |    ]
 302 |   },
 303 |   {
 304 |    "cell_type": "code",
 305 |    "execution_count": 27,
 306 |    "metadata": {},
 307 |    "outputs": [
 308 |     {
 309 |      "name": "stdout",
 310 |      "output_type": "stream",
 311 |      "text": [
 312 |       "    C   D   E\n",
 313 |       "A  19  38  16\n",
 314 |       "B  17  14  13\n",
 315 |       "Greater than 40\n",
 316 |       "        C      D      E\n",
 317 |       "A  False  False  False\n",
 318 |       "B  False  False  False\n",
 319 |       "Greater than 45\n",
 320 |       "        C      D      E\n",
 321 |       "A  False  False  False\n",
 322 |       "B  False  False  False\n",
 323 |       "Series([], Name: C, dtype: int64)\n",
 324 |       "\n",
 325 |       "Empty DataFrame\n",
 326 |       "Columns: [C, D]\n",
 327 |       "Index: []\n",
 328 |       "\n",
 329 |       "   X  Y  Z\n",
 330 |       "A  1  2  3\n",
 331 |       "B  4  5  6\n",
 332 |       "C  7  8  9 \n",
 333 |       "\n"
 334 |      ]
 335 |     },
 336 |     {
 337 |      "data": {
 338 |       "text/html": [
 339 |        "<div>\n",
 340 |        "<style scoped>\n",
 341 |        "    .dataframe tbody tr th:only-of-type {\n",
 342 |        "        vertical-align: middle;\n",
 343 |        "    }\n",
 344 |        "\n",
 345 |        "    .dataframe tbody tr th {\n",
 346 |        "        vertical-align: top;\n",
 347 |        "    }\n",
 348 |        "\n",
 349 |        "    .dataframe thead th {\n",
 350 |        "        text-align: right;\n",
 351 |        "    }\n",
 352 |        "</style>\n",
 353 |        "<table border=\"1\" class=\"dataframe\">\n",
 354 |        "  <thead>\n",
 355 |        "    <tr style=\"text-align: right;\">\n",
 356 |        "      <th></th>\n",
 357 |        "      <th>X</th>\n",
 358 |        "      <th>Y</th>\n",
 359 |        "      <th>Z</th>\n",
 360 |        "    </tr>\n",
 361 |        "  </thead>\n",
 362 |        "  <tbody>\n",
 363 |        "    <tr>\n",
 364 |        "      <th>B</th>\n",
 365 |        "      <td>4</td>\n",
 366 |        "      <td>5</td>\n",
 367 |        "      <td>6</td>\n",
 368 |        "    </tr>\n",
 369 |        "  </tbody>\n",
 370 |        "</table>\n",
 371 |        "</div>"
 372 |       ],
 373 |       "text/plain": [
 374 |        "   X  Y  Z\n",
 375 |        "B  4  5  6"
 376 |       ]
 377 |      },
 378 |      "execution_count": 27,
 379 |      "metadata": {},
 380 |      "output_type": "execute_result"
 381 |     }
 382 |    ],
 383 |    "source": [
 384 |     "arr_2 = np.random.randint(10, 50, size=(2, 3))\n",
 385 |     "df_1 = pd.DataFrame(arr_2, ['A', 'B'], ['C', 'D', 'E'])\n",
 386 |     "print(df_1)\n",
 387 |     "\n",
 388 |     "# You can use conditional operators to retrieve a table\n",
 389 |     "# based on the condition\n",
 390 |     "print(\"Greater than 40\\n\", df_1 > 40.0)\n",
 391 |     "\n",
 392 |     "# You can use comparison operator functions as well like\n",
 393 |     "# gt, lt, ge, le, eq, ne\n",
 394 |     "print(\"Greater than 45\\n\", df_1.gt(45.0))\n",
 395 |     "\n",
 396 |     "# You can place conditions in brackets as well\n",
 397 |     "bool_1 = df_1 >= 45.0\n",
 398 |     "df_1[bool_1]\n",
 399 |     "\n",
 400 |     "# Get bools for a column\n",
 401 |     "df_1['E'] > 40\n",
 402 |     "\n",
 403 |     "# Return a row if cell value in column matches a condition\n",
 404 |     "df_1[df_1['E']>30]\n",
 405 |     "\n",
 406 |     "# You can focus on a column based on resulting dataframe\n",
 407 |     "df_2 = df_1[df_1['E']>30]\n",
 408 |     "df_2['C']\n",
 409 |     "\n",
 410 |     "# You can stack these commands\n",
 411 |     "print(df_1[df_1['E']>20]['C'])\n",
 412 |     "print()\n",
 413 |     "\n",
 414 |     "# You can also grab multiple columns\n",
 415 |     "print(df_1[df_1['E']>20][['C', 'D']])\n",
 416 |     "print()\n",
 417 |     "\n",
 418 |     "# You can use multiple conditions\n",
 419 |     "arr_3 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n",
 420 |     "df_2 = pd.DataFrame(arr_3, ['A', 'B', 'C'], ['X', 'Y', 'Z'])\n",
 421 |     "print(df_2, \"\\n\")\n",
 422 |     "# You can use or | to combine conditions as well\n",
 423 |     "df_2[(df_2['X']>3) & (df_2['X']<7)]\n",
 424 |     "\n"
 425 |    ]
 426 |   },
 427 |   {
 428 |    "cell_type": "markdown",
 429 |    "metadata": {},
 430 |    "source": [
 431 |     "### File Input / Output "
 432 |    ]
 433 |   },
 434 |   {
 435 |    "cell_type": "markdown",
 436 |    "metadata": {},
 437 |    "source": [
 438 |     "Pandas can work with the following types of data: CSV, Plain Text, JSON, XML, PDF, SQL, HTML, XLSX, DOCX, ZIP, Images, Hierarchical Data Format, MP3, and MP4."
 439 |    ]
 440 |   },
 441 |   {
 442 |    "cell_type": "code",
 443 |    "execution_count": 28,
 444 |    "metadata": {},
 445 |    "outputs": [
 446 |     {
 447 |      "name": "stdout",
 448 |      "output_type": "stream",
 449 |      "text": [
 450 |       "    student_id first_name last_name             email           street  \\\n",
 451 |       "0            1       Dale    Cooper   dcooper@aol.com      123 Main St   \n",
 452 |       "1            2      Harry    Truman   htruman@aol.com     202 South St   \n",
 453 |       "2            3     Shelly   Johnson  sjohnson@aol.com        9 Pond Rd   \n",
 454 |       "3            4      Bobby    Briggs   bbriggs@aol.com       14 12th St   \n",
 455 |       "4            5      Donna   Hayward  dhayward@aol.com      120 16th St   \n",
 456 |       "5            6     Audrey     Horne    ahorne@aol.com      342 19th St   \n",
 457 |       "6            7      James    Hurley   jhurley@aol.com    2578 Cliff St   \n",
 458 |       "7            8       Lucy     Moran    lmoran@aol.com     178 Dover St   \n",
 459 |       "8            9      Tommy      Hill     thill@aol.com  672 High Plains   \n",
 460 |       "9           10       Andy   Brennan  abrennan@aol.com       281 4th St   \n",
 461 |       "10          13      Frank     Silva    fsilva@aol.com      666 Hell St   \n",
 462 |       "11          14      Frank     Silva    fsilva@aol.com      666 Hell St   \n",
 463 |       "12          15      Frank     Silva    fsilva@aol.com      666 Hell St   \n",
 464 |       "13          16      Frank     Silva    fsilva@aol.com      666 Hell St   \n",
 465 |       "\n",
 466 |       "            city state    zip         phone  birth_date sex  \\\n",
 467 |       "0         Yakima    WA  98901  792-223-8901  1959-02-22   M   \n",
 468 |       "1      Vancouver    WA  98660  792-223-9810  1946-01-24   M   \n",
 469 |       "2         Sparks    NV  89431  792-223-6734  1970-12-12   F   \n",
 470 |       "3      San Diego    CA  92101  792-223-6178  1967-05-24   M   \n",
 471 |       "4      Davenport    IA  52801  792-223-2001  1970-03-24   F   \n",
 472 |       "5        Detroit    MI  48222  792-223-2001  1965-02-01   F   \n",
 473 |       "6         Queens    NY  11427  792-223-1890  1967-01-02   M   \n",
 474 |       "7      Hollywood    CA  90078  792-223-9678  1954-11-27   F   \n",
 475 |       "8         Tucson    AZ  85701  792-223-1115  1951-12-21   M   \n",
 476 |       "9   Jacksonville    NC  28540  792-223-8902  1960-12-27   M   \n",
 477 |       "10        Yakima    WA  98901  792-223-8966  1959-02-22   M   \n",
 478 |       "11        Yakima    WA  98901  792-223-8966  1959-02-22   M   \n",
 479 |       "12        Yakima    WA  98901  792-223-8966  1959-02-22   M   \n",
 480 |       "13        Yakima    WA  98901  792-223-8966  1959-02-22   M   \n",
 481 |       "\n",
 482 |       "          date_entered  lunch_cost  \n",
 483 |       "0  2019-12-10 13:09:03         3.5  \n",
 484 |       "1  2019-12-10 13:19:12         3.5  \n",
 485 |       "2  2019-12-10 13:19:12         3.5  \n",
 486 |       "3  2019-12-10 13:19:12         3.5  \n",
 487 |       "4  2019-12-10 13:19:12         3.5  \n",
 488 |       "5  2019-12-10 13:19:12         3.5  \n",
 489 |       "6  2019-12-10 13:19:12         3.5  \n",
 490 |       "7  2019-12-10 13:19:12         3.5  \n",
 491 |       "8  2019-12-10 13:19:12         3.5  \n",
 492 |       "9  2019-12-10 13:19:12         3.5  \n",
 493 |       "10 2020-08-09 13:42:56         3.5  \n",
 494 |       "11 2020-08-11 09:54:40         3.5  \n",
 495 |       "12 2020-08-12 16:43:43         3.5  \n",
 496 |       "13 2020-08-12 16:54:12         3.5  \n"
 497 |      ]
 498 |     },
 499 |     {
 500 |      "data": {
 501 |       "text/plain": [
 502 |        "0     OH\n",
 503 |        "1     WV\n",
 504 |        "2     PA\n",
 505 |        "3     PA\n",
 506 |        "4     PA\n",
 507 |        "5     PA\n",
 508 |        "6     OH\n",
 509 |        "7     OH\n",
 510 |        "8     OH\n",
 511 |        "9     WV\n",
 512 |        "10    PA\n",
 513 |        "11    OH\n",
 514 |        "12    PA\n",
 515 |        "13    PA\n",
 516 |        "14    OH\n",
 517 |        "15    PA\n",
 518 |        "16    OH\n",
 519 |        "17    NY\n",
 520 |        "18    PA\n",
 521 |        "19    NY\n",
 522 |        "20    PA\n",
 523 |        "21    PA\n",
 524 |        "22    OH\n",
 525 |        "23    WV\n",
 526 |        "24    NY\n",
 527 |        "25    PA\n",
 528 |        "26    OH\n",
 529 |        "27    PA\n",
 530 |        "28    PA\n",
 531 |        "29    WV\n",
 532 |        "30    PA\n",
 533 |        "31    OH\n",
 534 |        "32    PA\n",
 535 |        "33    OH\n",
 536 |        "34    NY\n",
 537 |        "35    NY\n",
 538 |        "36    PA\n",
 539 |        "37    PA\n",
 540 |        "38    PA\n",
 541 |        "Name: State, dtype: object"
 542 |       ]
 543 |      },
 544 |      "execution_count": 28,
 545 |      "metadata": {},
 546 |      "output_type": "execute_result"
 547 |     }
 548 |    ],
 549 |    "source": [
 550 |     "import pymysql\n",
 551 |     "\n",
 552 |     "# Read a CSV file\n",
 553 |     "# Type pd.read_ [TAB] to see the file types you can read\n",
 554 |     "cs_df = pd.read_csv('ComputerSales.csv')\n",
 555 |     "\n",
 556 |     "# Save a CSV file, but don't save the index as a column\n",
 557 |     "cs_df.to_csv('ComputerSalesBU.csv', index=False)\n",
 558 |     "\n",
 559 |     "# You can read data from Excel, but not formulas and macros\n",
 560 |     "pd.read_excel('Financial Sample.xlsx',0)\n",
 561 |     "\n",
 562 |     "# Write to Excel\n",
 563 |     "cs_df.to_excel('ComputerSales.xlsx')\n",
 564 |     "\n",
 565 |     "# Check if written\n",
 566 |     "pd.read_excel('ComputerSales.xlsx',0)\n",
 567 |     "\n",
 568 |     "# Read from MySQL Database\n",
 569 |     "try:\n",
 570 |     "    db_connection = pymysql.connect(db='students', user='studentadmin', passwd='TurtleDove', host='localhost', port=3306)\n",
 571 |     "\n",
 572 |     "    stud_df = pd.read_sql('SELECT * FROM students', con=db_connection)\n",
 573 |     "    # print(stud_df)\n",
 574 |     "except Exception as e:\n",
 575 |     "    print(\"Exception : {}\".format(e))\n",
 576 |     "finally:\n",
 577 |     "    db_connection.close()\n",
 578 |     "    \n",
 579 |     "\n",
 580 |     "# Write to table \n",
 581 |     "try:\n",
 582 |     "    db_connection = pymysql.connect(db='students', user='studentadmin', passwd='TurtleDove', host='localhost', port=3306)\n",
 583 |     "    # Used to issue queries\n",
 584 |     "    cursor = db_connection.cursor()\n",
 585 |     "    # Query to enter new student\n",
 586 |     "    insert_stmt = \"INSERT INTO students VALUES(NULL, 'Frank', 'Silva', 'fsilva@aol.com', '666 Hell St', 'Yakima', 'WA', 98901, '792-223-8966', '1959-2-22', 'M', NOW(), 3.50)\"\n",
 587 |     "    # Execute query\n",
 588 |     "    cursor.execute(insert_stmt)\n",
 589 |     "    # Commit changes to DB\n",
 590 |     "    db_connection.commit()\n",
 591 |     "    stud_df = pd.read_sql('SELECT * FROM students', con=db_connection)\n",
 592 |     "    print(stud_df)\n",
 593 |     "except Exception as e:\n",
 594 |     "    print(\"Exception : {}\".format(e))\n",
 595 |     "finally:\n",
 596 |     "    db_connection.close()\n",
 597 |     "\n",
 598 |     "# Just get 1 column of data \n",
 599 |     "cs_df_st = pd.read_csv('ComputerSales.csv', usecols=[\"State\"], squeeze=True)\n",
 600 |     "cs_df_st\n"
 601 |    ]
 602 |   },
 603 |   {
 604 |    "cell_type": "markdown",
 605 |    "metadata": {},
 606 |    "source": [
 607 |     "### Basics & Math"
 608 |    ]
 609 |   },
 610 |   {
 611 |    "cell_type": "code",
 612 |    "execution_count": 29,
 613 |    "metadata": {},
 614 |    "outputs": [
 615 |     {
 616 |      "name": "stdout",
 617 |      "output_type": "stream",
 618 |      "text": [
 619 |       "   one  two\n",
 620 |       "a  1.0  1.0\n",
 621 |       "b  2.0  2.0\n",
 622 |       "c  3.0  3.0\n",
 623 |       "d  0.0  4.0\n"
 624 |      ]
 625 |     },
 626 |     {
 627 |      "data": {
 628 |       "text/html": [
 629 |        "<div>\n",
 630 |        "<style scoped>\n",
 631 |        "    .dataframe tbody tr th:only-of-type {\n",
 632 |        "        vertical-align: middle;\n",
 633 |        "    }\n",
 634 |        "\n",
 635 |        "    .dataframe tbody tr th {\n",
 636 |        "        vertical-align: top;\n",
 637 |        "    }\n",
 638 |        "\n",
 639 |        "    .dataframe thead th {\n",
 640 |        "        text-align: right;\n",
 641 |        "    }\n",
 642 |        "</style>\n",
 643 |        "<table border=\"1\" class=\"dataframe\">\n",
 644 |        "  <thead>\n",
 645 |        "    <tr style=\"text-align: right;\">\n",
 646 |        "      <th></th>\n",
 647 |        "      <th>one</th>\n",
 648 |        "      <th>two</th>\n",
 649 |        "    </tr>\n",
 650 |        "  </thead>\n",
 651 |        "  <tbody>\n",
 652 |        "    <tr>\n",
 653 |        "      <th>a</th>\n",
 654 |        "      <td>False</td>\n",
 655 |        "      <td>False</td>\n",
 656 |        "    </tr>\n",
 657 |        "    <tr>\n",
 658 |        "      <th>b</th>\n",
 659 |        "      <td>False</td>\n",
 660 |        "      <td>False</td>\n",
 661 |        "    </tr>\n",
 662 |        "    <tr>\n",
 663 |        "      <th>c</th>\n",
 664 |        "      <td>False</td>\n",
 665 |        "      <td>False</td>\n",
 666 |        "    </tr>\n",
 667 |        "    <tr>\n",
 668 |        "      <th>d</th>\n",
 669 |        "      <td>True</td>\n",
 670 |        "      <td>False</td>\n",
 671 |        "    </tr>\n",
 672 |        "  </tbody>\n",
 673 |        "</table>\n",
 674 |        "</div>"
 675 |       ],
 676 |       "text/plain": [
 677 |        "     one    two\n",
 678 |        "a  False  False\n",
 679 |        "b  False  False\n",
 680 |        "c  False  False\n",
 681 |        "d   True  False"
 682 |       ]
 683 |      },
 684 |      "execution_count": 29,
 685 |      "metadata": {},
 686 |      "output_type": "execute_result"
 687 |     }
 688 |    ],
 689 |    "source": [
 690 |     "# Display 1st 5 rows\n",
 691 |     "cs_df.head()\n",
 692 |     "# Display last 5 rows\n",
 693 |     "cs_df.tail()\n",
 694 |     "# Get 1st 2\n",
 695 |     "cs_df[:2]\n",
 696 |     "# Get the 1st 5 rows with a step of 2\n",
 697 |     "cs_df[:5:2]\n",
 698 |     "\n",
 699 |     "# Get indexes\n",
 700 |     "cs_df.index.array\n",
 701 |     "# Get NumPy array\n",
 702 |     "cs_df.to_numpy()\n",
 703 |     "# Get array from series\n",
 704 |     "ser_1.array\n",
 705 |     "\n",
 706 |     "dict_3 = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),\n",
 707 |     "         'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}\n",
 708 |     "df_2 = pd.DataFrame(dict_3)\n",
 709 |     "\n",
 710 |     "# You can replace NaN values with 0 or anything else\n",
 711 |     "print(df_2.fillna(0))\n",
 712 |     "# Get values in row 2\n",
 713 |     "row = df_2.iloc[1]\n",
 714 |     "# Add items in row 2 to all rows including row 2\n",
 715 |     "# You can do the same with sub, mul, and div\n",
 716 |     "df_2.add(row, axis='columns')\n",
 717 |     "\n",
 718 |     "# Get column 2\n",
 719 |     "col = df_2['two']\n",
 720 |     "# Subtract from other columns\n",
 721 |     "df_2.sub(col, axis=0)\n",
 722 |     "\n",
 723 |     "# Check if empty\n",
 724 |     "df_2.empty\n",
 725 |     "\n",
 726 |     "# Transform executes a function on a dataframe\n",
 727 |     "df_5 = pd.DataFrame({'A': range(3), 'B': range(1, 4)})\n",
 728 |     "df_5.transform(lambda x: x+1)\n",
 729 |     "df_5.transform(lambda x: x**2)\n",
 730 |     "df_5.transform(lambda x: np.sqrt(x))\n",
 731 |     "# You can transform using multiple functions\n",
 732 |     "df_5.transform([lambda x: x**2, lambda x: x**3])\n",
 733 |     "# Passing a dictionary allows you to perform different calculations\n",
 734 |     "# on different columns\n",
 735 |     "df_5.transform({'A': lambda x: x**2, 'B': lambda x: x**3})\n",
 736 |     "\n",
 737 |     "# map performs a function on a series\n",
 738 |     "df_5['A'].map(lambda x: x**2)\n",
 739 |     "\n",
 740 |     "# applymap does the same on a dataframe\n",
 741 |     "df_5.applymap(lambda x: x**2)\n",
 742 |     "\n",
 743 |     "# Get unique values in column 2 of DF\n",
 744 |     "df_2['two'].unique()\n",
 745 |     "\n",
 746 |     "# Get number of uniques\n",
 747 |     "df_2['two'].nunique()\n",
 748 |     "\n",
 749 |     "# Get the number of times each value showed in column 2\n",
 750 |     "df_2['two'].value_counts()\n",
 751 |     "\n",
 752 |     "# Get column names\n",
 753 |     "df_2.columns\n",
 754 |     "\n",
 755 |     "# Get index info\n",
 756 |     "df_2.index\n",
 757 |     "\n",
 758 |     "# Return a DF that lists null values as True\n",
 759 |     "df_2.isnull()"
 760 |    ]
 761 |   },
 762 |   {
 763 |    "cell_type": "markdown",
 764 |    "metadata": {},
 765 |    "source": [
 766 |     "### Group Data"
 767 |    ]
 768 |   },
 769 |   {
 770 |    "cell_type": "code",
 771 |    "execution_count": 30,
 772 |    "metadata": {},
 773 |    "outputs": [
 774 |     {
 775 |      "data": {
 776 |       "text/html": [
 777 |        "<div>\n",
 778 |        "<style scoped>\n",
 779 |        "    .dataframe tbody tr th:only-of-type {\n",
 780 |        "        vertical-align: middle;\n",
 781 |        "    }\n",
 782 |        "\n",
 783 |        "    .dataframe tbody tr th {\n",
 784 |        "        vertical-align: top;\n",
 785 |        "    }\n",
 786 |        "\n",
 787 |        "    .dataframe thead tr th {\n",
 788 |        "        text-align: left;\n",
 789 |        "    }\n",
 790 |        "\n",
 791 |        "    .dataframe thead tr:last-of-type th {\n",
 792 |        "        text-align: right;\n",
 793 |        "    }\n",
 794 |        "</style>\n",
 795 |        "<table border=\"1\" class=\"dataframe\">\n",
 796 |        "  <thead>\n",
 797 |        "    <tr>\n",
 798 |        "      <th></th>\n",
 799 |        "      <th colspan=\"8\" halign=\"left\">Sales</th>\n",
 800 |        "    </tr>\n",
 801 |        "    <tr>\n",
 802 |        "      <th></th>\n",
 803 |        "      <th>count</th>\n",
 804 |        "      <th>mean</th>\n",
 805 |        "      <th>std</th>\n",
 806 |        "      <th>min</th>\n",
 807 |        "      <th>25%</th>\n",
 808 |        "      <th>50%</th>\n",
 809 |        "      <th>75%</th>\n",
 810 |        "      <th>max</th>\n",
 811 |        "    </tr>\n",
 812 |        "    <tr>\n",
 813 |        "      <th>Store</th>\n",
 814 |        "      <th></th>\n",
 815 |        "      <th></th>\n",
 816 |        "      <th></th>\n",
 817 |        "      <th></th>\n",
 818 |        "      <th></th>\n",
 819 |        "      <th></th>\n",
 820 |        "      <th></th>\n",
 821 |        "      <th></th>\n",
 822 |        "    </tr>\n",
 823 |        "  </thead>\n",
 824 |        "  <tbody>\n",
 825 |        "    <tr>\n",
 826 |        "      <th>1</th>\n",
 827 |        "      <td>2.0</td>\n",
 828 |        "      <td>22.0</td>\n",
 829 |        "      <td>5.656854</td>\n",
 830 |        "      <td>18.0</td>\n",
 831 |        "      <td>20.0</td>\n",
 832 |        "      <td>22.0</td>\n",
 833 |        "      <td>24.0</td>\n",
 834 |        "      <td>26.0</td>\n",
 835 |        "    </tr>\n",
 836 |        "    <tr>\n",
 837 |        "      <th>2</th>\n",
 838 |        "      <td>2.0</td>\n",
 839 |        "      <td>17.0</td>\n",
 840 |        "      <td>7.071068</td>\n",
 841 |        "      <td>12.0</td>\n",
 842 |        "      <td>14.5</td>\n",
 843 |        "      <td>17.0</td>\n",
 844 |        "      <td>19.5</td>\n",
 845 |        "      <td>22.0</td>\n",
 846 |        "    </tr>\n",
 847 |        "  </tbody>\n",
 848 |        "</table>\n",
 849 |        "</div>"
 850 |       ],
 851 |       "text/plain": [
 852 |        "      Sales                                              \n",
 853 |        "      count  mean       std   min   25%   50%   75%   max\n",
 854 |        "Store                                                    \n",
 855 |        "1       2.0  22.0  5.656854  18.0  20.0  22.0  24.0  26.0\n",
 856 |        "2       2.0  17.0  7.071068  12.0  14.5  17.0  19.5  22.0"
 857 |       ]
 858 |      },
 859 |      "execution_count": 30,
 860 |      "metadata": {},
 861 |      "output_type": "execute_result"
 862 |     }
 863 |    ],
 864 |    "source": [
 865 |     "# Groupby allows you to group rows based on a column and perform a function\n",
 866 |     "# that combines those values (Aggregate Function)\n",
 867 |     "dict_5 = {'Store': [1,2,1,2], 'Flavor': ['Choc', 'Van', 'Straw', 'Choc'], \n",
 868 |     "         'Sales': [26, 12, 18, 22]}\n",
 869 |     "\n",
 870 |     "df_11 = pd.DataFrame(dict_5)\n",
 871 |     "\n",
 872 |     "# Group data by the store number\n",
 873 |     "by_store = df_11.groupby('Store')\n",
 874 |     "# Get mean sales by store\n",
 875 |     "by_store.mean()\n",
 876 |     "\n",
 877 |     "# Get sales total just for store 1\n",
 878 |     "by_store.sum().loc[1]\n",
 879 |     "\n",
 880 |     "# You can use multiple functions, or get a bunch of stats at once\n",
 881 |     "by_store.describe()"
 882 |    ]
 883 |   },
 884 |   {
 885 |    "cell_type": "markdown",
 886 |    "metadata": {},
 887 |    "source": [
 888 |     "### Concatenate, Merge & Join Data"
 889 |    ]
 890 |   },
 891 |   {
 892 |    "cell_type": "code",
 893 |    "execution_count": 31,
 894 |    "metadata": {},
 895 |    "outputs": [
 896 |     {
 897 |      "data": {
 898 |       "text/html": [
 899 |        "<div>\n",
 900 |        "<style scoped>\n",
 901 |        "    .dataframe tbody tr th:only-of-type {\n",
 902 |        "        vertical-align: middle;\n",
 903 |        "    }\n",
 904 |        "\n",
 905 |        "    .dataframe tbody tr th {\n",
 906 |        "        vertical-align: top;\n",
 907 |        "    }\n",
 908 |        "\n",
 909 |        "    .dataframe thead th {\n",
 910 |        "        text-align: right;\n",
 911 |        "    }\n",
 912 |        "</style>\n",
 913 |        "<table border=\"1\" class=\"dataframe\">\n",
 914 |        "  <thead>\n",
 915 |        "    <tr style=\"text-align: right;\">\n",
 916 |        "      <th></th>\n",
 917 |        "      <th>A</th>\n",
 918 |        "      <th>B</th>\n",
 919 |        "      <th>C</th>\n",
 920 |        "      <th>D</th>\n",
 921 |        "    </tr>\n",
 922 |        "  </thead>\n",
 923 |        "  <tbody>\n",
 924 |        "    <tr>\n",
 925 |        "      <th>1</th>\n",
 926 |        "      <td>1.0</td>\n",
 927 |        "      <td>4.0</td>\n",
 928 |        "      <td>7.0</td>\n",
 929 |        "      <td>10.0</td>\n",
 930 |        "    </tr>\n",
 931 |        "    <tr>\n",
 932 |        "      <th>2</th>\n",
 933 |        "      <td>2.0</td>\n",
 934 |        "      <td>5.0</td>\n",
 935 |        "      <td>NaN</td>\n",
 936 |        "      <td>NaN</td>\n",
 937 |        "    </tr>\n",
 938 |        "    <tr>\n",
 939 |        "      <th>3</th>\n",
 940 |        "      <td>3.0</td>\n",
 941 |        "      <td>6.0</td>\n",
 942 |        "      <td>NaN</td>\n",
 943 |        "      <td>NaN</td>\n",
 944 |        "    </tr>\n",
 945 |        "    <tr>\n",
 946 |        "      <th>4</th>\n",
 947 |        "      <td>NaN</td>\n",
 948 |        "      <td>NaN</td>\n",
 949 |        "      <td>8.0</td>\n",
 950 |        "      <td>11.0</td>\n",
 951 |        "    </tr>\n",
 952 |        "    <tr>\n",
 953 |        "      <th>5</th>\n",
 954 |        "      <td>NaN</td>\n",
 955 |        "      <td>NaN</td>\n",
 956 |        "      <td>9.0</td>\n",
 957 |        "      <td>12.0</td>\n",
 958 |        "    </tr>\n",
 959 |        "  </tbody>\n",
 960 |        "</table>\n",
 961 |        "</div>"
 962 |       ],
 963 |       "text/plain": [
 964 |        "     A    B    C     D\n",
 965 |        "1  1.0  4.0  7.0  10.0\n",
 966 |        "2  2.0  5.0  NaN   NaN\n",
 967 |        "3  3.0  6.0  NaN   NaN\n",
 968 |        "4  NaN  NaN  8.0  11.0\n",
 969 |        "5  NaN  NaN  9.0  12.0"
 970 |       ]
 971 |      },
 972 |      "execution_count": 31,
 973 |      "metadata": {},
 974 |      "output_type": "execute_result"
 975 |     }
 976 |    ],
 977 |    "source": [
 978 |     "# You can concatenate DFs in the order DFs are provided\n",
 979 |     "df_12 = pd.DataFrame({'A': [1,2,3],\n",
 980 |     "                     'B': [4,5,6]},\n",
 981 |     "                    index=[1,2,3])\n",
 982 |     "df_13 = pd.DataFrame({'A': [7,8,9],\n",
 983 |     "                     'B': [10,11,12]},\n",
 984 |     "                    index=[4,5,6])\n",
 985 |     "pd.concat([df_12, df_13])\n",
 986 |     "\n",
 987 |     "# Merge 2 DFs using their shared key column\n",
 988 |     "df_12 = pd.DataFrame({'A': [1,2,3],\n",
 989 |     "                     'B': [4,5,6],\n",
 990 |     "                     'key': [1,2,3]})\n",
 991 |     "df_13 = pd.DataFrame({'A': [7,8,9],\n",
 992 |     "                     'B': [10,11,12],\n",
 993 |     "                     'key': [1,2,3]})\n",
 994 |     "# inner merges at the intersection of keys\n",
 995 |     "pd.merge(df_12, df_13, how='inner', on='key')\n",
 996 |     "# how='left' or 'right' : Use keys from left or right frame\n",
 997 |     "# how='outer' : Use union of keys\n",
 998 |     "\n",
 999 |     "# You can join DFs with different indexes; join() aligns rows on \n",
1000 |     "# the index instead of on a key column\n",
1001 |     "df_12 = pd.DataFrame({'A': [1,2,3],\n",
1002 |     "                     'B': [4,5,6]},\n",
1003 |     "                    index=[1,2,3])\n",
1004 |     "df_13 = pd.DataFrame({'C': [7,8,9],\n",
1005 |     "                     'D': [10,11,12]},\n",
1006 |     "                    index=[1,4,5])\n",
1007 |     "df_12.join(df_13, how='outer')"
1008 |    ]
1009 |   },
1010 |   {
1011 |    "cell_type": "markdown",
1012 |    "metadata": {},
1013 |    "source": [
1014 |     "### Statistics"
1015 |    ]
1016 |   },
1017 |   {
1018 |    "cell_type": "code",
1019 |    "execution_count": 33,
1020 |    "metadata": {},
1021 |    "outputs": [
1022 |     {
1023 |      "name": "stdout",
1024 |      "output_type": "stream",
1025 |      "text": [
1026 |       "   one  two\n",
1027 |       "a  1.0  1.0\n",
1028 |       "b  2.0  2.0\n",
1029 |       "c  3.0  3.0\n",
1030 |       "d  NaN  4.0\n"
1031 |      ]
1032 |     },
1033 |     {
1034 |      "data": {
1035 |       "text/html": [
1036 |        "<div>\n",
1037 |        "<style scoped>\n",
1038 |        "    .dataframe tbody tr th:only-of-type {\n",
1039 |        "        vertical-align: middle;\n",
1040 |        "    }\n",
1041 |        "\n",
1042 |        "    .dataframe tbody tr th {\n",
1043 |        "        vertical-align: top;\n",
1044 |        "    }\n",
1045 |        "\n",
1046 |        "    .dataframe thead th {\n",
1047 |        "        text-align: right;\n",
1048 |        "    }\n",
1049 |        "</style>\n",
1050 |        "<table border=\"1\" class=\"dataframe\">\n",
1051 |        "  <thead>\n",
1052 |        "    <tr style=\"text-align: right;\">\n",
1053 |        "      <th></th>\n",
1054 |        "      <th>one</th>\n",
1055 |        "      <th>two</th>\n",
1056 |        "    </tr>\n",
1057 |        "  </thead>\n",
1058 |        "  <tbody>\n",
1059 |        "    <tr>\n",
1060 |        "      <th>mean</th>\n",
1061 |        "      <td>2.0</td>\n",
1062 |        "      <td>2.500000</td>\n",
1063 |        "    </tr>\n",
1064 |        "    <tr>\n",
1065 |        "      <th>std</th>\n",
1066 |        "      <td>1.0</td>\n",
1067 |        "      <td>1.290994</td>\n",
1068 |        "    </tr>\n",
1069 |        "  </tbody>\n",
1070 |        "</table>\n",
1071 |        "</div>"
1072 |       ],
1073 |       "text/plain": [
1074 |        "      one       two\n",
1075 |        "mean  2.0  2.500000\n",
1076 |        "std   1.0  1.290994"
1077 |       ]
1078 |      },
1079 |      "execution_count": 33,
1080 |      "metadata": {},
1081 |      "output_type": "execute_result"
1082 |     }
1083 |    ],
1084 |    "source": [
1085 |     "# Get ice cream sales data\n",
1086 |     "ics_df = pd.read_csv('icecreamsales.csv')\n",
1087 |     "ics_df\n",
1088 |     "\n",
1089 |     "# Get total count of both columns\n",
1090 |     "ics_df.count()\n",
1091 |     "\n",
1092 |     "# skipna skips null / NaN values\n",
1093 |     "ics_df.sum(skipna=True)\n",
1094 |     "# Get mean for named column\n",
1095 |     "ics_df[\"Sales\"].mean()\n",
1096 |     "ics_df[\"Sales\"].median()\n",
1097 |     "ics_df[\"Sales\"].mode()\n",
1098 |     "ics_df[\"Sales\"].min()\n",
1099 |     "ics_df[\"Sales\"].max()\n",
1100 |     "ics_df[\"Sales\"].prod() # Product of values\n",
1101 |     "ics_df[\"Sales\"].std() # Standard deviation\n",
1102 |     "ics_df[\"Sales\"].var() # Variance\n",
1103 |     "ics_df[\"Sales\"].sem() # Standard error\n",
1104 |     "# Negative : Left long tail, Positive : Right long tail\n",
1105 |     "ics_df[\"Sales\"].skew()\n",
1106 |     "# Kurtosis (pandas returns excess kurtosis, Fisher's definition) :\n",
1107 |     "# < 0 less outliers, 0 Normal Distribution, > 0 more outliers\n",
1108 |     "ics_df[\"Sales\"].kurt()\n",
1109 |     "ics_df[\"Sales\"].quantile(.5)\n",
1110 |     "ics_df[\"Sales\"].cumsum()\n",
1111 |     "ics_df[\"Sales\"].cumprod()\n",
1112 |     "ics_df[\"Sales\"].cummax()\n",
1113 |     "ics_df[\"Sales\"].cummin()\n",
1114 |     "\n",
1115 |     "# Multiple stats at once\n",
1116 |     "ics_df.describe()\n",
1117 |     "\n",
1118 |     "ser_dice = pd.Series(data=[2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, \n",
1119 |     "                           6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8,\n",
1120 |     "                          8, 8, 9, 9, 9, 9, 10, 10, 10, 11, 11, 12])\n",
1121 |     "# Count for each value in series\n",
1122 |     "ser_dice.value_counts()\n",
1123 |     "\n",
1124 |     "# You can perform calculations on multiple columns using\n",
1125 |     "# aggregate\n",
1126 |     "print(df_2)\n",
1127 |     "df_2.agg(np.mean)\n",
1128 |     "\n",
1129 |     "# You can do this with multiple functions\n",
1130 |     "df_2.agg(['mean', 'std'])\n",
1131 |     "\n"
1132 |    ]
1133 |   },
1134 |   {
1135 |    "cell_type": "markdown",
1136 |    "metadata": {},
1137 |    "source": [
1138 |     "### Iteration"
1139 |    ]
1140 |   },
1141 |   {
1142 |    "cell_type": "code",
1143 |    "execution_count": 37,
1144 |    "metadata": {},
1145 |    "outputs": [
1146 |     {
1147 |      "name": "stdout",
1148 |      "output_type": "stream",
1149 |      "text": [
1150 |       "0\n",
1151 |       "1\n",
1152 |       "2\n",
1153 |       "3\n",
1154 |       "4\n",
1155 |       "\n",
1156 |       "    C   D   E\n",
1157 |       "B  22  40  23\n",
1158 |       "C  44  42  45\n",
1159 |       "C\n",
1160 |       "B    22\n",
1161 |       "C    44\n",
1162 |       "Name: C, dtype: int64\n",
1163 |       "D\n",
1164 |       "B    40\n",
1165 |       "C    42\n",
1166 |       "Name: D, dtype: int64\n",
1167 |       "E\n",
1168 |       "B    23\n",
1169 |       "C    45\n",
1170 |       "Name: E, dtype: int64\n",
1171 |       "\n",
1172 |       "B\n",
1173 |       "C    22\n",
1174 |       "D    40\n",
1175 |       "E    23\n",
1176 |       "Name: B, dtype: int64\n",
1177 |       "C\n",
1178 |       "C    44\n",
1179 |       "D    42\n",
1180 |       "E    45\n",
1181 |       "Name: C, dtype: int64\n",
1182 |       "\n",
1183 |       "Pandas(Index='B', C=22, D=40, E=23)\n",
1184 |       "Pandas(Index='C', C=44, D=42, E=45)\n"
1185 |      ]
1186 |     }
1187 |    ],
1188 |    "source": [
1189 |     "# Iterating over series\n",
1190 |     "ser_7 = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])\n",
1191 |     "for col in ser_7:\n",
1192 |     "    print(col)\n",
1193 |     "    \n",
1194 |     "print()\n",
1195 |     "# Iterating over DFs\n",
1196 |     "arr_4 = np.random.randint(10, 50, size=(2, 3))\n",
1197 |     "df_8 = pd.DataFrame(arr_4, ['B', 'C'], ['C', 'D', 'E'])\n",
1198 |     "print(df_8)\n",
1199 |     "\n",
1200 |     "# items allows you to iterate through key value pairs to make\n",
1201 |     "# calculations 1 column at a time\n",
1202 |     "for label, ser in df_8.items():\n",
1203 |     "    print(label)\n",
1204 |     "    print(ser)\n",
1205 |     "    \n",
1206 |     "print()\n",
1207 |     "# You can also iterate through rows\n",
1208 |     "for index, row in df_8.iterrows():\n",
1209 |     "    print(f\"{index}\\n{row}\")\n",
1210 |     "print()\n",
1211 |     "\n",
1212 |     "# Get a tuple that contains row data\n",
1213 |     "for row in df_8.itertuples():\n",
1214 |     "    print(row)"
1215 |    ]
1216 |   },
1217 |   {
1218 |    "cell_type": "markdown",
1219 |    "metadata": {},
1220 |    "source": [
1221 |     "### Sorting"
1222 |    ]
1223 |   },
1224 |   {
1225 |    "cell_type": "code",
1226 |    "execution_count": 38,
1227 |    "metadata": {},
1228 |    "outputs": [
1229 |     {
1230 |      "data": {
1231 |       "text/html": [
1232 |        "<div>\n",
1233 |        "<style scoped>\n",
1234 |        "    .dataframe tbody tr th:only-of-type {\n",
1235 |        "        vertical-align: middle;\n",
1236 |        "    }\n",
1237 |        "\n",
1238 |        "    .dataframe tbody tr th {\n",
1239 |        "        vertical-align: top;\n",
1240 |        "    }\n",
1241 |        "\n",
1242 |        "    .dataframe thead th {\n",
1243 |        "        text-align: right;\n",
1244 |        "    }\n",
1245 |        "</style>\n",
1246 |        "<table border=\"1\" class=\"dataframe\">\n",
1247 |        "  <thead>\n",
1248 |        "    <tr style=\"text-align: right;\">\n",
1249 |        "      <th></th>\n",
1250 |        "      <th>C</th>\n",
1251 |        "      <th>D</th>\n",
1252 |        "      <th>E</th>\n",
1253 |        "    </tr>\n",
1254 |        "  </thead>\n",
1255 |        "  <tbody>\n",
1256 |        "    <tr>\n",
1257 |        "      <th>B</th>\n",
1258 |        "      <td>22</td>\n",
1259 |        "      <td>40</td>\n",
1260 |        "      <td>23</td>\n",
1261 |        "    </tr>\n",
1262 |        "    <tr>\n",
1263 |        "      <th>C</th>\n",
1264 |        "      <td>44</td>\n",
1265 |        "      <td>42</td>\n",
1266 |        "      <td>45</td>\n",
1267 |        "    </tr>\n",
1268 |        "  </tbody>\n",
1269 |        "</table>\n",
1270 |        "</div>"
1271 |       ],
1272 |       "text/plain": [
1273 |        "    C   D   E\n",
1274 |        "B  22  40  23\n",
1275 |        "C  44  42  45"
1276 |       ]
1277 |      },
1278 |      "execution_count": 38,
1279 |      "metadata": {},
1280 |      "output_type": "execute_result"
1281 |     }
1282 |    ],
1283 |    "source": [
1284 |     "df_8\n",
1285 |     "\n",
1286 |     "# Sorting by index will return the same results if indexes\n",
1287 |     "# are already in order; to reverse the order set ascending to False\n",
1288 |     "df_8.sort_index(ascending=False)\n",
1289 |     "\n",
1290 |     "# Sort by value for column D (Use the same function for series)\n",
1291 |     "df_8.sort_values(by='D')\n"
1292 |    ]
1293 |   },
1294 |   {
1295 |    "cell_type": "markdown",
1296 |    "metadata": {},
1297 |    "source": [
1298 |     "### Passing Data to Functions"
1299 |    ]
1300 |   },
1301 |   {
1302 |    "cell_type": "code",
1303 |    "execution_count": 39,
1304 |    "metadata": {},
1305 |    "outputs": [
1306 |     {
1307 |      "name": "stdout",
1308 |      "output_type": "stream",
1309 |      "text": [
1310 |       "Total Profit : 5459.010000000001\n"
1311 |      ]
1312 |     },
1313 |     {
1314 |      "data": {
1315 |       "text/html": [
1316 |        "<div>\n",
1317 |        "<style scoped>\n",
1318 |        "    .dataframe tbody tr th:only-of-type {\n",
1319 |        "        vertical-align: middle;\n",
1320 |        "    }\n",
1321 |        "\n",
1322 |        "    .dataframe tbody tr th {\n",
1323 |        "        vertical-align: top;\n",
1324 |        "    }\n",
1325 |        "\n",
1326 |        "    .dataframe thead th {\n",
1327 |        "        text-align: right;\n",
1328 |        "    }\n",
1329 |        "</style>\n",
1330 |        "<table border=\"1\" class=\"dataframe\">\n",
1331 |        "  <thead>\n",
1332 |        "    <tr style=\"text-align: right;\">\n",
1333 |        "      <th></th>\n",
1334 |        "      <th>Sale ID</th>\n",
1335 |        "      <th>Contact</th>\n",
1336 |        "      <th>Sex</th>\n",
1337 |        "      <th>Age</th>\n",
1338 |        "      <th>State</th>\n",
1339 |        "      <th>Product ID</th>\n",
1340 |        "      <th>Product Type</th>\n",
1341 |        "      <th>Sale Price</th>\n",
1342 |        "      <th>Profit</th>\n",
1343 |        "      <th>Lead</th>\n",
1344 |        "      <th>Month</th>\n",
1345 |        "      <th>Year</th>\n",
1346 |        "      <th>First Name</th>\n",
1347 |        "      <th>Last Name</th>\n",
1348 |        "      <th>Age Group</th>\n",
1349 |        "    </tr>\n",
1350 |        "  </thead>\n",
1351 |        "  <tbody>\n",
1352 |        "    <tr>\n",
1353 |        "      <th>0</th>\n",
1354 |        "      <td>1</td>\n",
1355 |        "      <td>Paul Thomas</td>\n",
1356 |        "      <td>M</td>\n",
1357 |        "      <td>43</td>\n",
1358 |        "      <td>OH</td>\n",
1359 |        "      <td>M01-F0024</td>\n",
1360 |        "      <td>Desktop</td>\n",
1361 |        "      <td>479.99</td>\n",
1362 |        "      <td>143.39</td>\n",
1363 |        "      <td>Website</td>\n",
1364 |        "      <td>January</td>\n",
1365 |        "      <td>2018</td>\n",
1366 |        "      <td>Paul</td>\n",
1367 |        "      <td>Thomas</td>\n",
1368 |        "      <td>30-50</td>\n",
1369 |        "    </tr>\n",
1370 |        "    <tr>\n",
1371 |        "      <th>1</th>\n",
1372 |        "      <td>2</td>\n",
1373 |        "      <td>Margo Simms</td>\n",
1374 |        "      <td>F</td>\n",
1375 |        "      <td>37</td>\n",
1376 |        "      <td>WV</td>\n",
1377 |        "      <td>GT13-0024</td>\n",
1378 |        "      <td>Desktop</td>\n",
1379 |        "      <td>1249.99</td>\n",
1380 |        "      <td>230.89</td>\n",
1381 |        "      <td>Flyer 4</td>\n",
1382 |        "      <td>January</td>\n",
1383 |        "      <td>2018</td>\n",
1384 |        "      <td>Margo</td>\n",
1385 |        "      <td>Simms</td>\n",
1386 |        "      <td>30-50</td>\n",
1387 |        "    </tr>\n",
1388 |        "    <tr>\n",
1389 |        "      <th>2</th>\n",
1390 |        "      <td>3</td>\n",
1391 |        "      <td>Sam Stine</td>\n",
1392 |        "      <td>M</td>\n",
1393 |        "      <td>26</td>\n",
1394 |        "      <td>PA</td>\n",
1395 |        "      <td>I3670</td>\n",
1396 |        "      <td>Desktop</td>\n",
1397 |        "      <td>649.99</td>\n",
1398 |        "      <td>118.64</td>\n",
1399 |        "      <td>Website</td>\n",
1400 |        "      <td>February</td>\n",
1401 |        "      <td>2018</td>\n",
1402 |        "      <td>Sam</td>\n",
1403 |        "      <td>Stine</td>\n",
1404 |        "      <td>&lt;30</td>\n",
1405 |        "    </tr>\n",
1406 |        "    <tr>\n",
1407 |        "      <th>3</th>\n",
1408 |        "      <td>4</td>\n",
1409 |        "      <td>Moe Eggert</td>\n",
1410 |        "      <td>M</td>\n",
1411 |        "      <td>35</td>\n",
1412 |        "      <td>PA</td>\n",
1413 |        "      <td>I3593</td>\n",
1414 |        "      <td>Laptop</td>\n",
1415 |        "      <td>399.99</td>\n",
1416 |        "      <td>72.09</td>\n",
1417 |        "      <td>Website</td>\n",
1418 |        "      <td>March</td>\n",
1419 |        "      <td>2018</td>\n",
1420 |        "      <td>Moe</td>\n",
1421 |        "      <td>Eggert</td>\n",
1422 |        "      <td>30-50</td>\n",
1423 |        "    </tr>\n",
1424 |        "    <tr>\n",
1425 |        "      <th>4</th>\n",
1426 |        "      <td>5</td>\n",
1427 |        "      <td>Jessica Elk</td>\n",
1428 |        "      <td>F</td>\n",
1429 |        "      <td>55</td>\n",
1430 |        "      <td>PA</td>\n",
1431 |        "      <td>15M-ED</td>\n",
1432 |        "      <td>Laptop</td>\n",
1433 |        "      <td>699.99</td>\n",
1434 |        "      <td>98.09</td>\n",
1435 |        "      <td>Flyer 4</td>\n",
1436 |        "      <td>March</td>\n",
1437 |        "      <td>2018</td>\n",
1438 |        "      <td>Jessica</td>\n",
1439 |        "      <td>Elk</td>\n",
1440 |        "      <td>&gt;50</td>\n",
1441 |        "    </tr>\n",
1442 |        "  </tbody>\n",
1443 |        "</table>\n",
1444 |        "</div>"
1445 |       ],
1446 |       "text/plain": [
1447 |        "   Sale ID      Contact Sex  Age State Product ID Product Type  Sale Price  \\\n",
1448 |        "0        1  Paul Thomas   M   43    OH  M01-F0024      Desktop      479.99   \n",
1449 |        "1        2  Margo Simms   F   37    WV  GT13-0024      Desktop     1249.99   \n",
1450 |        "2        3    Sam Stine   M   26    PA      I3670      Desktop      649.99   \n",
1451 |        "3        4   Moe Eggert   M   35    PA      I3593       Laptop      399.99   \n",
1452 |        "4        5  Jessica Elk   F   55    PA     15M-ED       Laptop      699.99   \n",
1453 |        "\n",
1454 |        "   Profit     Lead     Month  Year First Name Last Name Age Group  \n",
1455 |        "0  143.39  Website   January  2018       Paul    Thomas     30-50  \n",
1456 |        "1  230.89  Flyer 4   January  2018      Margo     Simms     30-50  \n",
1457 |        "2  118.64  Website  February  2018        Sam     Stine       <30  \n",
1458 |        "3   72.09  Website     March  2018        Moe    Eggert     30-50  \n",
1459 |        "4   98.09  Flyer 4     March  2018    Jessica       Elk       >50  "
1460 |       ]
1461 |      },
1462 |      "execution_count": 39,
1463 |      "metadata": {},
1464 |      "output_type": "execute_result"
1465 |     }
1466 |    ],
1467 |    "source": [
1468 |     "import sys\n",
1469 |     "\n",
1470 |     "# You can pass DataFrames and Series into functions\n",
1471 |     "def get_profit_total(df):\n",
1472 |     "    prof_ser = df['Profit']\n",
1473 |     "    print(f\"Total Profit : {prof_ser.sum()}\")\n",
1474 |     "\n",
1475 |     "get_profit_total(cs_df)\n",
1476 |     "\n",
1477 |     "# Receives a DataFrame, splits the contact into new columns\n",
1478 |     "# being first and last name\n",
1479 |     "def split_name(df):\n",
1480 |     "    def get_names(full_name):\n",
1481 |     "        # Split contact at space\n",
1482 |     "        f_name, l_name = full_name.split()\n",
1483 |     "        # Create a series with first & last names in columns\n",
1484 |     "        # with those labels\n",
1485 |     "        return pd.Series(\n",
1486 |     "        (f_name, l_name),\n",
1487 |     "        index=['First Name', 'Last Name']\n",
1488 |     "        )\n",
1489 |     "    # apply() executes the function on all names in Contact column\n",
1490 |     "    names = df['Contact'].apply(get_names)\n",
1491 |     "    df[names.columns] = names\n",
1492 |     "    return df\n",
1493 |     "\n",
1494 |     "# Run function and display top 5 results\n",
1495 |     "split_name(cs_df).head()\n",
1496 |     "\n",
1497 |     "# Will assign people to different age groups based on age\n",
1498 |     "def create_age_groups(df):\n",
1499 |     "    # Must have 1 more bin edge than labels\n",
1500 |     "    bins = [0, 30, 50, sys.maxsize]\n",
1501 |     "    # Group labels\n",
1502 |     "    labels = ['<30', '30-50', '>50']\n",
1503 |     "    \n",
1504 |     "    # cut puts values into certain groups based on intervals\n",
1505 |     "    # Ages in (0, 30] are labeled <30, ages in (30, 50] are labeled\n",
1506 |     "    # 30-50 and so on (the right bin edge is included by default)\n",
1507 |     "    age_group = pd.cut(df['Age'], bins=bins, labels=labels)\n",
1508 |     "    # Create new column and return new dataframe info\n",
1509 |     "    df['Age Group'] = age_group\n",
1510 |     "    return df\n",
1511 |     "\n",
1512 |     "create_age_groups(cs_df)\n",
1513 |     "\n",
1514 |     "# You can use a pipe to pass a dataframe to multiple functions\n",
1515 |     "cs_df.pipe(split_name).pipe(create_age_groups).head()\n",
1516 |     "\n"
1517 |    ]
1518 |   },
1519 |   {
1520 |    "cell_type": "markdown",
1521 |    "metadata": {},
1522 |    "source": [
1523 |     "### Aligning, Reindexing and Renaming Labels"
1524 |    ]
1525 |   },
1526 |   {
1527 |    "cell_type": "code",
1528 |    "execution_count": 41,
1529 |    "metadata": {},
1530 |    "outputs": [
1531 |     {
1532 |      "name": "stdout",
1533 |      "output_type": "stream",
1534 |      "text": [
1535 |       "a    0\n",
1536 |       "b    1\n",
1537 |       "c    2\n",
1538 |       "d    3\n",
1539 |       "dtype: int64\n",
1540 |       "b    1\n",
1541 |       "c    2\n",
1542 |       "d    3\n",
1543 |       "e    4\n",
1544 |       "dtype: int64\n"
1545 |      ]
1546 |     },
1547 |     {
1548 |      "data": {
1549 |       "text/html": [
1550 |        "<div>\n",
1551 |        "<style scoped>\n",
1552 |        "    .dataframe tbody tr th:only-of-type {\n",
1553 |        "        vertical-align: middle;\n",
1554 |        "    }\n",
1555 |        "\n",
1556 |        "    .dataframe tbody tr th {\n",
1557 |        "        vertical-align: top;\n",
1558 |        "    }\n",
1559 |        "\n",
1560 |        "    .dataframe thead th {\n",
1561 |        "        text-align: right;\n",
1562 |        "    }\n",
1563 |        "</style>\n",
1564 |        "<table border=\"1\" class=\"dataframe\">\n",
1565 |        "  <thead>\n",
1566 |        "    <tr style=\"text-align: right;\">\n",
1567 |        "      <th></th>\n",
1568 |        "      <th>Men</th>\n",
1569 |        "      <th>Women</th>\n",
1570 |        "      <th>Pets</th>\n",
1571 |        "    </tr>\n",
1572 |        "  </thead>\n",
1573 |        "  <tbody>\n",
1574 |        "    <tr>\n",
1575 |        "      <th>1</th>\n",
1576 |        "      <td>36</td>\n",
1577 |        "      <td>23</td>\n",
1578 |        "      <td>38</td>\n",
1579 |        "    </tr>\n",
1580 |        "    <tr>\n",
1581 |        "      <th>2</th>\n",
1582 |        "      <td>22</td>\n",
1583 |        "      <td>32</td>\n",
1584 |        "      <td>16</td>\n",
1585 |        "    </tr>\n",
1586 |        "  </tbody>\n",
1587 |        "</table>\n",
1588 |        "</div>"
1589 |       ],
1590 |       "text/plain": [
1591 |        "   Men  Women  Pets\n",
1592 |        "1   36     23    38\n",
1593 |        "2   22     32    16"
1594 |       ]
1595 |      },
1596 |      "execution_count": 41,
1597 |      "metadata": {},
1598 |      "output_type": "execute_result"
1599 |     }
1600 |    ],
1601 |    "source": [
1602 |     "ser_6 = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])\n",
1603 |     "sl_1 = ser_6[:4]\n",
1604 |     "sl_2 = ser_6[1:]\n",
1605 |     "print(sl_1)\n",
1606 |     "print(sl_2)\n",
1607 |     "# Align both series by the union of their indexes\n",
1608 |     "sl_1.align(sl_2)\n",
1609 |     "# Align by calling series\n",
1610 |     "sl_1.align(sl_2, join='left')\n",
1611 |     "# Use passed series indexes\n",
1612 |     "sl_1.align(sl_2, join='right')\n",
1613 |     "# Get where indexes intersect\n",
1614 |     "sl_1.align(sl_2, join='inner')\n",
1615 |     "\n",
1616 |     "# You can use align with DFs as well\n",
1617 |     "arr_3 = np.random.randint(10, 50, size=(2, 3))\n",
1618 |     "df_6 = pd.DataFrame(arr_3, ['A', 'B'], ['C', 'D', 'E'])\n",
1619 |     "arr_3 = np.random.randint(10, 50, size=(2, 3))\n",
1620 |     "df_7 = pd.DataFrame(arr_3, ['B', 'C'], ['C', 'D', 'E'])\n",
1621 |     "df_6\n",
1622 |     "df_6.align(df_7)\n",
1623 |     "\n",
1624 |     "# reindex allows you to align data by index\n",
1625 |     "ser_6.reindex(['c','b','a'])\n",
1626 |     "\n",
1627 |     "# Do the same with DFs\n",
1628 |     "df_6.reindex(['B','A'])\n",
1629 |     "\n",
1630 |     "# Drop is very similar to reindex except it receives labels\n",
1631 |     "# you don't want to include\n",
1632 |     "df_6.drop(['A'], axis=0)\n",
1633 |     "df_6.drop(['D'], axis=1)\n",
1634 |     "\n",
1635 |     "# You can rename labels\n",
1636 |     "df_6.rename(columns={'C': 'Men', 'D': 'Women', 'E': 'Pets'},\n",
1637 |     "           index={'A': 1, 'B': 2})"
1638 |    ]
1639 |   },
1640 |   {
1641 |    "cell_type": "markdown",
1642 |    "metadata": {},
1643 |    "source": [
1644 |     "### MultiIndex"
1645 |    ]
1646 |   },
1647 |   {
1648 |    "cell_type": "code",
1649 |    "execution_count": 42,
1650 |    "metadata": {},
1651 |    "outputs": [
1652 |     {
1653 |      "name": "stdout",
1654 |      "output_type": "stream",
1655 |      "text": [
1656 |       "[('Day 1', 1), ('Day 1', 2), ('Day 1', 3), ('Day 2', 1), ('Day 2', 2), ('Day 2', 3)]\n",
1657 |       "           M    F\n",
1658 |       "Day 1 1  682  514\n",
1659 |       "      2  525  613\n",
1660 |       "      3  542  576\n",
1661 |       "Day 2 1  553  651\n",
1662 |       "      2  676  677\n",
1663 |       "      3  645  676\n"
1664 |      ]
1665 |     },
1666 |     {
1667 |      "data": {
1668 |       "text/html": [
1669 |        "<div>\n",
1670 |        "<style scoped>\n",
1671 |        "    .dataframe tbody tr th:only-of-type {\n",
1672 |        "        vertical-align: middle;\n",
1673 |        "    }\n",
1674 |        "\n",
1675 |        "    .dataframe tbody tr th {\n",
1676 |        "        vertical-align: top;\n",
1677 |        "    }\n",
1678 |        "\n",
1679 |        "    .dataframe thead th {\n",
1680 |        "        text-align: right;\n",
1681 |        "    }\n",
1682 |        "</style>\n",
1683 |        "<table border=\"1\" class=\"dataframe\">\n",
1684 |        "  <thead>\n",
1685 |        "    <tr style=\"text-align: right;\">\n",
1686 |        "      <th></th>\n",
1687 |        "      <th>C</th>\n",
1688 |        "      <th>F</th>\n",
1689 |        "      <th>M</th>\n",
1690 |        "    </tr>\n",
1691 |        "    <tr>\n",
1692 |        "      <th>A</th>\n",
1693 |        "      <th>B</th>\n",
1694 |        "      <th></th>\n",
1695 |        "      <th></th>\n",
1696 |        "    </tr>\n",
1697 |        "  </thead>\n",
1698 |        "  <tbody>\n",
1699 |        "    <tr>\n",
1700 |        "      <th rowspan=\"3\" valign=\"top\">Day 1</th>\n",
1701 |        "      <th>1</th>\n",
1702 |        "      <td>NaN</td>\n",
1703 |        "      <td>1.0</td>\n",
1704 |        "    </tr>\n",
1705 |        "    <tr>\n",
1706 |        "      <th>2</th>\n",
1707 |        "      <td>2.0</td>\n",
1708 |        "      <td>NaN</td>\n",
1709 |        "    </tr>\n",
1710 |        "    <tr>\n",
1711 |        "      <th>3</th>\n",
1712 |        "      <td>NaN</td>\n",
1713 |        "      <td>3.0</td>\n",
1714 |        "    </tr>\n",
1715 |        "    <tr>\n",
1716 |        "      <th rowspan=\"3\" valign=\"top\">Day 2</th>\n",
1717 |        "      <th>1</th>\n",
1718 |        "      <td>4.0</td>\n",
1719 |        "      <td>NaN</td>\n",
1720 |        "    </tr>\n",
1721 |        "    <tr>\n",
1722 |        "      <th>2</th>\n",
1723 |        "      <td>NaN</td>\n",
1724 |        "      <td>5.0</td>\n",
1725 |        "    </tr>\n",
1726 |        "    <tr>\n",
1727 |        "      <th>3</th>\n",
1728 |        "      <td>6.0</td>\n",
1729 |        "      <td>NaN</td>\n",
1730 |        "    </tr>\n",
1731 |        "  </tbody>\n",
1732 |        "</table>\n",
1733 |        "</div>"
1734 |       ],
1735 |       "text/plain": [
1736 |        "C          F    M\n",
1737 |        "A     B          \n",
1738 |        "Day 1 1  NaN  1.0\n",
1739 |        "      2  2.0  NaN\n",
1740 |        "      3  NaN  3.0\n",
1741 |        "Day 2 1  4.0  NaN\n",
1742 |        "      2  NaN  5.0\n",
1743 |        "      3  6.0  NaN"
1744 |       ]
1745 |      },
1746 |      "execution_count": 42,
1747 |      "metadata": {},
1748 |      "output_type": "execute_result"
1749 |     }
1750 |    ],
1751 |    "source": [
1752 |     "# Multi-level indexing allows you to store data on multiple\n",
1753 |     "# dimensions\n",
1754 |     "days = ['Day 1', 'Day 1', 'Day 1', 'Day 2', 'Day 2', 'Day 2']\n",
1755 |     "meals = [1,2,3,1,2,3]\n",
1756 |     "# zip pairs the days and meals arrays \n",
1757 |     "# Then we create a list of those paired tuples\n",
1758 |     "hier_index = list(zip(days, meals))\n",
1759 |     "print(hier_index)\n",
1760 |     "# Converts the list of tuples into a MultiIndex (one level per tuple position)\n",
1761 |     "hier_index = pd.MultiIndex.from_tuples(hier_index)\n",
1762 |     "# Generate random array representing calories eaten per meal\n",
1763 |     "arr_5 = np.random.randint(500, 700, size=(6, 2))\n",
1764 |     "df_9 = pd.DataFrame(arr_5, hier_index, ['M', 'F'])\n",
1765 |     "print(df_9)\n",
1766 |     "\n",
1767 |     "# Grab the day 1 DF\n",
1768 |     "df_9.loc['Day 1']\n",
1769 |     "\n",
1770 |     "# Grab 1st row as a series\n",
1771 |     "df_9.loc['Day 1'].loc[1]\n",
1772 |     "\n",
1773 |     "# Grab calories eaten by the female on day 2 for the 2nd meal\n",
1774 |     "df_9.loc['Day 2'].loc[2]['F']\n",
1775 |     "\n",
1776 |     "# We can assign names to the Day and Meal index levels\n",
1777 |     "df_9.index.names = ['Day', 'Meal']\n",
1778 |     "df_9\n",
1779 |     "\n",
1780 |     "# Get a cross section\n",
1781 |     "# This gets me the Day 2 DF\n",
1782 |     "df_9.xs('Day 2')\n",
1783 |     "\n",
1784 |     "# Get calories for the 1st meal for both days by saying what\n",
1785 |     "# meal index you want and the name of the Meal index level\n",
1786 |     "df_9.xs(1, level='Meal')\n",
1787 |     "\n",
1788 |     "# Create a MultiIndex out of a DF using a pivot table\n",
1789 |     "dict_6 = {'A':['Day 1', 'Day 1', 'Day 1', 'Day 2', 'Day 2', 'Day 2'],\n",
1790 |     "         'B': [1,2,3,1,2,3],\n",
1791 |     "         'C': ['M', 'F', 'M', 'F', 'M', 'F'],\n",
1792 |     "         'D': [1,2,3,4,5,6]}\n",
1793 |     "df_14 = pd.DataFrame(dict_6)\n",
1794 |     "# Designate the D column as the data\n",
1795 |     "# Make A & B a multilevel index\n",
1796 |     "# Column names come from the values in column C\n",
1797 |     "# You will have NaNs where data was missing\n",
1798 |     "df_14.pivot_table(values='D', index=['A','B'], columns=['C'])"
1799 |    ]
1800 |   },
1801 |   {
1802 |    "cell_type": "markdown",
1803 |    "metadata": {},
1804 |    "source": [
1805 |     "### Handling Missing Data"
1806 |    ]
1807 |   },
1808 |   {
1809 |    "cell_type": "code",
1810 |    "execution_count": 43,
1811 |    "metadata": {},
1812 |    "outputs": [
1813 |     {
1814 |      "name": "stdout",
1815 |      "output_type": "stream",
1816 |      "text": [
1817 |       "     A    B    C\n",
1818 |       "0  1.0  4.0  7.0\n",
1819 |       "1  2.0  NaN  8.0\n",
1820 |       "2  NaN  NaN  9.0\n"
1821 |      ]
1822 |     },
1823 |     {
1824 |      "data": {
1825 |       "text/html": [
1826 |        "<div>\n",
1827 |        "<style scoped>\n",
1828 |        "    .dataframe tbody tr th:only-of-type {\n",
1829 |        "        vertical-align: middle;\n",
1830 |        "    }\n",
1831 |        "\n",
1832 |        "    .dataframe tbody tr th {\n",
1833 |        "        vertical-align: top;\n",
1834 |        "    }\n",
1835 |        "\n",
1836 |        "    .dataframe thead th {\n",
1837 |        "        text-align: right;\n",
1838 |        "    }\n",
1839 |        "</style>\n",
1840 |        "<table border=\"1\" class=\"dataframe\">\n",
1841 |        "  <thead>\n",
1842 |        "    <tr style=\"text-align: right;\">\n",
1843 |        "      <th></th>\n",
1844 |        "      <th>A</th>\n",
1845 |        "      <th>B</th>\n",
1846 |        "      <th>C</th>\n",
1847 |        "    </tr>\n",
1848 |        "  </thead>\n",
1849 |        "  <tbody>\n",
1850 |        "    <tr>\n",
1851 |        "      <th>0</th>\n",
1852 |        "      <td>1.0</td>\n",
1853 |        "      <td>4.0</td>\n",
1854 |        "      <td>7.0</td>\n",
1855 |        "    </tr>\n",
1856 |        "    <tr>\n",
1857 |        "      <th>1</th>\n",
1858 |        "      <td>2.0</td>\n",
1859 |        "      <td>NaN</td>\n",
1860 |        "      <td>8.0</td>\n",
1861 |        "    </tr>\n",
1862 |        "    <tr>\n",
1863 |        "      <th>2</th>\n",
1864 |        "      <td>NaN</td>\n",
1865 |        "      <td>NaN</td>\n",
1866 |        "      <td>9.0</td>\n",
1867 |        "    </tr>\n",
1868 |        "  </tbody>\n",
1869 |        "</table>\n",
1870 |        "</div>"
1871 |       ],
1872 |       "text/plain": [
1873 |        "     A    B    C\n",
1874 |        "0  1.0  4.0  7.0\n",
1875 |        "1  2.0  NaN  8.0\n",
1876 |        "2  NaN  NaN  9.0"
1877 |       ]
1878 |      },
1879 |      "execution_count": 43,
1880 |      "metadata": {},
1881 |      "output_type": "execute_result"
1882 |     }
1883 |    ],
1884 |    "source": [
1885 |     "dict_4 = {'A': [1,2,np.nan], 'B': [4, np.nan, np.nan], 'C': [7.,8.,9.]}\n",
1886 |     "df_10 = pd.DataFrame(dict_4)\n",
1887 |     "print(df_10)\n",
1888 |     "\n",
1889 |     "# Drop missing data from DF (Drops any row with missing values)\n",
1890 |     "df_10.dropna()\n",
1891 |     "\n",
1892 |     "# Drop all columns with any missing data\n",
1893 |     "df_10.dropna(axis=1)\n",
1894 |     "\n",
1895 |     "# Drop row unless it has at least 2 non-NaN values\n",
1896 |     "df_10.dropna(thresh=2)\n",
1897 |     "\n",
1898 |     "# Fill NaN values with 0\n",
1899 |     "df_10.fillna(value=0.0)\n",
1900 |     "\n",
1901 |     "# Fill A column with the mean of column\n",
1902 |     "df_10['A'].fillna(value=df_10['A'].mean())\n",
1903 |     "\n",
1904 |     "# Fill with previous value\n",
1905 |     "df_10.fillna(method='ffill')\n",
1906 |     "\n",
1907 |     "# Fill with next value (Only works if there is a next value)\n",
1908 |     "df_10.fillna(method='bfill')"
1909 |    ]
1910 |   },
1911 |   {
1912 |    "cell_type": "markdown",
1913 |    "metadata": {},
1914 |    "source": [
1915 |     "### Experimenting with Data"
1916 |    ]
1917 |   },
1918 |   {
1919 |    "cell_type": "code",
1920 |    "execution_count": 44,
1921 |    "metadata": {},
1922 |    "outputs": [
1923 |     {
1924 |      "name": "stdout",
1925 |      "output_type": "stream",
1926 |      "text": [
1927 |       "Index(['Sale ID', 'Contact', 'Sex', 'Age', 'State', 'Product ID',\n",
1928 |       "       'Product Type', 'Sale Price', 'Profit', 'Lead', 'Month', 'Year',\n",
1929 |       "       'First Name', 'Last Name', 'Age Group'],\n",
1930 |       "      dtype='object')\n"
1931 |      ]
1932 |     },
1933 |     {
1934 |      "data": {
1935 |       "text/plain": [
1936 |        "False    35\n",
1937 |        "True      4\n",
1938 |        "Name: Profit, dtype: int64"
1939 |       ]
1940 |      },
1941 |      "execution_count": 44,
1942 |      "metadata": {},
1943 |      "output_type": "execute_result"
1944 |     }
1945 |    ],
1946 |    "source": [
1947 |     "cs_df.head() # Get 1st 5\n",
1948 |     "print(cs_df.columns) # Get column names\n",
1949 |     "cs_df['Profit'].mean() # Average profit per item\n",
1950 |     "# Column-wise max of Product ID and Profit (each computed independently)\n",
1951 |     "cs_df[['Product ID', 'Profit']].max(axis=0).head()\n",
1952 |     "# Number of people who purchased from WV\n",
1953 |     "cs_df[cs_df['State']=='WV']['State'].count()\n",
1954 |     "# Number of purchases in 2019\n",
1955 |     "len(cs_df[cs_df['Year']==2019].index)\n",
1956 |     "# Get number of sales for each product ID\n",
1957 |     "cs_df['Product ID'].value_counts()\n",
1958 |     "# Get list of customers that bought a specific product\n",
1959 |     "cs_df[cs_df['Product ID']=='M01-F0024']['Contact']\n",
1960 |     "# How many made a website purchase for a profit over $150\n",
1961 |     "cs_df[(cs_df['Lead']=='Website') & (cs_df['Profit']>150)]['Lead'].count()\n",
1962 |     "# Find out how many product profit amounts include .89 in cents\n",
1963 |     "cs_df['Profit'].apply(lambda cents: str(cents).split('.')[1]=='89').value_counts()\n"
1964 |    ]
1965 |   },
1966 |   {
1967 |    "cell_type": "markdown",
1968 |    "metadata": {},
1969 |    "source": [
1970 |     "### Visualization"
1971 |    ]
1972 |   },
1973 |   {
1974 |    "cell_type": "code",
1975 |    "execution_count": 99,
1976 |    "metadata": {},
1977 |    "outputs": [
1978 |     {
1979 |      "data": {
1980 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAVMklEQVR4nO3df7BX9X3n8ec7IF4RDVHRpVwsuIMm6FwRiT83RkKdUqvCZpPKTmyA2LpO2CZxTVpJOqtjxsgkJuaHTSkmKka3VtREs25jLIqdZDZYSBgikIgpKdxIlZISRVER3vvH93D2Ihf4Er7ne7j3Ph8zzvecz/f8eJ+R+b7u+ZxzPicyE0mSAN5WdwGSpEOHoSBJKhkKkqSSoSBJKhkKkqTS4LoLOBjHHXdcjhkzpu4yJKlPWb58+b9l5ojevuvToTBmzBiWLVtWdxmS1KdExL/s7Tu7jyRJJUNBklQyFCRJpT59TUGSmrV9+3a6u7t57bXX6i6lbTo6Oujs7OSwww5reh1DQdKA0N3dzVFHHcWYMWOIiLrLqVxmsnnzZrq7uxk7dmzT69l9JGlAeO211zj22GMHRCAARATHHnvsAZ8ZGQqSBoyBEgi7/DbHayhIkkpeU5A0IN36+LMt3d41F52832WGDRvG1q1by/m77rqLZcuWcdtttzF//nyGDh3Khz/84V7XXbJkCUOGDOG8885rWc29MRQ04LT6x2BvmvmRkHa5+uqr9/n9kiVLGDZsmKGgQ0c7fkz9IdVAdcMNNzBs2DA++clP8tWvfpX58+czePBgxo8fz7x585g/fz6DBg3innvu4Wtf+xrvec97KqljQIeCP3KS2mnbtm1MmDChnP/1r3/NZZddtsdy8+bNY926dRx++OFs2bKF4cOHc/XVV5ehUaUBHQqS1E5HHHEEK1asKOd3XVN4q66uLj70oQ8xffp0pk+f3s4SvftIkg41jz76KHPmzGH58uWceeaZvPnmm23bt6EgSYeQnTt3smHDBiZPnsznP/95tmzZwtatWznqqKN4+eWXK9+/3UeSBqRD9Xrfjh07uOKKK/jNb35DZnLNNdcwfPhwLr30Uj7wgQ/w8MMPe6FZA0e7bheV6tDzGQWAWbNmMWvWLKBx99EuP/jBD/ZY9+STT2blypVVlgfYfSRJ6sFQkCSV7D6qmM9CSOpLKg2FiLgG+BMggZ8Cs4GhwN8BY4BfAn+Umf9eLD8XuBLYAXwsMx+rsj6pUk/eXP0+Js+tfh8aUCrrPoqIUcDHgEmZeRowCJgBXAcszsxxwOJinogYX3x/KjAV+HpEDKqqPknSnqq+pjAYOCIiBtM4Q3gemAYsLL5fCOx6XG8acF9mvp6Z64DngLMqrk+S1ENl3UeZ+auIuAVYD2wDvp+Z34+IEzJzY7HMxog4vlhlFPCjHpvoLtokqfVa3b3XZFded3c3c+bMYfXq1ezcuZNLLrmEL3zhCwwZMmS35S688EJuueUWJk2atFt7z+G2q1Bl99E7aPz1Pxb4HeDIiLhiX6v00pa9bPeqiFgWEcs2bdrUmmIlqQ0yk/e///1Mnz6dtWvX8uyzz7J161Y+85nP1F1aqcruo98D1mXmpszcDjwEnAe8EBEjAYrPF4vlu4HRPdbvpNHdtJvMXJCZkzJz0ogRIyosX5Ja64knnqCjo4PZs2cDMGjQIG699VbuuOMOXnnlFWbMmEFXVxeXX34527ZtK9e78847Ofnkk3nve9/LD3/4w7J90aJFnHbaaZx++ulccMEFLamxyruP1gPnRMRQGt1HU4BlwCvATGBe8flwsfwjwP+KiC/ROLMYBzxdYX2S1FarVq3izDPP3K3t6KOP5sQTT+SLX/wiQ4cOZeXKlaxcuZKJEycCsHHjRq6//nqWL1/O29/+diZPnswZZ5wBwI033shjjz3GqFGj2LJlS0tqrOxMITOXAg8AP6ZxO
+rbgAU0wuCiiFgLXFTMk5mrgPuB1cD3gDmZuaOq+iSp3TKTiD17yjOTp556iiuuaPSwd3V10dXVBcDSpUu58MILGTFiBEOGDOHyyy8v1zv//POZNWsWt99+Ozt2tObnstLnFDLzeuD6tzS/TuOsobflbwJuqrKm/sjxgqS+4dRTT+XBBx/cre2ll15iw4YNHH/88b0GBrDX9vnz57N06VIeffRRJkyYwIoVKzj22GMPqkaHuZCkNpkyZQqvvvoqd999N9AYEfXaa69l1qxZTJ06lXvvvReAZ555phz87uyzz2bJkiVs3ryZ7du3s2jRonJ7v/jFLzj77LO58cYbOe6449iwYcNB1+gwF5IGphqeBo8Ivv3tb/PRj36Uz372s+zcuZOLL76Yz33uc+zcuZPZs2fT1dXFhAkTOOusxmNaI0eO5IYbbuDcc89l5MiRTJw4sewq+tSnPsXatWvJTKZMmcLpp59+0DUaCpLURqNHj+a73/1ur9/dd999vbbPnj27vGOpp4ceeqiltYHdR5KkHgwFSVLJUJA0YGTuMUhCv/bbHK+hIGlA6OjoYPPmzQMmGDKTzZs309HRcUDreaFZ0oDQ2dlJd3c3A2nMtI6ODjo7Ow9oHUNB0oBw2GGHMXbs2LrLOOTZfSRJKhkKkqSSoSBJKhkKkqSSoSBJKhkKkqSSoSBJKhkKkqSSoSBJKvlEswacc9YvaM+OTjq41yJKdfBMQZJUMhQkSSVDQZJUMhQkSSVDQZJUMhQkSSVDQZJUMhQkSSVDQZJU8olmqS978ubq9zF5bvX70CHDMwVJUslQkCSVDAVJUslQkCSVDAVJUslQkCSVDAVJUslQkCSVDAVJUslQkCSVKg2FiBgeEQ9ExM8iYk1EnBsRx0TE4xGxtvh8R4/l50bEcxHx84j4/SprkyTtqeozha8A38vMdwKnA2uA64DFmTkOWFzMExHjgRnAqcBU4OsRMaji+iRJPVQWChFxNHAB8E2AzHwjM7cA04CFxWILgenF9DTgvsx8PTPXAc8BZ1VVnyRpT1WeKZwEbALujIifRMQ3IuJI4ITM3AhQfB5fLD8K2NBj/e6iTZLUJlWGwmBgIvDXmXkG8ApFV9FeRC9tucdCEVdFxLKIWLZp06bWVCpJAqoNhW6gOzOXFvMP0AiJFyJiJEDx+WKP5Uf3WL8TeP6tG83MBZk5KTMnjRgxorLiJWkgqiwUMvNfgQ0RcUrRNAVYDTwCzCzaZgIPF9OPADMi4vCIGAuMA56uqj5J0p6qfvPanwH3RsQQ4J+B2TSC6P6IuBJYD3wQIDNXRcT9NILjTWBOZu6ouD5JUg+VhkJmrgAm9fLVlL0sfxNwU5U1SZL2zieaJUklQ0GSVDIUJEklQ0GSVDIUJEklQ0GSVKr6OQVJfd2TN7dnP5Pntmc/2qemzhQi4rSqC5Ek1a/Z7qP5EfF0RHw0IoZXWpEkqTZNdR9l5n+KiHHAR4BlEfE0cGdmPl5pdRpwzlm/oO4SpAGt6QvNmbkW+EvgL4D3Al8tXrP5/qqKkyS1V7PXFLoi4lYar9N8H3BpZr6rmL61wvokSW3U7N1HtwG3A5/OzG27GjPz+Yj4y0oqkyS1XbOhcDGwbddQ1hHxNqAjM1/NzG9VVp0kqa2avabwD8ARPeaHFm2SpH6k2VDoyMytu2aK6aHVlCRJqkuzofBKREzcNRMRZwLb9rG8JKkPavaawieARRHxfDE/Eri8mpIkSXVp9uG1f4qIdwKnAAH8LDO3V1qZJKntDmRAvHcDY4p1zogIMvPuSqqSJNWiqVCIiG8B/xFYAewomhMwFCSpH2n2TGESMD4zs8piJEn1avbuo2eA/1BlIZKk+jV7pnAcsLoYHfX1XY2ZeVklVUmSatFsKNxQZRGSpENDs7ekPhURvwuMy8x/iIihwKBqS5MktVuzQ2f/KfAA8DdF0yjgO1UVJUmqR7MXmucA5wMvQfnCneOrKkqSVI9mQ+H1zHxj10xED
KbxnIIkqR9pNhSeiohPA0dExEXAIuC71ZUlSapDs6FwHbAJ+Cnw34D/Q+N9zZKkfqTZu4920ngd5+3VliNJqlOzYx+to5drCJl5UssrkiTV5kDGPtqlA/ggcEzry5E0YD15c/X7mDy3+n30cU1dU8jMzT3++1Vmfhl4X8W1SZLarNnuo4k9Zt9G48zhqEoqkiTVptnuoy/2mH4T+CXwRy2vRpJUq2bvPppcdSE69J2zfkHdJUiqWLPdR/9jX99n5pdaU44kqU4HcvfRu4FHivlLgX8ENlRRlCSpHgfykp2JmfkyQETcACzKzD/Z34oRMQhYBvwqMy+JiGOAvwPGUFybyMx/L5adC1xJ4z3QH8vMxw7oaCRJB6XZYS5OBN7oMf8GjR/1ZnwcWNNj/jpgcWaOAxYX80TEeGAGcCowFfh6ESiSpDZpNhS+BTwdETdExPXAUuDu/a0UEZ3AHwLf6NE8DVhYTC8Epvdovy8zX8/MdcBzwFlN1idJaoFm7z66KSL+HnhP0TQ7M3/SxKpfBv6c3Z9pOCEzNxbb3RgRu97LMAr4UY/luou23UTEVcBVACeeeGIz5UuSmtTsmQLAUOClzPwK0B0RY/e1cERcAryYmcub3H700tbbeEsLMnNSZk4aMWJEk5uWJDWj2VtSr6dxB9IpwJ3AYcA9NN7GtjfnA5dFxMU0xks6OiLuAV6IiJHFWcJI4MVi+W5gdI/1O4HnD+RgJEkHp9kzhf8MXAa8ApCZz7OfYS4yc25mdmbmGBoXkJ/IzCto3NY6s1hsJvBwMf0IMCMiDi/OQsYBTx/AsUiSDlKzt6S+kZkZEQkQEUcexD7nAfdHxJXAehojrpKZqyLifmA1jaE05mTmjoPYjyTpADUbCvdHxN8AwyPiT4GPcAAv3MnMJcCSYnozMGUvy90E3NTsdiVJrbXfUIiIoPGw2TuBl2hcV/ifmfl4xbVJktpsv6FQdBt9JzPPBAwCSerHmr3Q/KOIeHellUiSatfsNYXJwNUR8UsadyAFjZOIrqoKkyS13z5DISJOzMz1wB+0qR5JUo32d6bwHRqjo/5LRDyYmf+lHUVJkuqxv2sKPYeeOKnKQiRJ9dtfKORepiVJ/dD+uo9Oj4iXaJwxHFFMw/+/0Hx0pdVJktpqn6GQmb7kRpIGkAMZOluS1M8ZCpKkkqEgSSoZCpKkkqEgSSoZCpKkkqEgSSoZCpKkkqEgSSoZCpKkUrMv2ZGkvu/Jm9uzn8lz27OfChgK/cA56xfUXYKkfsLuI0lSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSyVCQJJUMBUlSqbJQiIjREfFkRKyJiFUR8fGi/ZiIeDwi1haf7+ixztyIeC4ifh4Rv19VbZKk3lV5pvAmcG1mvgs4B5gTEeOB64DFmTkOWFzMU3w3AzgVmAp8PSIGVVifJOktKguFzNyYmT8upl8G1gCjgGnAwmKxhcD0YnoacF9mvp6Z64DngLOqqk+StKe2XFOIiDHAGcBS4ITM3AiN4ACOLxYbBWzosVp30fbWbV0VEcsiYtmmTZuqLFuSBpzKQyEihgEPAp/IzJf2tWgvbblHQ+aCzJyUmZNGjBjRqjIlSVQcChFxGI1AuDczHyqaX4iIkcX3I4EXi/ZuYHSP1TuB56usT5K0uyrvPgrgm8CazPxSj68eAWYW0zOBh3u0z4iIwyNiLDAOeLqq+iRJexpc4bbPB/4Y+GlErCjaPg3MA+6PiCuB9cAHATJzVUTcD6ymcefSnMzcUWF9kqS3qCwUMvMH9H6dAGDKXta5CbipqpokSfvmE82SpJKhIEkqGQqSpJKhIEkqGQqSpJKhIEkqVfmcgoBz1i+ouwRJapqhIEmt9uTN1e9j8txKNmv3kSSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZ
ChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpZChIkkqGgiSpdMiFQkRMjYifR8RzEXFd3fVI0kBySIVCRAwC/gr4A2A88F8jYny9VUnSwHFIhQJwFvBcZv5zZr4B3AdMq7kmSRowBtddwFuMAjb0mO8Gzu65QERcBVxVzG6NiJ+3qbaDdRzwb3UXUaH+fHweW9/Vj4/v0wdzbL+7ty8OtVCIXtpyt5nMBcCC9pTTOhGxLDMn1V1HVfrz8XlsfVd/Pr6qju1Q6z7qBkb3mO8Enq+pFkkacA61UPgnYFxEjI2IIcAM4JGaa5KkAeOQ6j7KzDcj4r8DjwGDgDsyc1XNZbVKn+vyOkD9+fg8tr6rPx9fJccWmbn/pSRJA8Kh1n0kSaqRoSBJKhkKFYuI0RHxZESsiYhVEfHxumtqtYgYFBE/iYj/XXctrRQRwyPigYj4WfH/79y6a2qliLim+Df5TET8bUR01F3Tbysi7oiIFyPimR5tx0TE4xGxtvh8R501Hoy9HN8Xin+bKyPi2xExvBX7MhSq9yZwbWa+CzgHmNMPh+74OLCm7iIq8BXge5n5TuB0+tExRsQo4GPApMw8jcaNHTPqreqg3AVMfUvbdcDizBwHLC7m+6q72PP4HgdOy8wu4Flgbit2ZChULDM3ZuaPi+mXafywjKq3qtaJiE7gD4Fv1F1LK0XE0cAFwDcBMvONzNxSb1UtNxg4IiIGA0Ppw88EZeY/Ar9+S/M0YGExvRCY3taiWqi348vM72fmm8Xsj2g813XQDIU2iogxwBnA0noraakvA38O7Ky7kBY7CdgE3Fl0jX0jIo6su6hWycxfAbcA64GNwG8y8/v1VtVyJ2TmRmj8cQYcX3M9VfoI8Pet2JCh0CYRMQx4EPhEZr5Udz2tEBGXAC9m5vK6a6nAYGAi8NeZeQbwCn27+2E3Rf/6NGAs8DvAkRFxRb1V6bcREZ+h0U19byu2Zyi0QUQcRiMQ7s3Mh+qup4XOBy6LiF/SGNH2fRFxT70ltUw30J2Zu87qHqAREv3F7wHrMnNTZm4HHgLOq7mmVnshIkYCFJ8v1lxPy0XETOAS4EPZoofODIWKRUTQ6Jdek5lfqrueVsrMuZnZmZljaFykfCIz+8Vfm5n5r8CGiDilaJoCrK6xpFZbD5wTEUOLf6NT6EcX0guPADOL6ZnAwzXW0nIRMRX4C+CyzHy1Vds1FKp3PvDHNP6KXlH8d3HdRakpfwbcGxErgQnA52qup2WKM6AHgB8DP6XxW9Bnh4SIiL8F/i9wSkR0R8SVwDzgoohYC1xUzPdJezm+24CjgMeL35X5LdmXw1xIknbxTEGSVDIUJEklQ0GSVDIUJEklQ0GSVDIUJEklQ0GSVPp/1qHH61K7WgoAAAAASUVORK5CYII=\n",
1981 |       "text/plain": [
1982 |        "<Figure size 432x288 with 1 Axes>"
1983 |       ]
1984 |      },
1985 |      "metadata": {
1986 |       "needs_background": "light"
1987 |      },
1988 |      "output_type": "display_data"
1989 |     }
1990 |    ],
1991 |    "source": [
1992 |     "# Library used to create advanced static, animated and\n",
1993 |     "# interactive visualizations\n",
1994 |     "import matplotlib.pyplot as plt\n",
1995 |     "\n",
1996 |     "# Displays matplotlib plots in the Notebook\n",
1997 |     "%matplotlib inline\n",
1998 |     "\n",
1999 |     "# Histograms provide an approximation of the distribution of\n",
2000 |     "# results. You create them by dividing the range of values into \n",
2001 |     "# bins or buckets. Then you count how many of the results fall\n",
2002 |     "# into each bin.\n",
2003 |     "# Rolls 2 dice 5000 times and charts the frequency and \n",
2004 |     "# a histogram\n",
2005 |     "\n",
2006 |     "# A single die ('Hist') comes out close to uniform over many rolls,\n",
2007 |     "# while the sum of 2 dice ('Odds') peaks at 7 and falls off toward\n",
2008 |     "# 2 and 12 (1 way to roll a 2 / 6 ways to roll a 7).\n",
2009 |     "df_dice = pd.DataFrame(\n",
2010 |     "    np.random.randint(1,7,5000),\n",
2011 |     "    columns = ['Hist'])\n",
2012 |     "df_dice['Odds'] = df_dice['Hist'] + np.random.randint(1,7,5000)\n",
2013 |     "# Alpha decreases the opacity in the chart\n",
2014 |     "ax = df_dice.plot.hist(bins=12, alpha=0.5)\n",
2015 |     "\n",
2016 |     "# Basic plot using 1000 random values that create cumulative sums\n",
2017 |     "# over an increasing date range\n",
2018 |     "ser_5 = pd.Series(np.random.randn(1000),\n",
2019 |     "                 index=pd.date_range('11/15/2017', periods=1000))\n",
2020 |     "ser_5 = ser_5.cumsum()\n",
2021 |     "# ser_5.plot()\n",
2022 |     "\n",
2023 |     "# Display 3 random plots\n",
2024 |     "df_15 = pd.DataFrame(np.random.randn(1000, 3),\n",
2025 |     "                    index=pd.date_range('11/15/2017', periods=1000),\n",
2026 |     "                    columns=list('ABC'))\n",
2027 |     "df_15 = df_15.cumsum()\n",
2028 |     "# df_15.plot()\n",
2029 |     "\n",
2030 |     "# Make bar chart from 5 random values\n",
2031 |     "# pd.DataFrame(np.random.randn(5)).plot.bar()\n",
2032 |     "\n",
2033 |     "# Make MultiBar Charts\n",
2034 |     "vals = ['A', 'B', 'C', 'D']\n",
2035 |     "df_15 = pd.DataFrame(np.random.rand(10,4), columns=vals)\n",
2036 |     "# df_15.plot.bar()\n",
2037 |     "\n",
2038 |     "# Area plot \n",
2039 |     "# Define x range and y values\n",
2040 |     "x_rng = range(1,15)\n",
2041 |     "y_vals = [1,5,4,7,6,9,5,7,10,14,10,12,9,8]\n",
2042 |     "# Change fill color and opacity\n",
2043 |     "# plt.fill_between(x_rng, y_vals, color=\"skyblue\", alpha=0.5)\n",
2044 |     "# plt.show()\n",
2045 |     "\n",
2046 |     "# Area plot with multiple areas\n",
2047 |     "# pd.DataFrame(np.random.rand(10,3), columns=['A','B','C']).plot.area()\n",
2048 |     "\n",
2049 |     "# Create a scatterplot with 100 random values\n",
2050 |     "# pd.DataFrame(np.random.rand(100,2), \n",
2051 |     "#              columns=['A','B']).plot.scatter(x='A', y='B')\n",
2052 |     "\n",
2053 |     "# Multiple column scatter plots\n",
2054 |     "df_15 = pd.DataFrame(np.random.rand(50,4), columns=['A','B','C','D'])\n",
2055 |     "# ax = df_15.plot.scatter(x='A', y='B', color='DarkBlue', label='Grp 1')\n",
2056 |     "# df_15.plot.scatter(x='C', y='D', color='Orange', label='Grp 2', ax=ax)\n",
2057 |     "\n",
2058 |     "# Pie Charts with 4 random values\n",
2059 |     "# pd.Series(np.random.rand(4),\n",
2060 |     "#          index=['a','b','c','d'], \n",
2061 |     "#           name='Pie').plot.pie(figsize=(6,6))"
2062 |    ]
2063 |   }
2064 |  ],
2065 |  "metadata": {
2066 |   "kernelspec": {
2067 |    "display_name": "Python 3",
2068 |    "language": "python",
2069 |    "name": "python3"
2070 |   },
2071 |   "language_info": {
2072 |    "codemirror_mode": {
2073 |     "name": "ipython",
2074 |     "version": 3
2075 |    },
2076 |    "file_extension": ".py",
2077 |    "mimetype": "text/x-python",
2078 |    "name": "python",
2079 |    "nbconvert_exporter": "python",
2080 |    "pygments_lexer": "ipython3",
2081 |    "version": "3.7.7"
2082 |   }
2083 |  },
2084 |  "nbformat": 4,
2085 |  "nbformat_minor": 4
2086 | }
2087 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pandas-tutorial
2 | This is the cheat sheet Jupyter Notebook I made for my Pandas Learn in One Video Tutorial. I basically condensed the Pandas API down into this one cheat sheet with hundreds of examples. I hope you find it useful.
3 | 


--------------------------------------------------------------------------------
/icecreamsales.csv:
--------------------------------------------------------------------------------
 1 | Temperature,Sales
 2 | 37,292
 3 | 40,228
 4 | 49,324
 5 | 61,376
 6 | 72,440
 7 | 79,496
 8 | 83,536
 9 | 81,556
10 | 75,496
11 | 64,412
12 | 53,324
13 | 40,320


--------------------------------------------------------------------------------