├── .gitattributes
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── course
│   ├── 1 - Pandas & Datasets.ipynb
│   ├── 2 - Cleaning Data with Python & Pandas.ipynb
│   ├── 3 - Basic Analysis in Pandas DataFrames.ipynb
│   ├── 4 - Cleaning Real Data.ipynb
│   ├── 5 - Merge Datasets.ipynb
│   ├── 6 - NBA Player Details.ipynb
│   ├── appendix
│   │   ├── Appendix A - Scrape & Build NBA Salary Dataset.ipynb
│   │   ├── Appendix B - Inflation Rate Dataset.ipynb
│   │   ├── Appendix C - The NBA API Experiments.ipynb
│   │   ├── Appendix D - NBA Player Detail.ipynb
│   │   └── appendix-b-united-states-inflation-rate.csv
│   ├── datasets
│   │   ├── inflation-rate.csv
│   │   └── nba-historical-salaries.csv
│   ├── example.csv
│   ├── samples
│   │   ├── 1.csv
│   │   ├── 2.csv
│   │   ├── 4-adj-salaries-cumlative-per-year.csv
│   │   ├── 4-player-salaries-cleaned.csv
│   │   ├── 4-player-salaries-per-year.csv
│   │   ├── 5-player-adj-salaries-audit.csv
│   │   └── players
│   │       ├── Caleb Swanigan
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Charlie Ward
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Chris Mihm
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Chris Wilcox
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Darvin Ham
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Devin Harris
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Eric Gordon
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Gary Trent Jr
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Gerald Wilkins
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Jahidi White
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Jason Smith
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Jermaine O'Neal
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Ken Norman
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Kevin Garnett
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Ledell Eackles
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Luke Harangody
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Michael Beasley
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Michael Jordan
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Shaquille O'Neal
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Steve Scheffler
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Toby Bailey
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Tony Farmer
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       └── Tristan Thompson
│   │           ├── salary_over_time.png
│   │           └── stats.xlsx
│   └── utils.py
├── nbs_ref
│   ├── 1 - DataFrame.ipynb
│   ├── 2 - Import & Export.ipynb
│   ├── 3 - Rename Columns.ipynb
│   ├── 4 - Clean Rows.ipynb
│   ├── 5 - Basic Analysis.ipynb
│   ├── 6 - Grouping & Plots.ipynb
│   ├── 7 - Clean Real Data.ipynb
│   ├── 8 - Merge Datasets.ipynb
│   ├── 9 - Using an NBA Stats API.ipynb
│   ├── example.csv
│   ├── temp.csv
│   └── utils.py
├── requirements.txt
└── start-here.ipynb

/.gitattributes:
--------------------------------------------------------------------------------
docs/** filter= diff=
bin/**
include/**
lib/**

*.ipynb filter=nbstripout
*.zpln filter=nbstripout
*.ipynb diff=ipynb
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
bin/
etc/
share/
pyvenv.cfg
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
  - repo: https://github.com/kynan/nbstripout
    rev: 0.5.0
    hooks:
      - id: nbstripout
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Coding For Entrepreneurs

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Try Pandas Logo](https://static.codingforentrepreneurs.com/media/projects/try-pandas/images/share/Try_Pandas_Share.png)](https://www.codingforentrepreneurs.com/projects/try-pandas)

# Try Pandas
[Pandas](https://pandas.pydata.org/) is a great tool for doing analysis on spreadsheets.

It's easy to say that, but let's actually learn why by doing something real.

We're going to be analyzing [NBA](https://www.nba.com/stats/) data to help understand why Pandas should be a tool in your data science toolkit.

But more importantly, doing something practical will help you better understand the need for a tool like Pandas.

To help us work with Pandas in a practical way, we've teamed up with [Deepnote](https://deepnote.com/referral?token=cfe). [Deepnote](https://deepnote.com/referral?token=cfe) is a service that makes it easy to run interactive notebooks (also known as Jupyter Notebooks). These notebooks allow us to run Python & Pandas in a highly visual and highly interactive manner.

What's better, notebooks, especially on [Deepnote](https://deepnote.com/referral?token=cfe), allow non-technical team members to participate in a code-heavy document (as we'll see).

To get started, sign up for Deepnote using this [link](https://deepnote.com/referral?token=cfe) (This link will unlock pro features).

Once you [sign up](https://deepnote.com/referral?token=cfe), you can automagically copy all the code in this repo with the following button:

[](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas)
--------------------------------------------------------------------------------
/course/1 - Pandas & Datasets.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": "00000-15d3f3d1-7d9d-48b0-9022-0136024f5fa7",
    "deepnote_cell_type": "code",
    "tags": []
   },
   "source": [
    "# 1 - Pandas & Datasets\n",
    "\n",
    "Pandas helps us manage datasets, very often using flat files (e.g. `csv`, `xlsx`, `tsv`, etc.). In this one, we're going to create our first dataset with random data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Random Data\n",
    "Below is a simple Python snippet to generate random data with no external dependencies. 
" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import random" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "items = []\n", 40 | "\n", 41 | "random_number = random.randint(0, 50_000)\n", 42 | "\n", 43 | "def float_to_dollars(value):\n", 44 | " # in the future, this will be stored in\n", 45 | " # utils.py in the courses/ directory\n", 46 | " return f\"${value:,.2f}\" \n", 47 | "\n", 48 | "\n", 49 | "for x in range(0, random_number):\n", 50 | " dollars = random.randint(30_000, 50_000_000)\n", 51 | " data = {\n", 52 | " \"Player Name\": f\"Player-{x}\",\n", 53 | " \"Player Salary\": float_to_dollars(dollars)\n", 54 | " }\n", 55 | " items.append(data)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "There's a few questions I want to ask about this data:\n", 63 | "- How do we save this data? How do we load saved data?\n", 64 | "- How do we clean this data?\n", 65 | "- How do we analyze this data?\n", 66 | "\n", 67 | "The answer, of course, is Pandas. So let's see why." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Initialize a DataFrame\n", 75 | "A table of data in Pandas is called a DataFrame. At it's core, a Dataframe is just rows and columns. There are many ways to initialize it. Let's use the data from above to start our first one:" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import pandas as pd\n", 85 | "\n", 86 | "df = pd.DataFrame(items)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Pandas uses a common [numpy](https://numpy.org/) convention when importing:\n", 94 | "```python\n", 95 | "import pandas as pd\n", 96 | "```\n", 97 | "So in Python projects that use Pandas, you will typically see this import somewhere. You usually won't do `import pandas` or `from pandas import DataFrame`. As with most things in software, there's nothing technically stopping you from doing that; it's just not the common practice.\n", 98 | "\n", 99 | "The variable `df` is very often used for instances of `DataFrame`.\n", 100 | "\n", 101 | "Since a `DataFrame` is a table with columns and rows, you can easily initialize it with a list of data. \n", 102 | "\n", 103 | "Let's take a look at this data:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "df.head()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Tables in Pandas can be massive so we use `df.head()` to get a glimpse of the first 5 rows. Use `df.head(n=20)` to change this value. You can also use `df.tail(n=5)` to see the end of this table." 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Exporting a DataFrame (Writing)\n", 127 | "There are many ways to save DataFrames. 
You can save to:\n",
    "\n",
    "- CSV (Comma Separated Values)\n",
    "- TSV (Tab Separated Values)\n",
    "- Excel (`xlsx`)\n",
    "- JSON (JavaScript Object Notation)\n",
    "- HDF (HDF5 files)\n",
    "- HTML (reading/writing HTML `<table>` elements)\n",
    "- Pickle\n",
    "- SQL\n",
    "- And much [more](https://pandas.pydata.org/docs/reference/io.html)\n",
    "\n",
    "Throughout this course we'll use a mixture of storage options, but mostly `csv` files, as they are lightweight and easy to use in many situations. \n",
    "\n",
    "So how do we save this?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"example.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are a few other ways to export:\n",
    "\n",
    "\n",
    "```python\n",
    "#TSV\n",
    "df.to_csv('example.tsv', sep='\\t', index=False)\n",
    "\n",
    "#Excel\n",
    "df.to_excel('example.xlsx', sheet_name='example', index=False)\n",
    "\n",
    "#JSON\n",
    "df.to_json('example.json', orient='records')\n",
    "\n",
    "#HDF\n",
    "df.to_hdf('example.h5', key='example', index=False)\n",
    "\n",
    "#HTML\n",
    "df.to_html('example.html', index=False)\n",
    "\n",
    "#Pickle\n",
    "df.to_pickle('example.pkl')\n",
    "\n",
    "\n",
    "#SQL\n",
    "from sqlalchemy import create_engine\n",
    "engine = create_engine('sqlite://', echo=False)\n",
    "df.to_sql('example_table', con=engine, index=True)\n",
    "```\n",
    "\n",
    "Now that we have saved our `example.csv` file, how do we load it in? That's just as simple: there's usually a matching `read_*` function directly in Pandas.\n",
    "\n",
    "> A quick note. There are many reasons these different file types exist. One of them, especially in dealing with `csv` files, has to do with data types. More on storing data types later."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Importing Data (Reading)\n",
    "\n",
    "Importing data is just as easy as exporting it, but instead of methods on the `DataFrame`, we use built-in `pd.read_*` functions. 
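\n",
    "\n",
    "Before the examples, here's the data-type note above in action -- a small sketch (it assumes the `example.csv` we just wrote, and `round_trip` is just an illustrative name):\n",
    "\n",
    "```python\n",
    "round_trip = pd.read_csv('example.csv')\n",
    "round_trip['Player Salary'].dtype  # object -- the formatted dollar strings come back as strings\n",
    "```\n",
    "\n",
    "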
First, here are examples:\n",
    "\n",
    "```python\n",
    "#CSV\n",
    "df = pd.read_csv('example.csv')\n",
    "\n",
    "#TSV\n",
    "df = pd.read_csv('example.tsv', sep='\\t')\n",
    "\n",
    "#Excel\n",
    "df = pd.read_excel('example.xlsx', sheet_name='example')\n",
    "\n",
    "#JSON\n",
    "df = pd.read_json('example.json')\n",
    "\n",
    "#HDF\n",
    "df = pd.read_hdf('example.h5', key='example')\n",
    "\n",
    "#HTML (read_html returns a list of DataFrames, one per table)\n",
    "df = pd.read_html('example.html')[0]\n",
    "\n",
    "#Pickle\n",
    "df = pd.read_pickle('example.pkl')\n",
    "\n",
    "#SQL\n",
    "from sqlalchemy import create_engine\n",
    "engine = create_engine('sqlite://')\n",
    "df = pd.read_sql('SELECT * from example_table', con=engine)\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = pd.read_csv('example.csv')\n",
    "new_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that we can export and import data, how do we clean it up? "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export to samples dir\n",
    "# df.to_csv(\"samples/1.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "deepnote": {
   "is_reactive": false
  },
  "deepnote_execution_queue": [],
  "deepnote_notebook_id": "e609e7b1-ff5b-43c7-8bff-b115fd3b7749",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
/course/2 - Cleaning Data with Python & Pandas.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": "00000-15d3f3d1-7d9d-48b0-9022-0136024f5fa7",
    "deepnote_cell_type": "code",
    "tags": []
   },
   "source": [
    "# 2 - Cleaning Data with Python & Pandas\n",
    "\n",
    "\n",
    "### Cleaning Data\n",
    "\n",
    "It's true that we made this data, but let's look at it as if we didn't. \n",
    "\n",
    "The `Player Salary` column has valid US Dollar values, but there's a key issue with them: they're strings (`str`). In this section, we'll convert this data into a `float` data type. \n",
    "\n",
    "The next issue is the column names. `Player Name` and `Player Salary` work, but I would prefer to name them a bit more pythonically, like `name` and `salary` respectively. 
\n",
    "\n",
    "Let's start by importing our sample data from `1 - Pandas & Datasets`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import random\n",
    "\n",
    "# utils.py was created by us\n",
    "import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read sample data\n",
    "df = pd.read_csv(\"samples/1.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Are you missing the sample data? Be sure to [launch this code on Deepnote](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's __change the column names__:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "column_name_mapping = {\n",
    "    \"Player Name\": \"name\",\n",
    "    \"Player Salary\": \"salary\"\n",
    "}\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we're using the first DataFrame from the top `df`.\n",
    "renamed_df = df.rename(columns=column_name_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "renamed_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The mapping is pretty simple: each `key` is a current column name and each `value` is the new name you want for it.\n",
    "\n",
    "Going forward we'll use the convention `df` instead of `renamed_df`, so let's make a copy:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = renamed_df.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's convert a Dollar `string` into a `float`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "salary_example = \"$30,707,056.00\"\n",
    "salary_replacements = salary_example.replace(\"$\", \"\").replace(\",\", \"_\")\n",
    "salary_replacements"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As you see, I replaced commas `,` with underscores `_`. As you may know, you can write large values in Python using underscores to make them more human readable, just like `100000000000` becomes `100_000_000_000`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "salary_example_as_float = float(salary_replacements)\n",
    "salary_example_as_float"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that we have a `float` value, we can do further analysis. \n",
    "\n",
    "But this is just one hard-coded value. How do we do this in our `DataFrame`? There are actually a few ways to do this. 
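One is a vectorized one-liner using pandas string methods -- a rough sketch only (same replacements as above, no loop required):\n",
    "\n",
    "```python\n",
    "# sketch: .str runs the replacements across the entire column at once\n",
    "df['salary'].str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(float)\n",
    "```\n",
    "\n",
    "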
We'll do it by adding a column to our dataset.\n",
    "\n",
    "Before we can make changes to any given column, let's look at all values in any given column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['salary']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This shows us:\n",
    "- How to grab data via column name (our renamed column of course)\n",
    "- An example of a Pandas `Series`\n",
    "- DataFrame Index Values (based on our data).\n",
    "\n",
    "All of the above we'll continue to look at in future videos. For now, we need to get *just* the list of values from the column we're getting data from. We'll do that with:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(df['salary'].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So how would we convert all this data in pure Python? Perhaps something like:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "values = list(df['salary'].values)\n",
    "new_values = []\n",
    "for val in values:\n",
    "    new_val = float(val.replace(\"$\", \"\").replace(\",\", \"_\"))\n",
    "    # you can also use new_val = utils.dollar_str_to_float(val)\n",
    "    new_values.append(new_val)\n",
    "\n",
    "print(new_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's bear something in mind here: the position (or index) of each value should correspond to its counterpart in our table values (i.e. `new_values[312]` should line up with `values[312]`). Let's test that here: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_index = random.randint(0, len(values) - 1)  # randint is inclusive on both ends\n",
    "new_value_via_index = new_values[random_index]\n",
    "new_value_in_dollars = utils.float_to_dollars(new_value_via_index)\n",
    "\n",
    "assert new_value_in_dollars == values[random_index]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, let's add these values as a new column in our DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['salary_raw_py'] = new_values\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can add new columns to a Pandas DataFrame using a familiar method (much like adding a new key to a Python dictionary `dict()`). In this case, the length of the values we added matches the length of all the rows in our DataFrame. We know this because the data *came from the dataframe* in the first place.\n",
    "\n",
    "Let's try to add arbitrary data. 
" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "import datetime\n", 264 | "\n", 265 | "this_year = datetime.datetime.now().year # notice this \n", 266 | "df['year'] = this_year" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "df.head()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "So we now see two properties of a DataFrame that are pretty cool. You can add a new column with 1 value or with matching number of row values.\n", 283 | "\n", 284 | "How about data that was 1/2 the number of rows?" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "rows_length = df.shape[0]\n", 294 | "# column_length = df.shape [1]\n", 295 | "half_rows = int(rows_length * 0.5)\n", 296 | "try:\n", 297 | " df['is_new'] = [True for x in range(0, half_rows)]\n", 298 | "except Exception as e:\n", 299 | " print(e)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Now we see that you can:\n", 307 | "- Add a value for all rows from 1 value\n", 308 | "- Add a value fro all rows from a corresponding index value in another list\n", 309 | "\n", 310 | "Everything we did above technically works but it adds a lot of uncessary steps that we can skip thanks to Pandas awesomeness." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "def dollar_str_to_float(val):\n", 320 | " # in the future, this will be stored in\n", 321 | " # utils.py in the courses/ directory\n", 322 | " return float(val.replace(\"$\", \"\").replace(\",\", \"_\"))\n", 323 | "\n", 324 | "df['salary_as_float'] = df['salary'].apply(dollar_str_to_float)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "Let's break this down:\n", 332 | "- `df['salary_via_apply']` is declaring our new column\n", 333 | "- `df['salary']` is a reference to the values in a pre-existing column on this dataframe\n", 334 | "- `.apply()` will run a function on *all* values in the referenced column. \n", 335 | "- `dollar_str_to_float` is a function that we pass the values to in order to get the correct result.\n", 336 | "- The original `df['salary']` remains unchanged." 
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also use a lambda to simplify this further:\n",
    "\n",
    "```python\n",
    "df['salary_via_apply_lambda'] = df['salary'].apply(lambda x: float(x.replace('$', '').replace(',', '')))\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export to samples dir\n",
    "# df.to_csv(\"samples/2.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "deepnote": {
   "is_reactive": false
  },
  "deepnote_execution_queue": [],
  "deepnote_notebook_id": "e609e7b1-ff5b-43c7-8bff-b115fd3b7749",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
/course/3 - Basic Analysis in Pandas DataFrames.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "cell_id": "00000-15d3f3d1-7d9d-48b0-9022-0136024f5fa7",
    "deepnote_cell_type": "code",
    "tags": []
   },
   "source": [
    "# 3 - Basic Analysis in Pandas DataFrames\n",
    "\n",
    "At this point, we've only been working with auto-generated data. Analyzing auto-generated data is a lot like running on a treadmill; no matter how hard you try, you'll always be stuck in the same place(s).\n",
    "\n",
    "I use auto-generated data to show you some of the fundamentals of Pandas. In the next one, we'll go into real data from NBA.com. In this one, we'll cover how to do some basic analysis on your data by using a few built-in methods that Pandas offers.\n",
    "\n",
    "Let's start by importing our sample data from `2 - Cleaning Data with Python & Pandas`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "import numpy as np\n",
    "\n",
    "# utils.py was created by us\n",
    "import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read sample data\n",
    "df = pd.read_csv(\"samples/2.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> Are you missing the sample data? Be sure to [launch this code on Deepnote](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Analyze Data\n",
    "Let's take a basic look at how we can analyze this data."
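,
    "\n",
    "Before anything custom, `df.describe()` is worth a look -- one call summarizes every numeric column (a quick sketch; the exact numbers depend on your random data):\n",
    "\n",
    "```python\n",
    "df.describe()  # count, mean, std, min, quartiles, max\n",
    "```"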
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = df.copy()[['name', 'salary_as_float']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The above command does 2 things: it copies the dataframe `df` and selects only some of the columns (in this case `name` and `salary_as_float`). Creating a copy means we won't accidentally modify a previous dataframe. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using `.shape` will give us the size of our table in the layout (`row_length`, `column_length`). This `.shape` call matches closely with `numpy`. Something we'll have to revisit another time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_rows = df2.shape[0]\n",
    "n_columns = df2.shape[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's do some statistics:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "avg_salary = df2['salary_as_float'].mean()\n",
    "most_common_salary = df2['salary_as_float'].mode() # returns a series\n",
    "top_salary = df2['salary_as_float'].max()\n",
    "bottom_salary = df2['salary_as_float'].min()\n",
    "\n",
    "print(\"Average Salary\\t\\t\\t\", utils.float_to_dollars(avg_salary))\n",
    "\n",
    "print(\"Top Salary\\t\\t\\t\", utils.float_to_dollars(top_salary))\n",
    "\n",
    "print(\"Bottom Salary\\t\\t\\t\", utils.float_to_dollars(bottom_salary))\n",
    "\n",
    "print(\"Top 3 Most Common Salaries\\t\", \", \".join(most_common_salary.apply(utils.float_to_dollars).values[:3]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = df2.copy()\n",
    "df3['salary_normalized'] = (df3['salary_as_float'] - df3['salary_as_float'].min()) / (df3['salary_as_float'].max() - df3['salary_as_float'].min())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Normalizing data is incredibly common. It takes a set of data (in this case `df3['salary_as_float']`) and converts all numbers to be within the range of `0` and `1`. Data normalization is a common pre-processing practice when performing machine learning. We're going to use this normalized data to split players into groups based on percentage values. 
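\n",
    "\n",
    "A quick sanity check on the result (a small sketch; it assumes at least two distinct salaries exist):\n",
    "\n",
    "```python\n",
    "assert df3['salary_normalized'].min() == 0.0\n",
    "assert df3['salary_normalized'].max() == 1.0\n",
    "```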
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def group_salary(val):\n", 162 | " # in the future, this will be stored in\n", 163 | " # utils.py in the courses/ directory\n", 164 | " if val > .95:\n", 165 | " return 'top'\n", 166 | " elif val < .95 and val > .50:\n", 167 | " return 'mid'\n", 168 | " return 'low'\n", 169 | "\n", 170 | "df3['salary_group'] = df3['salary_normalized'].apply(group_salary)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "df3['salary_group'].value_counts()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "df3.groupby('salary_group')['salary_group'].value_counts().plot(kind='bar', title='People in Group')" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "As you can see from the chart, the distribution of data falls into 3 categories based on arbitrary splitting done in the `group_salary` method above. " 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "sal_group = df3[['salary_as_float','salary_group']].groupby('salary_group').agg([np.sum])\n", 205 | "sal_group.plot(kind = \"bar\", legend = True, title='Average Salary per Group (Normalized)', color='green')" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "This chart shows the sum total salary of each group. With Random data, this is not that interesting because there's nothing to be learned from it. With our NBA dataset, it's this chart may look vastly different." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "deepnote": { 225 | "is_reactive": false 226 | }, 227 | "deepnote_execution_queue": [], 228 | "deepnote_notebook_id": "e609e7b1-ff5b-43c7-8bff-b115fd3b7749", 229 | "kernelspec": { 230 | "display_name": "Python 3 (ipykernel)", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.9.7" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 2 249 | } 250 | -------------------------------------------------------------------------------- /course/4 - Cleaning Real Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ee709b50", 6 | "metadata": {}, 7 | "source": [ 8 | "# 4 - Cleaning Real Data\n", 9 | "\n", 10 | "Now it's time for the real stuff. Let's use a load in a real dataset and discover our next steps together.\n", 11 | "\n", 12 | "In *Appendix A - Scrape & Build NBA Salary Dataset*, we create a NBA Player salary dataset by web scraping [hoopshype.com](hoopshype.com). We won't cover web scraping here but you can run that notebook if you want to learn more." 
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ae03fdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pathlib\n",
    "import pandas as pd\n",
    "\n",
    "# import local utils.py\n",
    "import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b445d4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "PERFORM_SCRAPE = True\n",
    "BASE_DIR = pathlib.Path().resolve()\n",
    "DATASET_PATH = BASE_DIR / 'datasets'\n",
    "INPUT_PATH = DATASET_PATH / 'nba-historical-salaries.csv'\n",
    "print(f'Dataset *{INPUT_PATH.name}* exists:', INPUT_PATH.exists())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb544e0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(INPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df2d24f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cb16ea1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b947270",
   "metadata": {},
   "source": [
    "The above commands tell us a lot about this data already:\n",
    "- Financial data\n",
    "- Columns with dollar strings need to be cleaned (`$`)\n",
    "- Rename columns for consistency\n",
    "- There are 14,549 records, each with 5 data points.\n",
    "- `adj_salary` is given data. Does this mean adjusted in today's dollars? Is this accurate?\n",
    "\n",
    "After this assessment, let's get to work"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75685eef",
   "metadata": {},
   "source": [
    "### Column consistency\n",
    "\n",
    "_How you do anything, is how you do everything._\n",
    "\n",
    "Let's start with the mundane task of committing to a consistent naming convention for our columns across our entire project here. \n",
    "\n",
    "Before we do, let's see the columns: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2efef30b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1506e8b2",
   "metadata": {},
   "source": [
    "If you're a seasoned programmer, you will notice the issue. If you're new to programming, you might miss it. If you look at each column name, you will see a subtle shift in how each column's casing is done.\n",
    "\n",
    "Casing types? Yes, seriously. Here are a few options:\n",
    "\n",
    "- `PascalCase` -> `ThisIsPascalCase`\n",
    "- `camelCase` -> `thisIsCamelCase`\n",
    "- `snake_case` -> `this_is_snake_case`\n",
    "- `kebab-case` -> `this-is-kebab-case` (aka `slugified-string`, `spinal-case`)\n",
    "\n",
    "\n",
    "Since I use Python and create a lot of web applications, I tend to use `snake_case` or `kebab-case`. If you're a SQL database person, you'd probably use `PascalCase`. If you're from JavaScript, you'd probably use a lot of `camelCase`.\n",
    "\n",
    "Whatever format you use, just be consistent. 
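If you'd rather not add a dependency for the conversion, a rough stdlib-only sketch of the same idea (the course itself reaches for `python-slugify` next):\n",
    "\n",
    "```python\n",
    "import re\n",
    "\n",
    "def rough_snake_case(val):\n",
    "    # collapse every run of non-alphanumerics into a single underscore\n",
    "    return re.sub(r'[^0-9a-zA-Z]+', '_', val.strip()).strip('_').lower()\n",
    "```\n",
    "\n",
    "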
Let's rename our columns using `snake_case`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e10f1d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install python-slugify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03e0ea33",
   "metadata": {},
   "outputs": [],
   "source": [
    "from slugify import slugify\n",
    "\n",
    "def to_snake_case(val):\n",
    "    # in the future, this will be stored in\n",
    "    # utils.py in the courses/ directory\n",
    "    kebab_case = slugify(val)\n",
    "    return kebab_case.replace('-', '_')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "913dfbae",
   "metadata": {},
   "source": [
    "I like using the `python-slugify` package to consistently and reliably convert any string into a url-ready slug (aka `kebab-casing`). Once we have a `slug`/`kebab-case`, we can just switch out the dashes (`-`) for underscores (`_`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90a5cbc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "old_columns = df.columns\n",
    "new_columns = [to_snake_case(x) for x in old_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad1366b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_column_mapping = dict(zip(old_columns, new_columns))\n",
    "new_column_mapping"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f12d515",
   "metadata": {},
   "source": [
    "> `zip` is a cool built-in Python feature that combines two lists of the same length. Once you use `dict` around them, it will turn the left side list into keys and the right side list into values associated by their indices. I remember `zip` like a zipper on your pants, backpacks, luggage, etc; each side has \"teeth\" that correspond to the other side. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb4e2e18",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.rename(columns=new_column_mapping, inplace=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "999c2d41",
   "metadata": {},
   "source": [
    "## Cleaning Rows\n",
    "\n",
    "Now that we've renamed our columns, let's clean up our rows. In `utils.py` we have the function `dollar_str_to_float`, which converts dollar strings into floats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fad499a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_row(row_series):\n",
    "    row_series['salary'] = utils.dollar_str_to_float(row_series['salary'])\n",
    "    row_series['adj_salary'] = utils.dollar_str_to_float(row_series['adj_salary'])\n",
    "    return row_series\n",
    "\n",
    "df_cleaned = df.copy().apply(clean_row, axis=1)\n",
    "df_cleaned.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "936bf0ca",
   "metadata": {},
   "source": [
    "I hope that your alarm bells are going off. We never covered `df.apply`; we only covered `df['my_col'].apply`. What gives?\n",
    "\n",
    "When you run `.apply` on an entire DataFrame with `axis=1`, you can modify each row as you see fit instead of just an entire column. 
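The `axis` argument is what flips this behavior -- a tiny sketch of the difference:\n",
    "\n",
    "```python\n",
    "# axis=1 hands the function one row at a time (a Series keyed by column name)\n",
    "df.head(2).apply(lambda row: row['salary'], axis=1)\n",
    "\n",
    "# the default axis=0 hands it one whole column at a time instead\n",
    "df.head(2).apply(lambda col: col.name)\n",
    "```\n",
    "\n",
    "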
Another way to write this would be:\n",
    "\n",
    "```python\n",
    "df_cleaned = df.copy()\n",
    "df_cleaned['salary'] = df_cleaned['salary'].apply(utils.dollar_str_to_float)\n",
    "df_cleaned['adj_salary'] = df_cleaned['adj_salary'].apply(utils.dollar_str_to_float)\n",
    "```\n",
    "\n",
    "And that would be perfectly acceptable. But there's a difference worth seeing between hard-coding each column and iterating over them. And it's this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8ca56e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_row_2(row_series):\n",
    "    dollar_cols = ['salary', 'adj_salary']\n",
    "    for col in dollar_cols:\n",
    "        row_series[col] = utils.dollar_str_to_float(row_series[col])\n",
    "    return row_series\n",
    "\n",
    "df_cleaned_2 = df.copy().apply(clean_row_2, axis=1)\n",
    "df_cleaned_2.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7c7d6a87",
   "metadata": {},
   "source": [
    "`clean_row_2` gives us a way to reduce complexity by iterating over the columns we want to adjust. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15d48802",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cleaned_2['adj_salary'].dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d326000",
   "metadata": {},
   "outputs": [],
   "source": [
    "players_per_year = df_cleaned_2['year_start'].value_counts(sort=False)\n",
    "players_per_year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fdeaa9f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "players_per_year.plot(title='Number of Players Per Year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "388e551e",
   "metadata": {},
   "outputs": [],
   "source": [
    "adj_salary_df = df_cleaned_2.copy()[['year_start', 'adj_salary']]\n",
    "adj_salaries_cumlative = adj_salary_df.groupby(\"year_start\").sum()\n",
    "\n",
    "adj_salaries_cumlative.plot(title='Adjusted Cumulative Salaries Over Time')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbe57cb2",
   "metadata": {},
   "source": [
    "Look at these two charts! The second appears to be out-pacing the first.\n",
    "\n",
    "- Upward trend in both the number of players and total salaries\n",
    "- What happened in 2019?\n",
    "- 2020 seems to be trending towards a massive year for player payments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36dc0131",
   "metadata": {},
   "source": [
    "The above dataset leaves me with a lot of questions:\n",
    "\n",
    "- Are these adjusted salary numbers correct? (They are from hoopshype.com)\n",
    "- Are the per-player salaries going up or just the top 5% of players?\n",
    "- How does a player's salary correlate to wins / losses / other stats?\n",
    "- How does a team (full of players) and their salaries correlate to wins / losses / other stats?\n",
    "- Do the audience metrics support these numbers? 
(In person, online, etc.) In other words, is there really this much economic value being generated?\n",
    "\n",
    "Answers to these questions will inevitably lead to more questions, which hopefully means more and better data analysis.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f826a51",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e824c752",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export to samples dir\n",
    "\n",
    "# df_cleaned_2.to_csv('samples/4-player-salaries-cleaned.csv', index=False)\n",
    "\n",
    "# players_per_year.rename(columns={\"year_start\": \"players\"}, inplace=True)\n",
    "# players_per_year.to_csv('samples/4-player-salaries-per-year.csv', index_label='year', index=True)\n",
    "\n",
    "# adj_salaries_cumlative['adj_salary_$'] = adj_salaries_cumlative['adj_salary'].apply(utils.float_to_dollars)\n",
    "# adj_salaries_cumlative.rename(columns={\"year_start\": \"year\"}, inplace=True)\n",
    "# adj_salaries_cumlative.to_csv(\"samples/4-adj-salaries-cumlative-per-year.csv\", index_label=\"year\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1819325b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abb71da5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------
/course/5 - Merge Datasets.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7a03add6",
   "metadata": {},
   "source": [
    "# 5 - Merge Datasets\n",
    "\n",
    "Merging datasets is a very common practice to enrich or validate the values we have. It's easy to do, and it's best learned on a practical problem.\n",
    "\n",
    "\n",
    "In *4 - Cleaning Real Data* we used data from [hoopshype.com](https://hoopshype.com) that included Actual Salaries and Adjusted Salaries. 
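If `merge` is new to you, here's its basic shape on toy data (a minimal sketch; the names are made up for illustration):\n",
    "\n",
    "```python\n",
    "import pandas as pd\n",
    "\n",
    "left = pd.DataFrame({'year': [2000, 2001], 'salary': [100, 110]})\n",
    "right = pd.DataFrame({'year': [2000, 2001], 'multiplier': [1.50, 1.45]})\n",
    "left.merge(right, on='year')  # one enriched row per matching 'year'\n",
    "```\n",
    "\n",
    "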
In this one, we're going to create our own Adjusted Salaries using the dataset from *Appendix B - Inflation Rate Dataset*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "787c61a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "import pathlib\n",
    "import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "520356b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE_DIR = pathlib.Path().resolve()\n",
    "COURSES_DIR = BASE_DIR / 'course'\n",
    "APPENDIX_DIR = BASE_DIR / 'appendix'\n",
    "DATASET_PATH = BASE_DIR / 'datasets'\n",
    "SAMPLES_DIR = BASE_DIR / 'samples'\n",
    "INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'\n",
    "INFLATION_DATA_PATH = DATASET_PATH / 'inflation-rate.csv'\n",
    "print(f'Dataset *{INPUT_PATH.name}* exists:', INPUT_PATH.exists())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1af83091",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import Dataset from 4 - Cleaning Real Data\n",
    "df = pd.read_csv(INPUT_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbcd06a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4104781",
   "metadata": {},
   "source": [
    "Going forward, we are going to be doing a lot of analysis in 2020 dollars (2020 has the most up-to-date data as of October 2021).\n",
    "\n",
    "We're going to assume a few things about this scraped data:\n",
    "- Player names are correct (`player` column)\n",
    "- Salary (`salary` column) listed is their actual salary\n",
    "- Start Year is accurate (`year_start` column)\n",
    "\n",
    "Given these assumptions, we're going to create our own Adjusted Salary column to illustrate how to merge data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2caa64ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "inflation_df = pd.read_csv(INFLATION_DATA_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f08375f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "inflation_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31f3176c",
   "metadata": {},
   "source": [
    "*Appendix B - Inflation Rate Dataset* shows exactly where and how the dataset for `inflation_df` is created. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11a310a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "inflation_df.set_index('date', inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d423b23",
   "metadata": {},
   "source": [
    "Typically, the DataFrame index is auto-incrementing integers (0, 1, 2, 3, 4, ...), 
but it can be a time series index (i.e. based on dates).\n",
    "\n",
    "Setting our index to a date-like string (i.e. `YYYY-MM-DD`) will result in time series data.\n",
    "\n",
    "The nice thing about this is that we can slice this data in a cool way:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b14d26d",
   "metadata": {},
   "outputs": [],
   "source": [
    "year_start = 2000\n",
    "year_end = 2005\n",
    "inflation_df[f\"{year_start}\": f\"{year_end}\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "628cdfaa",
   "metadata": {},
   "source": [
    "Now we see a subset of our dataset. You can treat this as a new dataframe if you need to, or use it when enriching our data. We're not going to use this type of slicing in this guide, but it is nice to see it in action."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "009773cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "year_start = 2000\n",
    "year_end = 2001\n",
    "inflation_df[f\"{year_start}\": f\"{year_end}\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91567b12",
   "metadata": {},
   "source": [
    "This slice should show us something interesting: for the `year_start` and `year_end` we chose, each year comes with two related values, the `inflation_rate_percent` and the `multiplier`.\n",
    "\n",
    "Now we *can* use an apply here to enrich our data:\n",
    "```python\n",
    "def merge_data_via_lookup(row):\n",
    "    year_start = row['year_start']\n",
    "    year_end = row['year_end']\n",
    "    new_data = inflation_df[f\"{year_start}\": f\"{year_end}\"]\n",
    "    row['multiplier'] = new_data['multiplier'].values[0]\n",
    "    return row\n",
    "    \n",
    "df.apply(merge_data_via_lookup, axis=1)\n",
    "```\n",
    "\n",
    "Technically speaking, this would work, but it's not efficient and it can lead to confusion. Let's use the built-in `merge` function instead.\n",
    "\n",
    "Since `year_start` from `df` and the index (i.e. the `date` column) on `inflation_df` are correlated, let's try a merge:"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c30a745",
   "metadata": {},
   "source": [
    "First, let's move the date column out of the index in `inflation_df`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac3a4dc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "inflation_df.reset_index(inplace=True, drop=False)\n",
    "inflation_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fea042bf",
   "metadata": {},
   "source": [
    "In this case, `reset_index` will preserve the original index (`date`) as a new column because of `drop=False`. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "648f5664",
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    df.merge(inflation_df, left_on=\"year_start\", right_on=\"date\")\n",
    "except Exception as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00908037",
   "metadata": {},
   "source": [
    "This merge failed because the data types do not match up: `year_start` is an integer and `date` is an object. 
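You can confirm that kind of mismatch yourself before merging:\n",
    "\n",
    "```python\n",
    "df['year_start'].dtype       # int64\n",
    "inflation_df['date'].dtype   # object -- still plain strings at this point\n",
    "```\n",
    "\n",
    "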
234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "f7f90a18", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# df['date'] = pd.to_datetime(df['year_start'])\n", 244 | "df['date'] = df['year_start'].apply(lambda x: datetime.datetime.strptime(f\"{x}-12-31\", \"%Y-%m-%d\"))\n", 245 | "df.head()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "id": "523cd4f1", 251 | "metadata": {}, 252 | "source": [ 253 | "Above I used `f\"{x}-12-31\"` to match how the `inflation_df` represents the date for the year (as opposed to the start of the year, `f\"{x}-01-01\"`)." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "f5c47b51", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "inflation_df['date'] = inflation_df['date'].apply(lambda x: datetime.datetime.strptime(f\"{x}\", \"%Y-%m-%d\"))\n", 264 | "inflation_df.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "19da6a26", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "print(inflation_df['date'].dtype, df['date'].dtype)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "628999f9", 280 | "metadata": {}, 281 | "source": [ 282 | "Now that `inflation_df['date']` and `df['date']` have the same data type, we can use `merge`:" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "3db43fb1", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "merged_df = df.merge(inflation_df, left_on=\"date\", right_on='date')\n", 293 | "merged_df.head()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "id": "dcf0c5d5", 299 | "metadata": {}, 300 | "source": [ 301 | "A merge is a fast way to enrich our data based on corresponding values in two dataframes.\n",
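"\n", "If you want the merge to be stricter about what it joins, `merge` takes a few useful options -- a sketch (assuming each salary row should match exactly one inflation row):\n", "\n", "```python\n", "# hypothetical stricter version; 'many_to_one' raises an error\n", "# if a date appears more than once in inflation_df\n", "checked_df = df.merge(inflation_df, on='date', how='inner', validate='many_to_one')\n", "```\n", "\n",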
"The reason we do this is simple:" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "04463e5b", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "merged_df['adj_salary_audit'] = merged_df['salary'] * merged_df['multiplier']\n", 312 | "merged_df.head()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "b33d52fb", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "merged_df['audit_delta'] = merged_df['adj_salary_audit'] - merged_df['adj_salary']" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "faf29124", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "total_adjusted = merged_df['adj_salary'].sum()\n", 333 | "total_adjusted_usd = utils.float_to_dollars(total_adjusted)\n", 334 | "total_adjusted_audit = merged_df['adj_salary_audit'].sum()\n", 335 | "total_adjusted_audit_usd = utils.float_to_dollars(total_adjusted_audit)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "id": "9ab1fc65", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "audit_delta_sum = utils.float_to_dollars(merged_df['audit_delta'].sum())" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "id": "ecb8514a", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "difference_perc = ((total_adjusted_audit - total_adjusted) / total_adjusted_audit) * 100\n", 356 | "print(f\"Difference between our internal audit and their numbers is {difference_perc:.4f}% which totals to {audit_delta_sum}\")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "id": "65077dc6", 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "difference_perc = ((total_adjusted_audit - total_adjusted) / total_adjusted_audit) * 100" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "id": "06bd25ee", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "print(\"Total Adjusted Salary (usd)\", total_adjusted_usd)\n", 377 | "print(\"Total Adjusted Salary Audit (usd)\", total_adjusted_audit_usd)\n", 378 | "\n", 379 | "print(\"Delta Total\", audit_delta_sum)\n", 380 | "print(f\"Delta Percent Difference {difference_perc:.4f}%\")" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "id": "c5cb52b3", 386 | "metadata": {}, 387 | "source": [ 388 | "This shows us that our adjusted salary number is about $410 million higher, but that's under a 1% difference. \n", 389 | "\n", 390 | "Since this data is good enough for future pandas lessons, we're not going to dig any deeper into improving the adjusted salaries. But there are a few questions that come to mind about how we could:\n", 391 | "\n", 392 | "- With this data, we used `year_start` and not `year_end` for our inflation rate multiplier. Perhaps `year_end` would yield closer results (see the sketch below).\n", 393 | "- The source datasets might *both* be incorrect; how would we check this?\n", 394 | "- Does a difference of over `$410 million` skew future results, given the total sum is over `$68 billion`?\n",
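"\n", "Here's a quick sketch for testing the first question, reusing the frames from above (it assumes `year_end` also maps to a December 31st date in `inflation_df`; seasons ending after 2020 will simply drop out of the inner merge):\n", "\n", "```python\n", "# hypothetical re-audit keyed on year_end instead of year_start\n", "df['date_end'] = pd.to_datetime(df['year_end'].astype(str) + '-12-31')\n", "end_df = df.merge(inflation_df, left_on='date_end', right_on='date')\n", "end_df['adj_salary_audit'] = end_df['salary'] * end_df['multiplier']\n", "print((end_df['adj_salary_audit'] - end_df['adj_salary']).sum())\n", "```"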
395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "id": "47ed0b72", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# Export to samples dir\n", 405 | "\n", 406 | "# merged_df.to_csv('samples/5-player-adj-salaries-audit.csv', index=False)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "id": "baf1e7ee", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [] 416 | } 417 | ], 418 | "metadata": { 419 | "kernelspec": { 420 | "display_name": "Python 3 (ipykernel)", 421 | "language": "python", 422 | "name": "python3" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 3 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython3", 434 | "version": "3.9.7" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 5 439 | } 440 | -------------------------------------------------------------------------------- /course/appendix/Appendix A - Scrape & Build NBA Salary Dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "08553e5b", 6 | "metadata": {}, 7 | "source": [ 8 | "# Appendix A - Scrape & Build NBA Salary Dataset\n", 9 | "The goal of this notebook is to prepare our course with a pre-existing dataset. The data cleaning is done in the course itself; this notebook is meant only to create the dataset. " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "id": "d279ba03", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# %pip install requests requests-html matplotlib pandas" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "5624667e", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import datetime\n", 30 | "from decimal import Decimal\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import requests\n", 33 | "from requests_html import HTML\n", 34 | "import pandas as pd\n", 35 | "import pathlib\n", 36 | "import time" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "id": "2a3e439e", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "PERFORM_SCRAPE = True\n", 47 | "BASE_DIR = pathlib.Path().resolve().parent.parent\n", 48 | "COURSES_DIR = BASE_DIR / 'course'\n", 49 | "DATASET_PATH = COURSES_DIR / 'datasets'\n", 50 | "OUTPUT_PATH = DATASET_PATH / 'nba-historical-salaries.csv'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "id": "c973adc3", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "True" 63 | ] 64 | }, 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "COURSES_DIR.exists()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "id": "1554825f", 77 | "metadata": {}, 78 | "source": [ 79 | "For this dataset, we use `hoopshype.com`'s record of player salaries.\n",
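"\n", "Side note: for simple HTML tables like this one, pandas can often parse a page directly -- a sketch (assuming the salary table is the first `<table>` on the page and that `lxml` is installed):\n", "\n", "```python\n", "# hypothetical shortcut for a single season's page\n", "tables = pd.read_html('https://hoopshype.com/salaries/players/2000-2001/')\n", "season_df = tables[0]\n", "```\n", "\n", "We'll parse the HTML manually below so every step of the cleanup is visible."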
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "id": "8cc7326e", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "base_url = 'https://hoopshype.com/salaries/players/'" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "id": "7d701732", 95 | "metadata": {}, 96 | "source": [ 97 | "`hoopshype.com`'s salary data starts in the 1990-1991 season." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "id": "be7e83d9", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "year_start = 1990" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "id": "d48e19f3", 113 | "metadata": {}, 114 | "source": [ 115 | "End scraping at last year's season (this year might not be available)." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 5, 121 | "id": "25aac766", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "2020" 128 | ] 129 | }, 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "year_end = datetime.datetime.now().year - 1\n", 137 | "year_end" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "id": "64accc20", 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "1990 https://hoopshype.com/salaries/players/1990-1991/\n", 151 | "1991 https://hoopshype.com/salaries/players/1991-1992/\n", 152 | "1992 https://hoopshype.com/salaries/players/1992-1993/\n", 153 | "1993 https://hoopshype.com/salaries/players/1993-1994/\n", 154 | "1994 https://hoopshype.com/salaries/players/1994-1995/\n", 155 | "1995 https://hoopshype.com/salaries/players/1995-1996/\n", 156 | "1996 https://hoopshype.com/salaries/players/1996-1997/\n", 157 | "1997 https://hoopshype.com/salaries/players/1997-1998/\n", 158 | "1998 https://hoopshype.com/salaries/players/1998-1999/\n", 159 | "1999 https://hoopshype.com/salaries/players/1999-2000/\n", 160 | "2000 https://hoopshype.com/salaries/players/2000-2001/\n", 161 | "2001 https://hoopshype.com/salaries/players/2001-2002/\n", 162 | "2002 https://hoopshype.com/salaries/players/2002-2003/\n", 163 | "2003 https://hoopshype.com/salaries/players/2003-2004/\n", 164 | "2004 https://hoopshype.com/salaries/players/2004-2005/\n", 165 | "2005 https://hoopshype.com/salaries/players/2005-2006/\n", 166 | "2006 https://hoopshype.com/salaries/players/2006-2007/\n", 167 | "2007 https://hoopshype.com/salaries/players/2007-2008/\n", 168 | "2008 https://hoopshype.com/salaries/players/2008-2009/\n", 169 | "2009 https://hoopshype.com/salaries/players/2009-2010/\n", 170 | "2010 https://hoopshype.com/salaries/players/2010-2011/\n", 171 | "2011 https://hoopshype.com/salaries/players/2011-2012/\n", 172 | "2012 https://hoopshype.com/salaries/players/2012-2013/\n", 173 | "2013 https://hoopshype.com/salaries/players/2013-2014/\n", 174 | "2014 https://hoopshype.com/salaries/players/2014-2015/\n", 175 | "2015 https://hoopshype.com/salaries/players/2015-2016/\n", 176 | "2016 https://hoopshype.com/salaries/players/2016-2017/\n", 177 | "2017 https://hoopshype.com/salaries/players/2017-2018/\n", 178 | "2018 https://hoopshype.com/salaries/players/2018-2019/\n", 179 | "2019 https://hoopshype.com/salaries/players/2019-2020/\n", 180 | "2020 https://hoopshype.com/salaries/players/2020-2021/\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "dfs = []\n", 186 | "if 
PERFORM_SCRAPE:\n", 187 | "    for year in range(year_start, year_end+1):\n", 188 | "        # NBA season spans 2 different calendar years\n", 189 | "        year_range = f\"{year}-{year+1}\"\n", 190 | "        # the lookup salary url is based on the above range\n", 191 | "        url = f\"{base_url}{year_range}/\"\n", 192 | "        # print year and url for manual review\n", 193 | "        print(year, url)\n", 194 | "        # perform lookup\n", 195 | "        r = requests.get(url)\n", 196 | "        # convert the response html text into a parsable object\n", 197 | "        html = HTML(html=r.text)\n", 198 | "        # find the data table containing the salaries\n", 199 | "        table = html.find('table', first=True)\n", 200 | "        # table_data list holder\n", 201 | "        table_data = []\n", 202 | "        # iterate the table element and append all column values in each row\n", 203 | "        for el in table.element.getchildren():\n", 204 | "            for tr in el.getchildren():\n", 205 | "                row_data = []\n", 206 | "                for col in tr.getchildren():\n", 207 | "                    row_data.append(col.text_content().strip())\n", 208 | "                table_data.append(row_data)\n", 209 | "        # create the initial dataframe\n", 210 | "        init_df = pd.DataFrame(table_data)\n", 211 | "        # use the first row as the header\n", 212 | "        new_header = init_df.iloc[0]\n", 213 | "        # use everything after the first row as our dataset\n", 214 | "        init_df = init_df[1:]\n", 215 | "        # update header\n", 216 | "        init_df.columns = new_header\n", 217 | "\n", 218 | "        # attempt to rename columns, if they're available;\n", 219 | "        # otherwise, move to the next year lookup\n", 220 | "        try:\n", 221 | "            renamed_cols = {\n", 222 | "                \"Player\": 'player',\n", 223 | "                f\"{new_header[2]}\": \"salary\",\n", 224 | "                f\"{new_header[3]}\": \"adj_salary\"\n", 225 | "            }\n", 226 | "            init_df = init_df.rename(columns=renamed_cols)\n", 227 | "        except:\n", 228 | "            continue\n", 229 | "\n", 230 | "        # keep just the columns we need\n", 231 | "        try:\n", 232 | "            df = init_df.copy()[['player', 'salary', 'adj_salary']]\n", 233 | "        except:\n", 234 | "            continue\n", 235 | "        # update dataset with year values\n", 236 | "        df['year-start'] = year\n", 237 | "        df['year-end'] = year + 1\n", 238 | "        # append this dataset to our group of datasets\n", 239 | "        dfs.append(df)\n", 240 | "        # slow down lookups to ensure our scraping doesn't overload\n", 241 | "        # hoopshype.com\n", 242 | "        time.sleep(1.2)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "f86901e5", 248 | "metadata": {}, 249 | "source": [ 250 | "Convert our list of dataframes (i.e. season salaries) into our full dataset via pandas `concat`.\n",
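"\n", "`pd.concat` keeps each piece's original row labels, which is why the next cell resets the index right after concatenating. A sketch of an equivalent shortcut:\n", "\n", "```python\n", "# hypothetical: concatenate and renumber the rows in one call\n", "dataset_df = pd.concat(dfs, ignore_index=True)\n", "```"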
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 12, 256 | "id": "e5417732", 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "(14549, 5)" 263 | ] 264 | }, 265 | "execution_count": 12, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "dataset_df = pd.concat(dfs) #[['player', 'year-start', 'year-end', 'salary', 'adj_salary']]\n", 272 | "dataset_df.reset_index(drop=True, inplace=True)\n", 273 | "dataset_df.shape" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "id": "455f44e8", 279 | "metadata": {}, 280 | "source": [ 281 | "Store the file in our course datasets directory." 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 14, 287 | "id": "262b15f4", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "dataset_df.to_csv(OUTPUT_PATH, index=False)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "id": "5b5697c6", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [] 301 | } 302 | ], 303 | "metadata": { 304 | "kernelspec": { 305 | "display_name": "Python 3 (ipykernel)", 306 | "language": "python", 307 | "name": "python3" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 3 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython3", 319 | "version": "3.9.7" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 5 324 | } 325 | -------------------------------------------------------------------------------- /course/appendix/Appendix C - The NBA API Experiments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0214eb00", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# %pip install nba_api" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "eef82f84", 16 | "metadata": {}, 17 | "source": [ 18 | "## NBA API Experiments\n", 19 | "While I was experimenting with the `nba_api` library, I created this notebook. 
It's a bit of a mixed bag of tests, but it's worth including as a set of things you might try yourself.\n", 20 | "\n", 21 | "There is no real order in this notebook -- feel free to submit a [pull request](https://github.com/codingforentrepreneurs/Try-Pandas/pulls) if you find ways to improve it.\n", 22 | "\n", 23 | "\n", 24 | "#### _NBA Shot Chart explainer_\n", 25 | "An interesting project (perhaps a future video?):\n", 26 | "- https://www.youtube.com/watch?v=a3u-3gEYvxM\n", 27 | "- https://github.com/hkair/nba-shotcharts" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "e40478dd", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import pandas as pd\n", 38 | "import pathlib" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "39fe9277", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from nba_api.stats.endpoints import commonplayerinfo, playercareerstats\n", 49 | "from nba_api.stats.static import players" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "9da954c0", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "BASE_DIR = pathlib.Path().resolve().parent\n", 60 | "SAMPLES_DIR = BASE_DIR / \"samples\"\n", 61 | "salary_df = pd.read_csv(SAMPLES_DIR / '5-player-adj-salaries-audit.csv')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "7b5f24e0", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "def split_player_name(row):\n", 72 | "    name = row['player']\n", 73 | "    name_list = name.split()\n", 74 | "    row['first_name'] = None\n", 75 | "    row['last_name'] = None\n", 76 | "    if len(name_list) == 2:\n", 77 | "        row['first_name'] = name_list[0]\n", 78 | "        row['last_name'] = name_list[1]\n", 79 | "    if len(name_list) > 2:\n", 80 | "        row['first_name'] = \" \".join(name_list[:-1])\n", 81 | "        row['last_name'] = name_list[-1]\n", 82 | "    return row\n", 83 | "\n", 84 | "salary_df = salary_df.apply(split_player_name, axis=1)\n", 85 | "# salary_df.sort_values(\"player\", inplace=True)\n", 86 | "salary_df.head()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "ad2c98dc", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "career = playercareerstats.PlayerCareerStats(player_id=3)\n", 97 | "career" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "3eb2925f", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "def season_id_to_season_end(val):\n", 108 | "    # 'SEASON_ID' looks like '1998-99' or '2005-06'; the season always\n", 109 | "    # ends the year after it starts (this also handles the century\n", 110 | "    # rollover in '1999-00')\n", 111 | "    season_start, _season_end = val.split(\"-\")\n", 112 | "    return f\"{int(season_start) + 1}\"\n", 113 | "\n", 114 | "# Charles Barkley player_id = 3\n", 115 | "def get_season_data(player_id=1):\n", 116 | "    career = playercareerstats.PlayerCareerStats(player_id=player_id)\n", 117 | "    player_df = career.get_data_frames()[0]\n", 118 | "    player_df['season_start'] = player_df['SEASON_ID'].apply(lambda x: x.split(\"-\")[0])\n", 119 | "    player_df['season_end'] = player_df['SEASON_ID'].apply(season_id_to_season_end)\n", 120 | "    # player_df_final = player_df.copy()[['PLAYER_ID', 'TEAM_ABBREVIATION', 'season_start', 'season_end']]\n", 121 | "    return player_df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "360d220f", 128 | "metadata": {}, 129 | "outputs": [],
"source": [ 132 | "player_ = get_season_data(player_id=3)\n", 133 | "player_.head()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "4ed52b9f", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "nba_players = players.get_players()\n", 144 | "\n", 145 | "nba_players_df = pd.DataFrame(nba_players)\n", 146 | "nba_players_df.head()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "6fedf048", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "95d97685", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# this is not efficient, perhaps another method to enrich all the data.\n", 165 | "\n", 166 | "# all_player_teams = []\n", 167 | "# for p_id in nba_players_df['id'].values[:10]:\n", 168 | "# all_player_teams.append(get_season_data(player_id=p_id))\n", 169 | "\n", 170 | "# all_player_teams_df = pd.concat(all_player_teams)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "541a07cb", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "42f075f6", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "salary_df.merge(nba_players_df, left_on=\"player\", right_on=\"full_name\")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "e404bfc0", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "from nba_api.stats.static import teams\n", 199 | "\n", 200 | "nba_teams = teams.get_teams()\n", 201 | "# Select the dictionary for the Celtics, which contains their team ID\n", 202 | "celtics = [team for team in nba_teams if team['abbreviation'] == 'BOS'][0]\n", 203 | "celtics_id = celtics['id']" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "4aed8df9", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "from nba_api.stats.endpoints import leaguegamefinder\n", 214 | "\n", 215 | "# Query for games where the Celtics were playing\n", 216 | "gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=celtics_id)\n", 217 | "# The first DataFrame of those returned is what we want.\n", 218 | "games = gamefinder.get_data_frames()[0]\n", 219 | "games.head()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "7770f675", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "250ec48a", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "player_info = commonplayerinfo.CommonPlayerInfo(player_id=51)\n", 238 | "player_info_df = pd.concat(player_info.get_data_frames()[:1])\n", 239 | "player_info_df.head()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "b8476bc2", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "from nba_api.stats.endpoints import playercareerstats\n", 250 | "# Anthony Davis\n", 251 | "career = playercareerstats.PlayerCareerStats(player_id=51)\n", 252 | "player_df = career.get_data_frames()[0]\n", 253 | "player_df['season_start'] = player_df['SEASON_ID'].apply(lambda x: x.split(\"-\")[0])\n", 254 | "\n", 255 | "def season_end(val):\n", 256 | " season_start, season_end = 
val.split(\"-\")\n", 257 | "    # the season always ends the calendar year after it starts,\n", 258 | "    # which also handles the century rollover in '1999-00'\n", 259 | "    season_end = f\"{int(season_start) + 1}\"\n", 260 | "    return season_end\n", 261 | "\n", 262 | "player_df['season_end'] = player_df['SEASON_ID'].apply(season_end)\n", 263 | "player_df_final = player_df.copy()[['PLAYER_ID', 'TEAM_ABBREVIATION', 'season_start', 'season_end']]\n", 264 | "player_df_final.tail(n=100)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "13b8a211", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "player_df.columns" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "c14ebaac", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "player_games = player_df.copy()[['SEASON_ID', 'GAME_ID', \"TEAM_ID\", 'TEAM_NAME', \"WL\", \"MIN\", \"PTS\"]]\n", 285 | "player_games.head()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "c7e5b1e6", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# note: this rebinds the name `players` from the imported module to a plain list\n", 296 | "players = players.get_players()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "id": "07a4cd9c", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "players_df = pd.DataFrame(players)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "id": "04dc2b72", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "players_df.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "57cd0d44", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "\n", 327 | "\n", 328 | "player_info.common_player_info.get_data_frame()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "8fcf9929", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "# def add_common_info(row):\n", 339 | "#     player_id = row['id']\n", 340 | "#     player_info = commonplayerinfo.CommonPlayerInfo(player_id=player_id)\n", 341 | "#     print(player_info)\n", 342 | "#     return row\n", 343 | "\n", 344 | "# players_df.apply(add_common_info, axis=1)\n", 345 | "# players_df.head()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "id": "79aff263", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "from nba_api.stats.static import teams\n", 356 | "\n", 357 | "\n", 358 | "# note: this rebinds the name `teams` from the imported module to a plain list\n", 359 | "teams = teams.get_teams()" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "d0cdec29", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "teams_df = pd.DataFrame(teams)\n", 370 | "teams_df.head()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "id": "730be027", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | } 381 | ], 382 | "metadata": { 383 | "kernelspec": { 384 | "display_name": "Python 3 (ipykernel)", 385 | "language": "python", 386 | "name": "python3" 387 | }, 388 | "language_info": { 389 | "codemirror_mode": { 390 | "name": "ipython", 391 | "version": 3 392 | }, 393 | "file_extension": ".py", 394 | "mimetype": "text/x-python", 395 | "name": "python", 396 | "nbconvert_exporter": "python", 397 | "pygments_lexer": "ipython3", 398 | "version": "3.9.7" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 5 403 | } 404 | 
-------------------------------------------------------------------------------- /course/appendix/Appendix D - NBA Player Detail.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "62e34320", 6 | "metadata": {}, 7 | "source": [ 8 | "# Appendix D - NBA Player Detail\n", 9 | "\n", 10 | "Now we're going to lean heavily on the [nba_api](https://github.com/swar/nba_api) library, as it's proven to be a great API for extracting data from https://stats.nba.com." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "c4ac00b3", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# %pip install nba_api" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "f79bb047", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import utils" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "a6e59df0", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pandas as pd\n", 41 | "import pathlib" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "e4f6c115", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from nba_api.stats.endpoints import commonplayerinfo, playercareerstats\n", 52 | "from nba_api.stats.static import players" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "a462b5bd", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "BASE_DIR = pathlib.Path().resolve()\n", 63 | "SAMPLES_DIR = BASE_DIR / 'samples'\n", 64 | "SAMPLE_PLAYERS_DIR = SAMPLES_DIR / 'players'\n", 65 | "SAMPLE_PLAYERS_DIR.mkdir(exist_ok=True, parents=True)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "c54afdfd", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "salary_df = pd.read_csv('samples/5-player-adj-salaries-audit.csv')\n", 76 | "salary_df.columns = [f\"{x}\".upper() for x in salary_df.columns]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "id": "bf3a739f", 82 | "metadata": {}, 83 | "source": [ 84 | "I converted our columns to match the column names in the `nba_api` library." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "33ec3ad5", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "random_row = salary_df.sample(n=1)\n", 95 | "name = random_row['PLAYER'].item() # .item() will get the value\n", 96 | "random_row" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "47bb9f3d", 102 | "metadata": {}, 103 | "source": [ 104 | "Using `.sample(n=1)` will return a random sample of our data. This sample can be as large as you'd like, but I chose to return `1` row (`n=1`) to ultimately get `1` player's name for use in the API.\n",
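"\n", "If you want the same random player every run, `sample` also accepts a seed -- a small sketch:\n", "\n", "```python\n", "# hypothetical: a fixed seed makes this notebook reproducible\n", "random_row = salary_df.sample(n=1, random_state=42)\n", "```"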
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "90c49f0d", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# name = 'Michael Jordan'\n", 115 | "name" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "d88d5d14", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "player_results = players.find_players_by_full_name(name) \n", 126 | "player_df = pd.DataFrame(player_results)\n", 127 | "player_df.head()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "2afa3bb1", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "player_df" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "c532bdbc", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "player_id = player_df.iloc[0]['id'].item()\n", 148 | "# player_id = player_df.loc[0]['id'].item()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "07f6e6ac", 154 | "metadata": {}, 155 | "source": [ 156 | "`.iloc` is much like using an index value in a standard python list. `iloc[23]` will yield the 24th element in the DataFrame. `iloc[0]` will return the 1st element at the 0th position. `loc[0]` *may* yield the same result if the index column is not shuffled. " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "887546a2", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "career = playercareerstats.PlayerCareerStats(player_id=player_id)\n", 167 | "career_df = career.get_data_frames()[0]\n", 168 | "career_df.head()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "49841b9c", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "def season_id_to_season_end(val):\n", 179 | "    # 'SEASON_ID' looks like '1998-99'; the season always ends the year\n", 180 | "    # after it starts (handles the '1999-00' century rollover)\n", 181 | "    season_start, _season_end = val.split(\"-\")\n", 182 | "    return f\"{int(season_start) + 1}\"" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "1342c025", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "career_df['YEAR_START'] = career_df['SEASON_ID'].apply(lambda x: x.split(\"-\")[0])\n", 193 | "career_df['YEAR_END'] = career_df['SEASON_ID'].apply(season_id_to_season_end)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "8c0a6c4b", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "career_df.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "e3e8a5a9", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "year_start = career_df['YEAR_START'].min()\n", 214 | "year_end = career_df['YEAR_END'].max()\n", 215 | "year_start, year_end" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "10884871", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "key_stats = ['GP',\n", 226 | "    'GS',\n", 227 | "    'MIN',\n", 228 | "    'FGM',\n", 229 | "    'FGA',\n", 230 | "    'FG_PCT',\n", 231 | "    'FG3M',\n", 232 | "    'FG3A',\n", 233 | "    'FG3_PCT',\n", 234 | "    'FTM',\n", 235 | "    'FTA',\n", 236 | "    'FT_PCT',\n", 237 | "    'OREB',\n", 238 | "    'DREB',\n", 239 | "    'REB',\n", 240 | "    'AST',\n", 241 | "    'STL',\n", 242 | "    'BLK',\n", 243 | "    'TOV',\n", 
246 | " 'PF',\n", 247 | " 'PTS']\n", 248 | " \n", 249 | "key_stats_labels = [\n", 250 | " \"Games Played\",\n", 251 | " \"Games Started\",\n", 252 | " \"Minutes\",\n", 253 | " \"Field Goals Made\",\n", 254 | " \"Field Goals Attempted\",\n", 255 | " \"Field Goal Percentage\",\n", 256 | " \"3-point Field Goals Made\",\n", 257 | " \"3-point Field Goals Made Attempted\",\n", 258 | " \"3-point Field Goal Percentage\",\n", 259 | " 'Free Throws Made',\n", 260 | " 'Free Throws Attempted',\n", 261 | " 'Free Throw Percentage',\n", 262 | " 'Offensive Rebouns',\n", 263 | " 'Defensive Rebouns',\n", 264 | " 'Rebouns',\n", 265 | " 'Assists',\n", 266 | " 'Steals',\n", 267 | " 'Blocks',\n", 268 | " 'Turnovers',\n", 269 | " 'Personal Fouls',\n", 270 | " 'Points'\n", 271 | "]\n", 272 | " \n", 273 | "key_stats_mapping = dict(zip(key_stats, key_stats_labels))\n", 274 | "stats_without_perc = [x for x in key_stats if not \"pct\" in x.lower()]\n", 275 | "stats_with_perc = [x for x in key_stats if \"pct\" in x.lower()]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "cbd694ec", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "career_df" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "bb1e311c", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "career_stats_columns = stats_without_perc + [\"SEASON_ID\"]\n", 296 | "career_stats = career_df.copy()[career_stats_columns]\n", 297 | "career_stats.head()" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "69e70a1b", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "# career_stats = career_stats.astype(int)\n", 308 | "career_stats.set_index(\"SEASON_ID\", inplace=True, drop=True)\n", 309 | "totals_df = career_stats.rename(columns=key_stats_mapping)\n", 310 | "totals = totals_df.sum()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "0c8cf184", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "totals_df" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "849b1797", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "career_stats_perc = career_df.copy()[stats_with_perc]\n", 331 | "averages_df = career_stats_perc.rename(columns=key_stats_mapping)\n", 332 | "averages = averages_df.mean()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "id": "550cfc04", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "totals" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "id": "3111002f", 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "averages" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "id": "16bf9466", 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "id": "794a21fb", 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "earnings = salary_df.copy()[salary_df['PLAYER'] == name][['SALARY', 'ADJ_SALARY', 'YEAR_START']]\n", 371 | "earnings.set_index(\"YEAR_START\", inplace=True, drop=True)\n", 372 | "earnings.tail(n=15)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "id": "4ec4bdfd", 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "total_earnings 
= earnings.sum().apply(utils.float_to_dollars)\n", 383 | "total_earnings" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "8ad188d4", 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "name" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "d74c9382", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "total_games = career_stats.GP.sum()\n", 404 | "total_games" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "9f255d71", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "earnings_per_game = earnings['SALARY'].sum() / total_games\n", 415 | "utils.float_to_dollars(earnings_per_game)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "757bb983", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "plot = earnings.plot()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "id": "1d8583e6", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "id": "7fea57c7", 439 | "metadata": {}, 440 | "source": [ 441 | "### Export Player Data" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "218616fd", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "PLAYER_DIR = SAMPLE_PLAYERS_DIR / f\"{name}\"\n", 452 | "PLAYER_DIR.mkdir(exist_ok=True, parents=True)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "id": "3b2d2aea", 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "# Earnings plot\n", 463 | "figure = plot.figure\n", 464 | "figure.savefig(PLAYER_DIR / \"salary_over_time.png\")" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "id": "42d5c8db", 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "stats_output = PLAYER_DIR / \"stats.xlsx\"" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "id": "91cde06a", 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "totals_df" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "id": "784a63ba", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "with pd.ExcelWriter(stats_output) as writer:\n", 495 | "    totals_df.to_excel(writer, sheet_name='Career Stats')\n", 496 | "    totals.to_excel(writer, sheet_name='Career Totals')\n", 497 | "    averages.to_excel(writer, sheet_name='Career Averages')\n", 498 | "    earnings.to_excel(writer, sheet_name='Yearly Earnings')\n", 499 | "    total_earnings.to_excel(writer, sheet_name='Total Earnings')" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "id": "09a59565", 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "name" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "id": "aea7e10a", 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "import requests" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "id": "c358192a", 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "import requests\n", 530 | "\n", 531 | "url = 
\"https://stats.nba.com/stats/playerindex?College=&Country=&DraftPick=&DraftRound=&DraftYear=&Height=&Historical=1&LeagueID=00&Season=2021-22&SeasonType=Regular%20Season&TeamID=0\"\n", 532 | "url = \"https://stats.nba.com/stats/playercareerstats?LeagueID=&PerMode=Totals&PlayerID=2544\"\n", 533 | "\n", 534 | "r = requests.get(url, stream=True)\n", 535 | "r.json()" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "id": "2c89cf45", 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [] 545 | } 546 | ], 547 | "metadata": { 548 | "kernelspec": { 549 | "display_name": "Python 3 (ipykernel)", 550 | "language": "python", 551 | "name": "python3" 552 | }, 553 | "language_info": { 554 | "codemirror_mode": { 555 | "name": "ipython", 556 | "version": 3 557 | }, 558 | "file_extension": ".py", 559 | "mimetype": "text/x-python", 560 | "name": "python", 561 | "nbconvert_exporter": "python", 562 | "pygments_lexer": "ipython3", 563 | "version": "3.9.7" 564 | } 565 | }, 566 | "nbformat": 4, 567 | "nbformat_minor": 5 568 | } 569 | -------------------------------------------------------------------------------- /course/appendix/appendix-b-united-states-inflation-rate.csv: -------------------------------------------------------------------------------- 1 | Macrotrends Data Download 2 | 3 | U.S. Inflation Rate 1960-2021 4 | 5 | 6 | DISCLAIMER AND TERMS OF USE: HISTORICAL DATA IS PROVIDED "AS IS" AND SOLELY 7 | FOR INFORMATIONAL PURPOSES - NOT FOR TRADING PURPOSES OR ADVICE. 8 | NEITHER MACROTRENDS LLC NOR ANY OF OUR INFORMATION PROVIDERS WILL BE LIABLE 9 | FOR ANY DAMAGES RELATING TO YOUR USE OF THE DATA PROVIDED. 10 | 11 | 12 | ATTRIBUTION: Proper attribution requires clear indication of the data source as "www.macrotrends.net". 13 | A "dofollow" backlink to the originating page is also required if the data is displayed on a web page. 
14 | 15 | 16 | , , , 17 | date, Inflation Rate (%), Annual Change, 18 | 1960-12-31,1.458 19 | 1961-12-31,1.0707,-0.39 20 | 1962-12-31,1.1988,0.13 21 | 1963-12-31,1.2397,0.04 22 | 1964-12-31,1.2789,0.04 23 | 1965-12-31,1.5852,0.31 24 | 1966-12-31,3.0151,1.43 25 | 1967-12-31,2.7728,-0.24 26 | 1968-12-31,4.2718,1.5 27 | 1969-12-31,5.4624,1.19 28 | 1970-12-31,5.8383,0.38 29 | 1971-12-31,4.2928,-1.55 30 | 1972-12-31,3.2723,-1.02 31 | 1973-12-31,6.1778,2.91 32 | 1974-12-31,11.0548,4.88 33 | 1975-12-31,9.1431,-1.91 34 | 1976-12-31,5.7448,-3.4 35 | 1977-12-31,6.5017,0.76 36 | 1978-12-31,7.631,1.13 37 | 1979-12-31,11.2545,3.62 38 | 1980-12-31,13.5492,2.29 39 | 1981-12-31,10.3347,-3.21 40 | 1982-12-31,6.1314,-4.2 41 | 1983-12-31,3.2124,-2.92 42 | 1984-12-31,4.3005,1.09 43 | 1985-12-31,3.5456,-0.75 44 | 1986-12-31,1.898,-1.65 45 | 1987-12-31,3.6646,1.77 46 | 1988-12-31,4.0777,0.41 47 | 1989-12-31,4.827,0.75 48 | 1990-12-31,5.398,0.57 49 | 1991-12-31,4.235,-1.16 50 | 1992-12-31,3.0288,-1.21 51 | 1993-12-31,2.9517,-0.08 52 | 1994-12-31,2.6074,-0.34 53 | 1995-12-31,2.8054,0.2 54 | 1996-12-31,2.9312,0.13 55 | 1997-12-31,2.3377,-0.59 56 | 1998-12-31,1.5523,-0.79 57 | 1999-12-31,2.188,0.64 58 | 2000-12-31,3.3769,1.19 59 | 2001-12-31,2.8262,-0.55 60 | 2002-12-31,1.586,-1.24 61 | 2003-12-31,2.2701,0.68 62 | 2004-12-31,2.6772,0.41 63 | 2005-12-31,3.3927,0.72 64 | 2006-12-31,3.2259,-0.17 65 | 2007-12-31,2.8527,-0.37 66 | 2008-12-31,3.8391,0.99 67 | 2009-12-31,-0.3555,-4.19 68 | 2010-12-31,1.64,2 69 | 2011-12-31,3.1568,1.52 70 | 2012-12-31,2.0693,-1.09 71 | 2013-12-31,1.4648,-0.6 72 | 2014-12-31,1.6222,0.16 73 | 2015-12-31,0.1186,-1.5 74 | 2016-12-31,1.2616,1.14 75 | 2017-12-31,2.1301,0.87 76 | 2018-12-31,2.4426,0.31 77 | 2019-12-31,1.8122,-0.63 78 | 2020-12-31,1.2336,-0.58 79 | -------------------------------------------------------------------------------- /course/datasets/inflation-rate.csv: -------------------------------------------------------------------------------- 1 | date,inflation_rate_percent,multiplier 2 | 1960-12-31,1.01458,8.751003401396263 3 | 1961-12-31,1.010707,8.65829899406679 4 | 1962-12-31,1.011988,8.555732868439929 5 | 1963-12-31,1.012397,8.450966239963108 6 | 1964-12-31,1.012789,8.344251606171778 7 | 1965-12-31,1.015852,8.214042602831697 8 | 1966-12-31,1.030151,7.97362969393001 9 | 1967-12-31,1.027728,7.758501951810214 10 | 1968-12-31,1.042718,7.440652172313332 11 | 1969-12-31,1.054624,7.055265357429124 12 | 1970-12-31,1.058383,6.666079630369272 13 | 1971-12-31,1.042928,6.391696867251884 14 | 1972-12-31,1.032723,6.189168699885534 15 | 1973-12-31,1.061778,5.829060971206343 16 | 1974-12-31,1.110548,5.2488149735142855 17 | 1975-12-31,1.091431,4.809112965926647 18 | 1976-12-31,1.057448,4.5478481834819755 19 | 1977-12-31,1.065017,4.270211821484517 20 | 1978-12-31,1.07631,3.967455307006825 21 | 1979-12-31,1.112545,3.5661077143008386 22 | 1980-12-31,1.135492,3.1405837419381557 23 | 1981-12-31,1.103347,2.8464152636823705 24 | 1982-12-31,1.061314,2.681972784380845 25 | 1983-12-31,1.032124,2.59849861487655 26 | 1984-12-31,1.043005,2.491357773813692 27 | 1985-12-31,1.035456,2.4060489038778017 28 | 1986-12-31,1.01898,2.3612327070970984 29 | 1987-12-31,1.036646,2.2777618464713094 30 | 1988-12-31,1.040777,2.1885205442388806 31 | 1989-12-31,1.04827,2.08774508880239 32 | 1990-12-31,1.05398,1.9808204034254824 33 | 1991-12-31,1.04235,1.900340963616331 34 | 1992-12-31,1.030288,1.8444754899759395 35 | 1993-12-31,1.029517,1.7915930382654586 36 | 1994-12-31,1.026074,1.7460661105002742 37 | 
1995-12-31,1.028054,1.6984186730466238 38 | 1996-12-31,1.029312,1.6500523388891057 39 | 1997-12-31,1.023377,1.6123601946194852 40 | 1998-12-31,1.015523,1.5877141085130386 41 | 1999-12-31,1.02188,1.553718742428699 42 | 2000-12-31,1.033769,1.502965113510561 43 | 2001-12-31,1.028262,1.4616557973654194 44 | 2002-12-31,1.01586,1.4388358606160483 45 | 2003-12-31,1.022701,1.4068978720232481 46 | 2004-12-31,1.026772,1.37021448970487 47 | 2005-12-31,1.033927,1.3252526432764307 48 | 2006-12-31,1.032259,1.2838373346964576 49 | 2007-12-31,1.028527,1.2482291030730917 50 | 2008-12-31,1.038391,1.20208004795216 51 | 2009-12-31,0.996445,1.206368688640276 52 | 2010-12-31,1.0164,1.186903471704325 53 | 2011-12-31,1.031568,1.1505819022151957 54 | 2012-12-31,1.020693,1.127255602042138 55 | 2013-12-31,1.014648,1.1109819386054456 56 | 2014-12-31,1.016222,1.0932472812096623 57 | 2015-12-31,1.001186,1.091952225869781 58 | 2016-12-31,1.012616,1.0783477901492577 59 | 2017-12-31,1.021301,1.0558569806053826 60 | 2018-12-31,1.024426,1.030681552992 61 | 2019-12-31,1.018122,1.012336 62 | 2020-12-31,1.012336,1.0 63 | -------------------------------------------------------------------------------- /course/samples/4-adj-salaries-cumlative-per-year.csv: -------------------------------------------------------------------------------- 1 | year,adj_salary,adj_salary_$ 2 | 1990,582539682.0,"$582,539,682.00" 3 | 1991,700377392.0,"$700,377,392.00" 4 | 1992,788801985.0,"$788,801,985.00" 5 | 1993,893209929.0,"$893,209,929.00" 6 | 1994,1042891125.0,"$1,042,891,125.00" 7 | 1995,1224345459.0,"$1,224,345,459.00" 8 | 1996,1321922576.0,"$1,321,922,576.00" 9 | 1997,1525445588.0,"$1,525,445,588.00" 10 | 1998,1691690974.0,"$1,691,690,974.00" 11 | 1999,2038382935.0,"$2,038,382,935.00" 12 | 2000,2248609728.0,"$2,248,609,728.00" 13 | 2001,2257291832.0,"$2,257,291,832.00" 14 | 2002,2392844658.0,"$2,392,844,658.00" 15 | 2003,2347269535.0,"$2,347,269,535.00" 16 | 2004,2410317680.0,"$2,410,317,680.00" 17 | 2005,2514965375.0,"$2,514,965,375.00" 18 | 2006,2464069199.0,"$2,464,069,199.00" 19 | 2007,2553671099.0,"$2,553,671,099.00" 20 | 2008,2543013922.0,"$2,543,013,922.00" 21 | 2009,2525568731.0,"$2,525,568,731.00" 22 | 2010,2396025966.0,"$2,396,025,966.00" 23 | 2011,2302629043.0,"$2,302,629,043.00" 24 | 2012,2371940974.0,"$2,371,940,974.00" 25 | 2013,2338442773.0,"$2,338,442,773.00" 26 | 2014,2382200105.0,"$2,382,200,105.00" 27 | 2015,2543053337.0,"$2,543,053,337.00" 28 | 2016,3172547897.0,"$3,172,547,897.00" 29 | 2017,3532646079.0,"$3,532,646,079.00" 30 | 2018,3687776555.0,"$3,687,776,555.00" 31 | 2019,3557605726.0,"$3,557,605,726.00" 32 | 2020,3905172153.0,"$3,905,172,153.00" 33 | -------------------------------------------------------------------------------- /course/samples/4-player-salaries-per-year.csv: -------------------------------------------------------------------------------- 1 | year,players 2 | 1990,352 3 | 1991,383 4 | 1992,401 5 | 1993,385 6 | 1994,418 7 | 1995,451 8 | 1996,415 9 | 1997,444 10 | 1998,426 11 | 1999,516 12 | 2000,455 13 | 2001,450 14 | 2002,451 15 | 2003,454 16 | 2004,470 17 | 2005,479 18 | 2006,495 19 | 2007,469 20 | 2008,460 21 | 2009,456 22 | 2010,459 23 | 2011,463 24 | 2012,494 25 | 2013,492 26 | 2014,513 27 | 2015,500 28 | 2016,545 29 | 2017,586 30 | 2018,576 31 | 2019,513 32 | 2020,578 33 | -------------------------------------------------------------------------------- /course/samples/players/Caleb Swanigan/salary_over_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Caleb Swanigan/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Caleb Swanigan/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Caleb Swanigan/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Charlie Ward/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Charlie Ward/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Charlie Ward/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Charlie Ward/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Chris Mihm/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Mihm/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Chris Mihm/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Mihm/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Chris Wilcox/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Wilcox/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Chris Wilcox/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Wilcox/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Darvin Ham/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Darvin Ham/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Darvin Ham/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Darvin Ham/stats.xlsx 
-------------------------------------------------------------------------------- /course/samples/players/Devin Harris/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Devin Harris/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Devin Harris/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Devin Harris/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Eric Gordon/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Eric Gordon/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Eric Gordon/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Eric Gordon/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Gary Trent Jr/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gary Trent Jr/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Gary Trent Jr/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gary Trent Jr/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Gerald Wilkins/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gerald Wilkins/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Gerald Wilkins/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gerald Wilkins/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Jahidi White/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jahidi White/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Jahidi White/stats.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jahidi White/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Jason Smith/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jason Smith/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Jason Smith/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jason Smith/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Jermaine O'Neal/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jermaine O'Neal/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Jermaine O'Neal/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jermaine O'Neal/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Ken Norman/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ken Norman/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Ken Norman/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ken Norman/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Kevin Garnett/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Kevin Garnett/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Kevin Garnett/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Kevin Garnett/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Ledell Eackles/salary_over_time.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ledell Eackles/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Ledell Eackles/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ledell Eackles/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Luke Harangody/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Luke Harangody/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Luke Harangody/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Luke Harangody/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Michael Beasley/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Beasley/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Michael Beasley/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Beasley/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Michael Jordan/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Jordan/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Michael Jordan/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Jordan/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Shaquille O'Neal/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Shaquille O'Neal/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Shaquille O'Neal/stats.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Shaquille O'Neal/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Steve Scheffler/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Steve Scheffler/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Steve Scheffler/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Steve Scheffler/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Toby Bailey/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Toby Bailey/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Toby Bailey/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Toby Bailey/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Tony Farmer/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tony Farmer/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Tony Farmer/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tony Farmer/stats.xlsx -------------------------------------------------------------------------------- /course/samples/players/Tristan Thompson/salary_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tristan Thompson/salary_over_time.png -------------------------------------------------------------------------------- /course/samples/players/Tristan Thompson/stats.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tristan Thompson/stats.xlsx -------------------------------------------------------------------------------- /course/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Below are various simple utility methods we use in many of the notebooks. 
3 | To use them, just: 4 | 5 | import utils 6 | 7 | utils.float_to_dollars(32.00) 8 | 9 | """ 10 | from slugify import slugify 11 | 12 | 13 | def float_to_dollars(value: float) -> str: 14 | """ 15 | Take in a float (e.g. 32.00) and return a dollar-formatted string (e.g. "$32.00"). 16 | """ 17 | return f"${value:,.2f}" 18 | 19 | 20 | def dollar_str_to_float(value: str) -> float: 21 | return float(value.replace("$", "").replace(",", "_"))  # "_" is a valid digit separator for float() 22 | 23 | 24 | def group_salary(value: float) -> str: 25 | if value > .95: 26 | return 'top' 27 | elif value > .50: 28 | return 'mid' 29 | return 'low' 30 | 31 | 32 | def to_snake_case(val): 33 | # slugify() returns kebab-case; swap the 34 | # hyphens for underscores to get snake_case 35 | kebab_case = slugify(val) 36 | return kebab_case.replace('-', '_') 37 | -------------------------------------------------------------------------------- /nbs_ref/1 - DataFrame.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-404671cc-c8d9-4153-b6c0-fef41a4c8ad4", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 5, 11 | "execution_start": 1634660464202, 12 | "source_hash": "9d6c0093", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import datetime\n", 18 | "number = 10\n", 19 | "data = [{\"number\": x, \"time\": datetime.datetime.now(), \"added_by\": \"Justin\"} for x in range(0, number)]\n", 20 | "data" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "cell_id": "00001-b9f74741-0730-4321-abaf-4f5a0f3a33ac", 28 | "deepnote_cell_type": "code", 29 | "deepnote_to_be_reexecuted": false, 30 | "execution_millis": 0, 31 | "execution_start": 1634660404069, 32 | "source_hash": "9b82ee11", 33 | "tags": [] 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import pandas as pd" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "cell_id": "00002-a3425b6e-ca8e-42b9-8794-babe70bdf7cd", 45 | "deepnote_cell_type": "code", 46 | "deepnote_to_be_reexecuted": false, 47 | "execution_millis": 1, 48 | "execution_start": 1634660494571, 49 | "source_hash": "68b98649", 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "df = pd.DataFrame(data)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "cell_id": "00003-d30403ec-6d18-4950-9406-8a09845ea4a1", 62 | "deepnote_cell_type": "code", 63 | "deepnote_to_be_reexecuted": false, 64 | "execution_millis": 24, 65 | "execution_start": 1634660495073, 66 | "source_hash": "c085b6ba", 67 | "tags": [] 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "df.head()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "cell_id": "00004-bae5cad7-e47a-4649-ae30-da286f06f94d", 79 | "deepnote_cell_type": "code", 80 | "deepnote_to_be_reexecuted": false, 81 | "execution_millis": 20, 82 | "execution_start": 1634660531873, 83 | "source_hash": "c6672ebc", 84 | "tags": [] 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "df.to_csv(\"temp.csv\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "cell_id": "00005-39181e7e-fd4e-4d05-a413-b3dde31d113f", 96 | "deepnote_cell_type": "code", 97 | "tags": [] 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 |
"created_in_deepnote_cell": true, 106 | "deepnote_cell_type": "markdown", 107 | "tags": [] 108 | }, 109 | "source": [ 110 | "\n", 111 | "Created in deepnote.com \n", 112 | "Created in Deepnote" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "deepnote": { 118 | "is_reactive": false 119 | }, 120 | "deepnote_execution_queue": [], 121 | "deepnote_notebook_id": "6d02c57e-af66-4ca3-a9e8-3e4d1a4b40ec" 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /nbs_ref/2 - Import & Export.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-73e17520-ba02-468e-b566-ca0b1d88df64", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 4, 11 | "execution_start": 1634662754163, 12 | "source_hash": "fb25d99a", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import random\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "cell_id": "00001-d427eb7c-182d-487d-92ec-57a6bf0ace8d", 26 | "deepnote_cell_type": "code", 27 | "deepnote_to_be_reexecuted": false, 28 | "execution_millis": 2, 29 | "execution_start": 1634662872515, 30 | "source_hash": "10381e97", 31 | "tags": [] 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "items = []\n", 36 | "number = random.randint(0, 10_000)\n", 37 | "\n", 38 | "for x in range(0, number):\n", 39 | " dollars = random.randint(200_000, 50_000_000)\n", 40 | " data = {\n", 41 | " \"Player Name\": f\"Player-{x}\",\n", 42 | " \"Player Salary\": f\"${dollars:,.2f}\"\n", 43 | " }\n", 44 | " items.append(data)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "cell_id": "00002-7c63a5ab-bbe9-4098-9d10-68ccb0a0c6c4", 52 | "deepnote_cell_type": "code", 53 | "deepnote_to_be_reexecuted": false, 54 | "execution_millis": 9, 55 | "execution_start": 1634663034475, 56 | "source_hash": "77d806e5", 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "items[df.shape[0]-1]\n", 62 | "items[-1]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "cell_id": "00002-b0c3ddbd-1856-4c13-b20a-2e9213a9de33", 70 | "deepnote_cell_type": "code", 71 | "deepnote_to_be_reexecuted": false, 72 | "execution_millis": 10, 73 | "execution_start": 1634663045778, 74 | "source_hash": "ee80ac5f", 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "df = pd.DataFrame(items)\n", 80 | "df.tail(n=5)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "cell_id": "00003-8693a73e-9587-4404-807f-dac4a386fd65", 88 | "deepnote_cell_type": "code", 89 | "deepnote_to_be_reexecuted": false, 90 | "execution_millis": 2, 91 | "execution_start": 1634662954296, 92 | "source_hash": "14f60b8f", 93 | "tags": [] 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "df.shape" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "cell_id": "00003-37cc71ed-0226-47af-a776-9f18083dc3a4", 105 | "deepnote_cell_type": "code", 106 | "deepnote_to_be_reexecuted": false, 107 | "execution_millis": 11, 108 | "execution_start": 1634663125029, 109 | "source_hash": "9a72dbc", 110 | "tags": [] 111 | }, 112 | "outputs": [], 113 | "source": [ 114 
| "df.to_csv(\"example.csv\", index=False)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "cell_id": "00004-01213fba-c9c9-4809-bd53-d712180c865f", 122 | "deepnote_cell_type": "code", 123 | "deepnote_to_be_reexecuted": false, 124 | "execution_millis": 16, 125 | "execution_start": 1634663135503, 126 | "source_hash": "e261385d", 127 | "tags": [] 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "df2 = pd.read_csv(\"example.csv\")\n", 132 | "df2.to_csv(\"example.csv\", index=False)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "cell_id": "00005-5bb5b873-84a6-4c13-8283-80324940e936", 140 | "deepnote_cell_type": "code", 141 | "deepnote_to_be_reexecuted": false, 142 | "execution_millis": 2, 143 | "execution_start": 1634663135575, 144 | "source_hash": "4e0cbe0d", 145 | "tags": [] 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "df2.head()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "cell_id": "00008-f54b4824-92df-447c-8b68-9fe9cb57db7e", 157 | "deepnote_cell_type": "code", 158 | "tags": [] 159 | }, 160 | "outputs": [], 161 | "source": [] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "created_in_deepnote_cell": true, 167 | "deepnote_cell_type": "markdown", 168 | "tags": [] 169 | }, 170 | "source": [ 171 | "\n", 172 | "Created in deepnote.com \n", 173 | "Created in Deepnote" 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "deepnote": { 179 | "is_reactive": false 180 | }, 181 | "deepnote_execution_queue": [], 182 | "deepnote_notebook_id": "7d195f65-2af9-44d5-b3b3-a1ec93a8da0c" 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /nbs_ref/3 - Rename Columns.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-80b53f9e-8a99-42ce-8537-563e35eae81e", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 3, 11 | "execution_start": 1634664323897, 12 | "source_hash": "3296bc83", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import pathlib\n", 18 | "import pandas as pd " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "cell_id": "00001-619fdf6d-e70c-463c-9ef0-cade76400689", 26 | "deepnote_cell_type": "code", 27 | "deepnote_to_be_reexecuted": false, 28 | "execution_millis": 8, 29 | "execution_start": 1634664325242, 30 | "source_hash": "cc9f5d6c", 31 | "tags": [] 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "BASE_DIR = pathlib.Path().resolve().parent\n", 36 | "COURSE_DIR = BASE_DIR / \"course\"\n", 37 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "cell_id": "00002-fb8339de-56d3-4821-9ccb-c43a97be82f2", 45 | "deepnote_cell_type": "code", 46 | "deepnote_to_be_reexecuted": false, 47 | "execution_millis": 10, 48 | "execution_start": 1634664551676, 49 | "source_hash": "b2e8acb5", 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "df = pd.read_csv(SAMPLES_DIR / \"1.csv\")\n", 55 | "df.head()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | 
"cell_id": "00003-eec93174-c12b-41f5-a12e-113ea58e2387", 63 | "deepnote_cell_type": "code", 64 | "deepnote_to_be_reexecuted": false, 65 | "execution_millis": 3, 66 | "execution_start": 1634664554225, 67 | "source_hash": "c5e12177", 68 | "tags": [] 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "columns = df.columns\n", 73 | "auto_changed = [x.lower().replace(\" \", \"_\") for x in list(columns)]\n", 74 | "mapped_columns = dict(zip(columns, auto_changed))\n", 75 | "mapped_columns" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "cell_id": "00004-b288a341-abc8-44bf-8528-77c75d9a4717", 83 | "deepnote_cell_type": "code", 84 | "deepnote_to_be_reexecuted": false, 85 | "execution_millis": 6, 86 | "execution_start": 1634664554594, 87 | "source_hash": "53315aff", 88 | "tags": [] 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "custom_mapped_columns = {'Player Name': 'name', 'Player Salary': 'salary'}" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "cell_id": "00003-1bc88210-86e1-408f-8ca0-4784e53b59f2", 100 | "deepnote_cell_type": "code", 101 | "deepnote_to_be_reexecuted": false, 102 | "execution_millis": 5, 103 | "execution_start": 1634664558540, 104 | "source_hash": "805d5ae3", 105 | "tags": [] 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "new_df = df.rename(columns=custom_mapped_columns)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "cell_id": "00004-17e8390b-3ee1-4304-aff1-d86311f55a43", 117 | "deepnote_cell_type": "code", 118 | "deepnote_to_be_reexecuted": false, 119 | "execution_millis": 18, 120 | "execution_start": 1634664559234, 121 | "source_hash": "e0dec228", 122 | "tags": [] 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "new_df.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "cell_id": "00007-0ca575d4-3aab-46aa-aed3-dfde855c2558", 134 | "deepnote_cell_type": "code", 135 | "deepnote_to_be_reexecuted": false, 136 | "execution_millis": 5, 137 | "execution_start": 1634664565357, 138 | "source_hash": "c085b6ba", 139 | "tags": [] 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "df.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "cell_id": "00008-78290255-8993-4195-93b1-e7c651c18689", 151 | "deepnote_cell_type": "code", 152 | "deepnote_to_be_reexecuted": false, 153 | "execution_millis": 4, 154 | "execution_start": 1634664605275, 155 | "source_hash": "a7cdfe1a", 156 | "tags": [] 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "df = new_df.copy()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "cell_id": "00009-f639a375-19ca-406a-8f95-83b6b275ef8c", 168 | "deepnote_cell_type": "code", 169 | "deepnote_to_be_reexecuted": false, 170 | "execution_millis": 4, 171 | "execution_start": 1634664608527, 172 | "source_hash": "c085b6ba", 173 | "tags": [] 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "df.head()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "cell_id": "00010-31d4f407-4738-4bbe-8ce6-d5d86f6a79ad", 185 | "deepnote_cell_type": "code", 186 | "tags": [] 187 | }, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": { 194 | "created_in_deepnote_cell": true, 195 | 
"deepnote_cell_type": "markdown", 196 | "tags": [] 197 | }, 198 | "source": [ 199 | "\n", 200 | "Created in deepnote.com \n", 201 | "Created in Deepnote" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "deepnote": { 207 | "is_reactive": false 208 | }, 209 | "deepnote_execution_queue": [], 210 | "deepnote_notebook_id": "6728ff69-a810-43ce-9c54-6d8830c0bc28" 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /nbs_ref/4 - Clean Rows.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-349f686e-3dfe-4eec-a63b-ef2ae23c894f", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 19, 11 | "execution_start": 1634664703106, 12 | "source_hash": "3296bc83", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import pathlib\n", 18 | "import pandas as pd " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "cell_id": "00001-05cb6c26-afaa-4d70-a56a-1192c0ce5297", 26 | "deepnote_cell_type": "code", 27 | "deepnote_to_be_reexecuted": false, 28 | "execution_millis": 377930, 29 | "execution_start": 1634664703216, 30 | "source_hash": "cc9f5d6c", 31 | "tags": [] 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "BASE_DIR = pathlib.Path().resolve().parent\n", 36 | "COURSE_DIR = BASE_DIR / \"course\"\n", 37 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "cell_id": "00002-97e9fae1-78cb-4530-abdc-108b358f86d0", 45 | "deepnote_cell_type": "code", 46 | "deepnote_to_be_reexecuted": false, 47 | "execution_millis": 54, 48 | "execution_start": 1634664703263, 49 | "source_hash": "ed8fc3d1", 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "df = pd.read_csv(SAMPLES_DIR / \"1.csv\")\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "cell_id": "00004-286f55fa-610e-47e2-a42f-6c2230a095e9", 62 | "deepnote_cell_type": "code", 63 | "deepnote_to_be_reexecuted": false, 64 | "execution_millis": 148804, 65 | "execution_start": 1634664703398, 66 | "source_hash": "53315aff", 67 | "tags": [] 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "custom_mapped_columns = {'Player Name': 'name', 'Player Salary': 'salary'}" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "cell_id": "00005-93b6dcf8-51e7-471b-9f27-ca88fa93f463", 79 | "deepnote_cell_type": "code", 80 | "deepnote_to_be_reexecuted": false, 81 | "execution_millis": 144901, 82 | "execution_start": 1634664703441, 83 | "source_hash": "805d5ae3", 84 | "tags": [] 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "new_df = df.rename(columns=custom_mapped_columns)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "cell_id": "00008-c49d10d4-b635-41e0-9aa9-3432117382e0", 96 | "deepnote_cell_type": "code", 97 | "deepnote_to_be_reexecuted": false, 98 | "execution_millis": 98364, 99 | "execution_start": 1634664703639, 100 | "source_hash": "a7cdfe1a", 101 | "tags": [] 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "df = new_df.copy()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "cell_id": 
"00009-b67e1732-661c-4924-abe5-0fb426055b53", 113 | "deepnote_cell_type": "code", 114 | "deepnote_to_be_reexecuted": false, 115 | "execution_millis": 72, 116 | "execution_start": 1634664810180, 117 | "source_hash": "838c0c28", 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "df.tail()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "cell_id": "00010-ae0b342b-d7f3-46b2-8668-f256c3b685c1", 130 | "deepnote_cell_type": "code", 131 | "deepnote_to_be_reexecuted": false, 132 | "execution_millis": 0, 133 | "execution_start": 1634664863379, 134 | "source_hash": "89b06b8e", 135 | "tags": [] 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "my_salary_list = list(df['salary'].values)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "cell_id": "00008-31c5accf-8a7d-4bdc-b97a-c8a210f16c2d", 147 | "deepnote_cell_type": "code", 148 | "deepnote_to_be_reexecuted": false, 149 | "execution_millis": 11, 150 | "execution_start": 1634664863791, 151 | "source_hash": "263ee8a5", 152 | "tags": [] 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "# float(my_salary_list[0])" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "cell_id": "00009-905b7c10-33fb-4e9d-91a2-b7599107dcaa", 164 | "deepnote_cell_type": "code", 165 | "deepnote_to_be_reexecuted": false, 166 | "execution_millis": 36, 167 | "execution_start": 1634664931801, 168 | "source_hash": "eccf32d0", 169 | "tags": [] 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "current_str = '$23,564,932.00'.replace(\"$\", \"\").replace(\",\", \"_\")\n", 174 | "current_dollars = float(current_str)\n", 175 | "\n", 176 | "current_dollars * 32" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "cell_id": "00010-0abc1f6e-6f69-4400-9d45-8f403624e6ea", 184 | "deepnote_cell_type": "code", 185 | "deepnote_to_be_reexecuted": false, 186 | "execution_millis": 152, 187 | "execution_start": 1634665011941, 188 | "source_hash": "6dc07231", 189 | "tags": [] 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "my_salary_list_cleaned = [float(x.replace(\"$\", \"\").replace(\",\", \"_\")) for x in my_salary_list]\n", 194 | "my_salary_list_cleaned[0]" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "cell_id": "00011-788b27c2-271b-4c6c-99d3-41d2064ac321", 202 | "deepnote_cell_type": "code", 203 | "deepnote_to_be_reexecuted": false, 204 | "execution_millis": 19, 205 | "execution_start": 1634665032102, 206 | "source_hash": "b2382e4e", 207 | "tags": [] 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "df['salary_cleaned'] = my_salary_list_cleaned" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "cell_id": "00012-ede91513-304c-4cc7-ba13-832758f63350", 219 | "deepnote_cell_type": "code", 220 | "deepnote_to_be_reexecuted": false, 221 | "execution_millis": 15, 222 | "execution_start": 1634665037154, 223 | "source_hash": "c085b6ba", 224 | "tags": [] 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "df.head()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "cell_id": "00013-c82a8240-dc20-4bab-9ea6-7483496b83ca", 236 | "deepnote_cell_type": "code", 237 | "deepnote_to_be_reexecuted": false, 238 | "execution_millis": 93, 239 | 
"execution_start": 1634665204365, 240 | "source_hash": "3272c222", 241 | "tags": [] 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "def clean_salary_data(val):\n", 246 | " new_val = float(val.replace(\"$\", \"\").replace(\",\", \"_\"))\n", 247 | " return new_val\n", 248 | "\n", 249 | "df['salary_cleaned_2'] = df['salary'].apply(clean_salary_data)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "cell_id": "00014-3a1907ec-5e2b-4f75-9a0a-ae7a7ed0b8aa", 257 | "deepnote_cell_type": "code", 258 | "deepnote_to_be_reexecuted": false, 259 | "execution_millis": 52, 260 | "execution_start": 1634665205940, 261 | "source_hash": "c085b6ba", 262 | "tags": [] 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "df.head()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "cell_id": "00015-e4db563d-51a1-4641-9b90-ec2c197a076e", 274 | "deepnote_cell_type": "code", 275 | "deepnote_to_be_reexecuted": false, 276 | "execution_millis": 8, 277 | "execution_start": 1634665297961, 278 | "source_hash": "d4fbbd00", 279 | "tags": [] 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "rows_length = df.shape[0] # (row_length, col_length)\n", 284 | "new_rows = int(rows_length / 2.0)\n", 285 | "new_rows" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "cell_id": "00016-4c33d2d5-39a1-4fe4-ab92-40217746486d", 293 | "deepnote_cell_type": "code", 294 | "deepnote_to_be_reexecuted": false, 295 | "execution_millis": 2, 296 | "execution_start": 1634665349160, 297 | "source_hash": "962d0c10", 298 | "tags": [] 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "new_col_data = [True for x in range(0, new_rows)]\n", 303 | "df['new_data'] = new_col_data + new_col_data" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "cell_id": "00017-ab5d82a6-e83a-4e03-8859-4aa28bb192c1", 311 | "deepnote_cell_type": "code", 312 | "deepnote_to_be_reexecuted": false, 313 | "execution_millis": 22, 314 | "execution_start": 1634665353581, 315 | "source_hash": "c085b6ba", 316 | "tags": [] 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "df.head()" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "cell_id": "00018-2025550d-f4c6-4f94-b1e9-0a68076f0d83", 328 | "deepnote_cell_type": "code", 329 | "deepnote_to_be_reexecuted": false, 330 | "execution_millis": 1, 331 | "execution_start": 1634665562850, 332 | "source_hash": "ba524046", 333 | "tags": [] 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "def apply_on_df(row):\n", 338 | " row['random_data'] = 12 \n", 339 | " return row\n", 340 | "\n", 341 | "# df.apply(apply_on_df, axis=1)\n" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "cell_id": "00019-23a40cd4-d1bb-478e-acb8-951f3467964a", 349 | "deepnote_cell_type": "code", 350 | "deepnote_to_be_reexecuted": false, 351 | "execution_millis": 41, 352 | "execution_start": 1634665517660, 353 | "source_hash": "c085b6ba", 354 | "tags": [] 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "df.head()" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "cell_id": "00020-83834316-225d-407e-9663-bfb6fc0a4158", 366 | "deepnote_cell_type": "code", 367 | "deepnote_to_be_reexecuted": false, 368 | "execution_millis": 22, 369 | 
"execution_start": 1634665590996, 370 | "source_hash": "a0af127d", 371 | "tags": [] 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "df['half_salary'] = df['salary_cleaned'] * 0.5" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "cell_id": "00021-a77b18f8-30cb-4db2-afcb-f70d0710d12a", 383 | "deepnote_cell_type": "code", 384 | "deepnote_to_be_reexecuted": false, 385 | "execution_millis": 19, 386 | "execution_start": 1634665638969, 387 | "source_hash": "22926cbe", 388 | "tags": [] 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "df['half_salary_again'] = df['salary_cleaned'].apply(lambda x: x * 0.5)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "cell_id": "00021-e898b7bb-71c3-4cf9-b30e-2e2c0bf7cb5f", 400 | "deepnote_cell_type": "code", 401 | "deepnote_to_be_reexecuted": false, 402 | "execution_millis": 74, 403 | "execution_start": 1634665642531, 404 | "source_hash": "c085b6ba", 405 | "tags": [] 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "df.head()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "cell_id": "00022-09f6345e-c840-43f2-897e-7b8c8c30b2a3", 417 | "deepnote_cell_type": "code", 418 | "tags": [] 419 | }, 420 | "outputs": [], 421 | "source": [] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": { 426 | "created_in_deepnote_cell": true, 427 | "deepnote_cell_type": "markdown", 428 | "tags": [] 429 | }, 430 | "source": [ 431 | "\n", 432 | "Created in deepnote.com \n", 433 | "Created in Deepnote" 434 | ] 435 | } 436 | ], 437 | "metadata": { 438 | "deepnote": { 439 | "is_reactive": false 440 | }, 441 | "deepnote_execution_queue": [], 442 | "deepnote_notebook_id": "2f18222c-6554-4fca-aaf0-222952069583" 443 | }, 444 | "nbformat": 4, 445 | "nbformat_minor": 2 446 | } 447 | -------------------------------------------------------------------------------- /nbs_ref/5 - Basic Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-d1d3b693-8fc2-4852-b977-bf59db96cdf7", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 1, 11 | "execution_start": 1634670769787, 12 | "source_hash": "120b09e9", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import pathlib\n", 18 | "import pandas as pd\n", 19 | "\n", 20 | "BASE_DIR = pathlib.Path().resolve().parent\n", 21 | "COURSE_DIR = BASE_DIR / \"course\"\n", 22 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "cell_id": "00001-4da4045c-083d-4950-a801-f30394e6b103", 30 | "deepnote_cell_type": "code", 31 | "deepnote_to_be_reexecuted": false, 32 | "execution_millis": 64, 33 | "execution_start": 1634670843243, 34 | "source_hash": "811c99cf", 35 | "tags": [] 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "init_df = pd.read_csv(SAMPLES_DIR / '2.csv')\n", 40 | "init_df.head()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "cell_id": "00002-c7f63dad-4bd5-49fb-befc-dd9d01183da4", 48 | "deepnote_cell_type": "code", 49 | "deepnote_to_be_reexecuted": false, 50 | "execution_millis": 34, 51 | "execution_start": 1634670926698, 52 | "source_hash": "2c5513cb", 53 | "tags": [] 54 | 
}, 55 | "outputs": [], 56 | "source": [ 57 | "columns = ['name', 'salary_as_float']\n", 58 | "df = init_df.copy()[columns]\n", 59 | "df.head()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "cell_id": "00003-7151714e-0b8b-46ac-a35e-f057309c0bd2", 67 | "deepnote_cell_type": "code", 68 | "deepnote_to_be_reexecuted": false, 69 | "execution_millis": 2, 70 | "execution_start": 1634670959996, 71 | "source_hash": "889a1265", 72 | "tags": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "n_rows = df.shape[0] # (rows, cols)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "cell_id": "00004-4a2dced2-1c95-42cc-9c4d-be27b5ab38f4", 84 | "deepnote_cell_type": "code", 85 | "deepnote_to_be_reexecuted": false, 86 | "execution_millis": 5, 87 | "execution_start": 1634671018731, 88 | "source_hash": "cd46aaa8", 89 | "tags": [] 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "salaries = list(df['salary_as_float'].values)\n", 94 | "sum_salaries = sum(salaries)\n", 95 | "avg_salaries = sum_salaries / n_rows\n", 96 | "print(avg_salaries)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "cell_id": "00005-ac5763c8-8738-4ebb-823b-252a21900f4d", 104 | "deepnote_cell_type": "code", 105 | "deepnote_to_be_reexecuted": false, 106 | "execution_millis": 8, 107 | "execution_start": 1634671056500, 108 | "source_hash": "1e1d38cd", 109 | "tags": [] 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "avg = df['salary_as_float'].mean()\n", 114 | "avg" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "cell_id": "00006-4f6b6acb-26b3-499f-a66a-12c9123cc087", 122 | "deepnote_cell_type": "code", 123 | "deepnote_to_be_reexecuted": false, 124 | "execution_millis": 7, 125 | "execution_start": 1634671094965, 126 | "source_hash": "3e7b4aaa", 127 | "tags": [] 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "df_sum = df['salary_as_float'].sum() # / n_rows\n", 132 | "df_sum" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "cell_id": "00007-a33f5ea6-757b-44f9-abf0-25c4d3d8e645", 140 | "deepnote_cell_type": "code", 141 | "deepnote_to_be_reexecuted": false, 142 | "execution_millis": 9, 143 | "execution_start": 1634671153935, 144 | "source_hash": "8a870ad4", 145 | "tags": [] 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "df_mode = df['salary_as_float'].mode()\n", 150 | "df_mode" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "cell_id": "00008-b944d33a-73af-458a-9183-e1ec7eca7685", 158 | "deepnote_cell_type": "code", 159 | "deepnote_to_be_reexecuted": false, 160 | "execution_millis": 3, 161 | "execution_start": 1634671186003, 162 | "source_hash": "e2c1864b", 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "top_salary = df['salary_as_float'].max()\n", 168 | "bottom_salary = df['salary_as_float'].min()\n", 169 | "print(top_salary, bottom_salary)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "created_in_deepnote_cell": true, 176 | "deepnote_cell_type": "markdown", 177 | "tags": [] 178 | }, 179 | "source": [ 180 | "\n", 181 | "Created in deepnote.com \n", 182 | "Created in Deepnote" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "deepnote": { 188 | "is_reactive": false 189 | }, 190 | "deepnote_execution_queue": [], 191 | 
"deepnote_notebook_id": "cf920c62-4d9f-44fd-8086-4b610a57be2c" 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 2 195 | } 196 | -------------------------------------------------------------------------------- /nbs_ref/6 - Grouping & Plots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-6ff3fb28-b769-44e1-acda-797876f4de99", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 37, 11 | "execution_start": 1634673068073, 12 | "source_hash": "1477e9f2", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import pathlib\n", 18 | "import pandas as pd\n", 19 | "\n", 20 | "BASE_DIR = pathlib.Path().resolve().parent\n", 21 | "COURSE_DIR = BASE_DIR / \"course\"\n", 22 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n", 23 | "init_df = pd.read_csv(SAMPLES_DIR / '2.csv')\n", 24 | "df = init_df.copy()[['name', 'salary_as_float']]" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "cell_id": "00001-5487d78d-d59f-4be1-a7ee-3f88ddabcede", 32 | "deepnote_cell_type": "code", 33 | "deepnote_to_be_reexecuted": false, 34 | "execution_millis": 11, 35 | "execution_start": 1634673069101, 36 | "source_hash": "c085b6ba", 37 | "tags": [] 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "df.head()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "cell_id": "00002-338a599e-31af-4acf-81c3-d1bbcdfd4e0a", 49 | "deepnote_cell_type": "code", 50 | "deepnote_to_be_reexecuted": false, 51 | "execution_millis": 10, 52 | "execution_start": 1634673168311, 53 | "source_hash": "9c98dc87", 54 | "tags": [] 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "df['salary_norm'] = (df['salary_as_float'] - df['salary_as_float'].min()) / (df['salary_as_float'].max() - df['salary_as_float'].min())" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "cell_id": "00003-e4cc0c7c-bd5e-4626-8b09-7faece38b632", 66 | "deepnote_cell_type": "code", 67 | "deepnote_to_be_reexecuted": false, 68 | "execution_millis": 10, 69 | "execution_start": 1634673185917, 70 | "source_hash": "c085b6ba", 71 | "tags": [] 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "df.head()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "cell_id": "00004-d7c94965-1fa7-458a-b619-9ece1eeb0199", 83 | "deepnote_cell_type": "code", 84 | "deepnote_to_be_reexecuted": false, 85 | "execution_millis": 5, 86 | "execution_start": 1634673269717, 87 | "source_hash": "17f60c52", 88 | "tags": [] 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "def group_our_row(val):\n", 93 | " if val > .95:\n", 94 | " return \"top\"\n", 95 | " elif val <= .95 and val > .5:\n", 96 | " return \"mid\"\n", 97 | " return \"low\"\n", 98 | "\n", 99 | "df['group'] = df['salary_norm'].apply(group_our_row)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "cell_id": "00005-72c969b5-62b6-4f97-aa81-94dc40d354af", 107 | "deepnote_cell_type": "code", 108 | "deepnote_to_be_reexecuted": false, 109 | "execution_millis": 13, 110 | "execution_start": 1634673275779, 111 | "source_hash": "c085b6ba", 112 | "tags": [] 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "df.head()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | 
"execution_count": null, 122 | "metadata": { 123 | "cell_id": "00006-6279936a-01f8-45c5-a8f5-3202c2f88fc7", 124 | "deepnote_cell_type": "code", 125 | "deepnote_to_be_reexecuted": false, 126 | "execution_millis": 7, 127 | "execution_start": 1634673339360, 128 | "source_hash": "f3b0fec6", 129 | "tags": [] 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "df['group'].value_counts()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "cell_id": "00007-bbc32c6f-2cba-428c-bcfa-3f0da950ba4f", 141 | "deepnote_cell_type": "code", 142 | "deepnote_to_be_reexecuted": false, 143 | "execution_millis": 17, 144 | "execution_start": 1634673389735, 145 | "source_hash": "8d05d690", 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "df.groupby(\"group\")['salary_as_float'].mean()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "cell_id": "00008-53df8c13-34f7-4dec-8f52-693110591029", 158 | "deepnote_cell_type": "code", 159 | "deepnote_to_be_reexecuted": false, 160 | "execution_millis": 13, 161 | "execution_start": 1634673438596, 162 | "source_hash": "c45ddc98", 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "df.groupby(\"group\")['salary_as_float'].mean().apply(lambda x: f\"${x:,.2f}\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "cell_id": "00009-e320a2f4-7264-4506-bdbb-91e61a293ceb", 175 | "deepnote_cell_type": "code", 176 | "deepnote_to_be_reexecuted": false, 177 | "execution_millis": 17, 178 | "execution_start": 1634673460506, 179 | "source_hash": "d18ad3de", 180 | "tags": [] 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "df.groupby(\"group\")['salary_as_float'].sum().apply(lambda x: f\"${x:,.2f}\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "cell_id": "00010-e8b9efa3-22bd-452a-bd30-2b2a290c7c9d", 192 | "deepnote_cell_type": "code", 193 | "deepnote_to_be_reexecuted": false, 194 | "execution_millis": 34, 195 | "execution_start": 1634673501615, 196 | "source_hash": "c085b6ba", 197 | "tags": [] 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "df.head()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "cell_id": "00011-4abbbef9-1ef7-4457-93e3-8ee48fecbd0b", 209 | "deepnote_cell_type": "code", 210 | "deepnote_to_be_reexecuted": false, 211 | "execution_millis": 11, 212 | "execution_start": 1634673632965, 213 | "source_hash": "d6663e40", 214 | "tags": [] 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "mean_group_data = df.groupby(\"group\")['salary_as_float'].mean()\n", 219 | "type(mean_group_data)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "cell_id": "00012-94072016-e962-48ff-8077-2c7a58f14d43", 227 | "deepnote_cell_type": "code", 228 | "deepnote_to_be_reexecuted": false, 229 | "execution_millis": 174, 230 | "execution_start": 1634673740679, 231 | "source_hash": "1e4998d2", 232 | "tags": [] 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "mean_group_data.plot(kind='bar')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "cell_id": "00013-c3058928-6583-4b3c-8cf2-bf4963d552b8", 244 | "deepnote_cell_type": "code", 245 | "tags": [] 246 | }, 247 | "outputs": [], 248 | "source": [] 249 | }, 250 | { 251 | 
"cell_type": "markdown", 252 | "metadata": { 253 | "created_in_deepnote_cell": true, 254 | "deepnote_cell_type": "markdown", 255 | "tags": [] 256 | }, 257 | "source": [ 258 | "\n", 259 | "Created in deepnote.com \n", 260 | "Created in Deepnote" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "deepnote": { 266 | "is_reactive": false 267 | }, 268 | "deepnote_execution_queue": [], 269 | "deepnote_notebook_id": "cf991719-f636-434d-85ce-17792dadcb6e" 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 2 273 | } 274 | -------------------------------------------------------------------------------- /nbs_ref/7 - Clean Real Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-6184a4d0-7989-48ea-a1d7-229bbb45c9c4", 8 | "deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 43, 11 | "execution_start": 1634674284536, 12 | "source_hash": "771ec97", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import pathlib\n", 18 | "import pandas as pd\n", 19 | "import utils\n", 20 | "\n", 21 | "BASE_DIR = pathlib.Path().resolve().parent\n", 22 | "COURSE_DIR = BASE_DIR / \"course\"\n", 23 | "DATASET_DIR = COURSE_DIR / \"datasets\"\n", 24 | "INPUT_PATH = DATASET_DIR / 'nba-historical-salaries.csv' # appendix a" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "cell_id": "00001-b13827b3-ba63-4a27-98cd-75ea460dc229", 32 | "deepnote_cell_type": "code", 33 | "deepnote_to_be_reexecuted": false, 34 | "execution_millis": 45, 35 | "execution_start": 1634674284580, 36 | "source_hash": "cdb9e8e8", 37 | "tags": [] 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "df = pd.read_csv(INPUT_PATH)\n", 42 | "df.head()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "cell_id": "00002-98de7395-c20b-4112-9c50-f0343a582927", 50 | "deepnote_cell_type": "code", 51 | "deepnote_to_be_reexecuted": false, 52 | "execution_millis": 2719, 53 | "execution_start": 1634674463776, 54 | "source_hash": "451b0cc8", 55 | "tags": [] 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "def clean_row(row):\n", 60 | " # this is a pandas series\n", 61 | " cols = ['salary', 'adj_salary']\n", 62 | " for col in cols:\n", 63 | " row[col] = utils.dollar_str_to_float(row[col])\n", 64 | " return row\n", 65 | "\n", 66 | "df_cleaned = df.copy().apply(clean_row, axis=1)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "cell_id": "00003-eb652a22-ff35-4416-bbd0-866a2241418f", 74 | "deepnote_cell_type": "code", 75 | "deepnote_to_be_reexecuted": false, 76 | "execution_millis": 46, 77 | "execution_start": 1634674405676, 78 | "source_hash": "79cfa8a3", 79 | "tags": [] 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "df_cleaned.head()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "cell_id": "00004-cafe7973-4639-45eb-8b62-baa02fb76924", 91 | "deepnote_cell_type": "code", 92 | "deepnote_to_be_reexecuted": false, 93 | "execution_millis": 2, 94 | "execution_start": 1634674498860, 95 | "source_hash": "fa21f23c", 96 | "tags": [] 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "df_cleaned['salary'].dtype" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "cell_id": 
"00005-02c9e2fc-11f1-4403-b922-ab088a21921d", 108 | "deepnote_cell_type": "code", 109 | "deepnote_to_be_reexecuted": false, 110 | "execution_millis": 0, 111 | "execution_start": 1634674907005, 112 | "source_hash": "8a728a42", 113 | "tags": [] 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "player_per_year = df_cleaned.groupby('year-start')['year-end'].value_counts()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "cell_id": "00006-e7a1d785-6db7-41fe-a8d4-599239a49b3d", 125 | "deepnote_cell_type": "code", 126 | "deepnote_to_be_reexecuted": false, 127 | "execution_millis": 332, 128 | "execution_start": 1634674923778, 129 | "source_hash": "7ac234a2", 130 | "tags": [] 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "player_per_year.plot(title='# of Players per Year')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "cell_id": "00007-d3746b2c-d81a-44ee-bfce-4e5d281dc32c", 142 | "deepnote_cell_type": "code", 143 | "deepnote_to_be_reexecuted": false, 144 | "execution_millis": 16, 145 | "execution_start": 1634675071018, 146 | "source_hash": "39dba7f8", 147 | "tags": [] 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "adj_salaries = df_cleaned.groupby('year-start')['adj_salary'].mean()\n", 152 | "adj_salaries" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "cell_id": "00008-3ca443e2-a570-4d3d-ae29-b544ee0051e3", 160 | "deepnote_cell_type": "code", 161 | "deepnote_to_be_reexecuted": false, 162 | "execution_millis": 235, 163 | "execution_start": 1634675106167, 164 | "source_hash": "bec8be5d", 165 | "tags": [] 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "adj_salaries.plot(title='Adj Average Salary over Time')" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "cell_id": "00009-7171f951-b87c-4a04-92b6-21ee1e66530c", 177 | "deepnote_cell_type": "code", 178 | "deepnote_to_be_reexecuted": false, 179 | "execution_millis": 269, 180 | "execution_start": 1634675166491, 181 | "source_hash": "d4097160", 182 | "tags": [] 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "adj_salaries_sum = df_cleaned.groupby('year-start')['adj_salary'].sum()\n", 187 | "adj_salaries_sum.plot()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "cell_id": "00010-a3c81557-18a9-4f76-b9c2-03b1784e10ae", 195 | "deepnote_cell_type": "code", 196 | "tags": [] 197 | }, 198 | "outputs": [], 199 | "source": [] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "created_in_deepnote_cell": true, 205 | "deepnote_cell_type": "markdown", 206 | "tags": [] 207 | }, 208 | "source": [ 209 | "\n", 210 | "Created in deepnote.com \n", 211 | "Created in Deepnote" 212 | ] 213 | } 214 | ], 215 | "metadata": { 216 | "deepnote": { 217 | "is_reactive": false 218 | }, 219 | "deepnote_execution_queue": [], 220 | "deepnote_notebook_id": "07b99e53-6ee8-425b-9587-4e24c7c22fc9" 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /nbs_ref/8 - Merge Datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "cell_id": "00000-b6e94cba-c4cd-42da-8208-cfa28665606a", 8 | 
"deepnote_cell_type": "code", 9 | "deepnote_to_be_reexecuted": false, 10 | "execution_millis": 8, 11 | "execution_start": 1634678578677, 12 | "source_hash": "59e619dd", 13 | "tags": [] 14 | }, 15 | "outputs": [], 16 | "source": [ 17 | "import datetime\n", 18 | "import pathlib\n", 19 | "import pandas as pd\n", 20 | "import utils\n", 21 | "\n", 22 | "BASE_DIR = pathlib.Path().resolve().parent\n", 23 | "COURSE_DIR = BASE_DIR / \"course\"\n", 24 | "DATASET_DIR = COURSE_DIR / \"datasets\"\n", 25 | "SAMPLES_DIR = COURSE_DIR / \"samples\"\n", 26 | "INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'\n", 27 | "INFLATION_DATA_INPUT_PATH = DATASET_DIR / 'inflation-rate.csv' # appendix b" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "cell_id": "00001-1c6dfc81-739b-46f9-b4eb-9b196fd0ea2e", 35 | "deepnote_cell_type": "code", 36 | "deepnote_to_be_reexecuted": false, 37 | "execution_millis": 30, 38 | "execution_start": 1634676577391, 39 | "source_hash": "1249c42", 40 | "tags": [] 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "df = pd.read_csv(INPUT_PATH)\n", 45 | "inflation_df = pd.read_csv(INFLATION_DATA_INPUT_PATH)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "cell_id": "00002-d5211440-03f4-45a1-9fe1-e51e60032257", 53 | "deepnote_cell_type": "code", 54 | "deepnote_to_be_reexecuted": false, 55 | "execution_millis": 3, 56 | "execution_start": 1634676577495, 57 | "source_hash": "1d1d0047", 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "inflation_df.head()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "cell_id": "00003-7be75291-9b75-4f89-8575-395e572cf507", 70 | "deepnote_cell_type": "code", 71 | "deepnote_to_be_reexecuted": false, 72 | "execution_millis": 21, 73 | "execution_start": 1634676577496, 74 | "source_hash": "c085b6ba", 75 | "tags": [] 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "df.head()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "cell_id": "00003-bfa3f414-9661-4507-9161-8254062d455b", 87 | "deepnote_cell_type": "code", 88 | "deepnote_to_be_reexecuted": false, 89 | "execution_millis": 2, 90 | "execution_start": 1634676788998, 91 | "source_hash": "c65297e4", 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "og_salary = df.iloc[0]['salary'] # \n", 97 | "adj_salary = df.iloc[0]['adj_salary']\n", 98 | "year_start =df.iloc[0]['year_start']\n", 99 | "year_end =df.iloc[0]['year_end']" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "cell_id": "00004-0b860d70-f703-4c2b-9a76-ffdbb053e963", 107 | "deepnote_cell_type": "code", 108 | "deepnote_to_be_reexecuted": false, 109 | "execution_millis": 1, 110 | "execution_start": 1634676683383, 111 | "source_hash": "87e09398", 112 | "tags": [] 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "inflation_df.set_index(\"date\", inplace=True)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "cell_id": "00005-94068da7-b63e-47de-bff3-e11e1650af85", 124 | "deepnote_cell_type": "code", 125 | "deepnote_to_be_reexecuted": false, 126 | "execution_millis": 2, 127 | "execution_start": 1634676921931, 128 | "source_hash": "88a8f6df", 129 | "tags": [] 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "multiplier = float(inflation_df[f'{year_start-1}': 
f'{year_end-1}']['multiplier'])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "cell_id": "00006-ab40009d-b5fb-4476-87d2-6ef1d80f7033", 141 | "deepnote_cell_type": "code", 142 | "deepnote_to_be_reexecuted": false, 143 | "execution_millis": 4, 144 | "execution_start": 1634676923367, 145 | "source_hash": "8be8d4a1", 146 | "tags": [] 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "(multiplier * og_salary) - adj_salary" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "cell_id": "00011-073931b9-f5de-42db-8816-1b38ab49b020", 158 | "deepnote_cell_type": "code", 159 | "deepnote_to_be_reexecuted": false, 160 | "execution_millis": 0, 161 | "execution_start": 1634678420331, 162 | "source_hash": "7e1a403f", 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def cal_adj_salary_2(row):\n", 168 | " og_salary = row['salary'] # \n", 169 | " adj_salary = row['adj_salary']\n", 170 | " year_start =row['year_start']\n", 171 | " year_end =row['year_end']\n", 172 | " multiplier = float(inflation_df[f'{year_start-1}': f'{year_end-1}']['multiplier'])\n", 173 | " row['adj_salary_2'] = (multiplier * og_salary)\n", 174 | " return row\n", 175 | "\n", 176 | "# df.apply(cal_adj_salary_2, axis=1)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "cell_id": "00009-27db00c0-6f60-45de-8e91-b5858052beb4", 184 | "deepnote_cell_type": "code", 185 | "deepnote_to_be_reexecuted": false, 186 | "execution_millis": 3, 187 | "execution_start": 1634678445812, 188 | "source_hash": "65afa563", 189 | "tags": [] 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "inflation_df.reset_index(inplace=True, drop=False) " 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "cell_id": "00010-ff3f5c94-531f-4293-8ab8-c06ffc0fe23d", 201 | "deepnote_cell_type": "code", 202 | "deepnote_to_be_reexecuted": false, 203 | "execution_millis": 5, 204 | "execution_start": 1634678455068, 205 | "source_hash": "1d36096", 206 | "tags": [] 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "inflation_df.columns\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "cell_id": "00011-bf991f67-55b0-4e01-99e5-a97d51e63c13", 218 | "deepnote_cell_type": "code", 219 | "deepnote_to_be_reexecuted": false, 220 | "execution_millis": 3, 221 | "execution_start": 1634678503692, 222 | "source_hash": "e7234db2", 223 | "tags": [] 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "df.merge(inflation_df, left_on='year_start', right_on='date')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "cell_id": "00012-948e47fa-6cd4-4576-ad0c-305e8261329d", 235 | "deepnote_cell_type": "code", 236 | "deepnote_to_be_reexecuted": false, 237 | "execution_millis": 2, 238 | "execution_start": 1634678672675, 239 | "source_hash": "ed6039a1", 240 | "tags": [] 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "df['date'].dtype" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "cell_id": "00013-6182c67d-299b-4087-bdf1-cd3b826d2841", 252 | "deepnote_cell_type": "code", 253 | "deepnote_to_be_reexecuted": false, 254 | "execution_millis": 6, 255 | "execution_start": 1634678666293, 256 | "source_hash": "f473fc50", 257 | "tags": [] 258 | }, 259 | "outputs": 
[], 260 | "source": [ 261 | "inflation_df['date'].dtype" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "cell_id": "00014-d4d9e9bc-e347-4f98-8a68-e0313dc428da", 269 | "deepnote_cell_type": "code", 270 | "deepnote_to_be_reexecuted": false, 271 | "execution_millis": 253, 272 | "execution_start": 1634678659184, 273 | "source_hash": "b1530698", 274 | "tags": [] 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "df['date'] = df['year_start'].apply(lambda x: datetime.datetime.strptime(f\"{x}-12-31\", \"%Y-%m-%d\"))\n", 279 | "inflation_df['date'] = inflation_df['date'].apply(lambda x: datetime.datetime.strptime(f\"{x}\", \"%Y-%m-%d\"))\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "cell_id": "00015-1d51e6da-d2f4-469e-812b-46d2df48b9c3", 287 | "deepnote_cell_type": "code", 288 | "deepnote_to_be_reexecuted": false, 289 | "execution_millis": 1, 290 | "execution_start": 1634678723983, 291 | "source_hash": "24afa461", 292 | "tags": [] 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "merged_df = df.merge(inflation_df, left_on='date', right_on='date')" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "cell_id": "00016-265717d2-93cb-49a1-92b4-c7a8ea0957e8", 304 | "deepnote_cell_type": "code", 305 | "deepnote_to_be_reexecuted": false, 306 | "execution_millis": 17, 307 | "execution_start": 1634678732032, 308 | "source_hash": "89beb45", 309 | "tags": [] 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "merged_df.head()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": { 320 | "cell_id": "00017-46d049c9-c2fe-46f6-9080-3307ab2c0c93", 321 | "deepnote_cell_type": "code", 322 | "deepnote_to_be_reexecuted": false, 323 | "execution_millis": 370, 324 | "execution_start": 1634678798480, 325 | "source_hash": "2889b9fe", 326 | "tags": [] 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "merged_df['adj_salary_audit'] = merged_df['salary'] * merged_df['multiplier']" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "cell_id": "00018-6de37f92-4158-4bbe-a84e-2ff627983cc7", 338 | "deepnote_cell_type": "code", 339 | "deepnote_to_be_reexecuted": false, 340 | "execution_millis": 50, 341 | "execution_start": 1634678804259, 342 | "source_hash": "89beb45", 343 | "tags": [] 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "merged_df.head()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "cell_id": "00019-e18192c1-603c-45ee-99f2-e66dfb71c753", 355 | "deepnote_cell_type": "code", 356 | "deepnote_to_be_reexecuted": false, 357 | "execution_millis": 1, 358 | "execution_start": 1634678934679, 359 | "source_hash": "2c2e27e2", 360 | "tags": [] 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "merged_df['delta'] = merged_df['adj_salary'] - merged_df['adj_salary_audit']" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "cell_id": "00019-6cecd6cc-e8c1-45be-8855-347572f58515", 372 | "deepnote_cell_type": "code", 373 | "deepnote_to_be_reexecuted": false, 374 | "execution_millis": 2, 375 | "execution_start": 1634678946003, 376 | "source_hash": "2019aaf6", 377 | "tags": [] 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "merged_df['delta'].sum()" 382 | ] 383 | }, 384 | { 385 | "cell_type": 
"code", 386 | "execution_count": null, 387 | "metadata": { 388 | "cell_id": "00021-acc29beb-a865-42ae-beaf-bbdc07225e6e", 389 | "deepnote_cell_type": "code", 390 | "deepnote_to_be_reexecuted": false, 391 | "execution_millis": 8, 392 | "execution_start": 1634678984535, 393 | "source_hash": "d377b739", 394 | "tags": [] 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "f\"{merged_df['adj_salary'].sum():,.2f}\"" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "cell_id": "00022-b8840192-32e2-43ab-8809-60796273a018", 406 | "deepnote_cell_type": "code", 407 | "deepnote_to_be_reexecuted": false, 408 | "execution_millis": 9, 409 | "execution_start": 1634679013094, 410 | "source_hash": "455f7aa7", 411 | "tags": [] 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "merged_df['adj_salary'].sum() - merged_df['delta'].sum()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "cell_id": "00023-1ee187d6-0d07-4bb0-a760-9edfc993963c", 423 | "deepnote_cell_type": "code", 424 | "tags": [] 425 | }, 426 | "outputs": [], 427 | "source": [] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "created_in_deepnote_cell": true, 433 | "deepnote_cell_type": "markdown", 434 | "tags": [] 435 | }, 436 | "source": [ 437 | "\n", 438 | "Created in deepnote.com \n", 439 | "Created in Deepnote" 440 | ] 441 | } 442 | ], 443 | "metadata": { 444 | "deepnote": { 445 | "is_reactive": false 446 | }, 447 | "deepnote_execution_queue": [], 448 | "deepnote_notebook_id": "7599b9fd-85af-4ebd-a7ee-55058917791d" 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 2 452 | } 453 | -------------------------------------------------------------------------------- /nbs_ref/9 - Using an NBA Stats API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "cell_id": "00000-cb991b88-6ed2-45ac-9f9e-f0a764026d74", 7 | "deepnote_cell_type": "markdown", 8 | "tags": [] 9 | }, 10 | "source": [ 11 | "Let's use the free [balldontlie.io](https://www.balldontlie.io/) API for extracting a new dataset!" 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "cell_id": "00000-3b0814d8-1a58-447d-9b49-aa092239ee41", 19 | "deepnote_cell_type": "code", 20 | "deepnote_to_be_reexecuted": false, 21 | "execution_millis": 88, 22 | "execution_start": 1634749327463, 23 | "source_hash": "9af06d13", 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import requests\n", 29 | "import datetime\n", 30 | "import pathlib\n", 31 | "import pandas as pd\n", 32 | "import time\n", 33 | "import utils\n", 34 | "\n", 35 | "BASE_DIR = pathlib.Path().resolve().parent\n", 36 | "COURSE_DIR = BASE_DIR / \"course\"\n", 37 | "DATASET_DIR = COURSE_DIR / \"datasets\"\n", 38 | "SAMPLES_DIR = COURSE_DIR / \"samples\"\n", 39 | "INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'\n", 40 | "salary_df = pd.read_csv(INPUT_PATH)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "cell_id": "00002-342871ca-eb44-4928-845c-6dd854374ca5", 48 | "deepnote_cell_type": "code", 49 | "deepnote_to_be_reexecuted": false, 50 | "execution_millis": 0, 51 | "execution_start": 1634749327600, 52 | "source_hash": "9811f36f", 53 | "tags": [] 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "players_endpoint = \"https://www.balldontlie.io/api/v1/players?per_page=100&page=0\"\n", 58 | "stats_endpoint = 'https://www.balldontlie.io/api/v1/stats'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "cell_id": "00003-f338b11d-5033-419d-91dc-b49a25aec495", 66 | "deepnote_cell_type": "code", 67 | "deepnote_to_be_reexecuted": false, 68 | "execution_millis": 3760665, 69 | "execution_start": 1634749327600, 70 | "source_hash": "4b520d41", 71 | "tags": [] 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# !curl \"https://www.balldontlie.io/api/v1/players?per_page=100\"" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "cell_id": "00009-740edd60-cad4-4a68-9d27-27303ce274ad", 83 | "deepnote_cell_type": "code", 84 | "deepnote_to_be_reexecuted": false, 85 | "execution_millis": 0, 86 | "execution_start": 1634749327601, 87 | "source_hash": "59b00187", 88 | "tags": [] 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "def get_players_dataset(per_page=100):  # fetch every page of the players endpoint\n", 93 | "    dataset = []\n", 94 | "    base_url = \"https://www.balldontlie.io/api/v1/players\"\n", 95 | "    init_url = f\"{base_url}?per_page={per_page}\"\n", 96 | "    r = requests.get(init_url)\n", 97 | "    if r.status_code not in range(200, 300):\n", 98 | "        return []\n", 99 | "    json_data = r.json()\n", 100 | "    meta_data = json_data['meta']\n", 101 | "    total_pages = int(meta_data.get('total_pages'))\n", 102 | "    for x in range(0, total_pages + 1):\n", 103 | "        time.sleep(0.25)\n", 104 | "        url = f\"{base_url}?per_page={per_page}&page={x}\"\n", 105 | "        r = requests.get(url)\n", 106 | "        if r.status_code not in range(200, 300):\n", 107 | "            print('skipping')\n", 108 | "            continue\n", 109 | "        json_data = r.json()\n", 110 | "        data = json_data['data']\n", 111 | "        # dataset.append(data)\n", 112 | "        dataset += data\n", 113 | "    return dataset" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "cell_id": "00010-1710a7bc-14d4-420b-bf3e-cc9779f71535", 121 | "deepnote_cell_type": "code", 122 | "deepnote_to_be_reexecuted": false, 123 | "execution_millis": 12194, 124 | "execution_start": 1634749327643, 125 | "source_hash": "59be25f8", 126 | "tags": [] 127 | }, 128
| "outputs": [], 129 | "source": [ 130 | "players_dataset = get_players_dataset()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "cell_id": "00011-f7353f2f-db7c-4ec1-962c-644e35454a5c", 138 | "deepnote_cell_type": "code", 139 | "deepnote_to_be_reexecuted": false, 140 | "execution_millis": 87, 141 | "execution_start": 1634749577130, 142 | "source_hash": "27076b9b", 143 | "tags": [] 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "player_df = pd.DataFrame(players_dataset)[['id', 'first_name', 'last_name']]\n", 148 | "player_df['full_name'] = player_df['first_name'] + \" \" + player_df['last_name']\n", 149 | "player_df.drop_duplicates(subset=['id'], inplace=True)\n", 150 | "player_df.head()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "cell_id": "00015-46e44e13-d060-4e26-8eec-5f724f6f6832", 158 | "deepnote_cell_type": "code", 159 | "deepnote_to_be_reexecuted": false, 160 | "execution_millis": 7, 161 | "execution_start": 1634749596218, 162 | "source_hash": "6e624904", 163 | "tags": [] 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "player_df.shape" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "cell_id": "00016-29a87eff-a101-4ff3-bdec-85fae9f9e4ba", 175 | "deepnote_cell_type": "code", 176 | "deepnote_to_be_reexecuted": false, 177 | "execution_millis": 0, 178 | "execution_start": 1634749339938, 179 | "source_hash": "b623e53d", 180 | "tags": [] 181 | }, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "cell_id": "00017-f79da473-1056-44bc-b9d1-eae843bbc5cf", 190 | "deepnote_cell_type": "code", 191 | "deepnote_to_be_reexecuted": false, 192 | "execution_millis": 2530419, 193 | "execution_start": 1634749339939, 194 | "source_hash": "b623e53d", 195 | "tags": [] 196 | }, 197 | "outputs": [], 198 | "source": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "cell_id": "00018-90c65eb6-08a0-48a7-883b-12d8ff089708", 205 | "deepnote_cell_type": "code", 206 | "deepnote_to_be_reexecuted": false, 207 | "execution_millis": 2420384, 208 | "execution_start": 1634749339984, 209 | "source_hash": "9a5e2c99", 210 | "tags": [] 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "def get_stats(player_id=1, postseason=False, per_page=100):  # fetch every stats page for one player\n", 215 | "    dataset = []\n", 216 | "    postseason_param = \"true\" if postseason else \"false\"\n", 217 | "    base_url = f\"https://www.balldontlie.io/api/v1/stats?player_ids[]={player_id}&postseason={postseason_param}\"\n", 218 | "    init_url = f\"{base_url}&per_page={per_page}\"\n", 219 | "    r = requests.get(init_url)\n", 220 | "    if r.status_code not in range(200, 300):\n", 221 | "        return []\n", 222 | "    json_data = r.json()\n", 223 | "    meta_data = json_data['meta']\n", 224 | "    total_pages = int(meta_data.get('total_pages'))\n", 225 | "    for x in range(0, total_pages + 1):\n", 226 | "        time.sleep(0.25)\n", 227 | "        url = f\"{base_url}&per_page={per_page}&page={x}\"\n", 228 | "        r = requests.get(url)\n", 229 | "        if r.status_code not in range(200, 300):\n", 230 | "            print('skipping')\n", 231 | "            continue\n", 232 | "        json_data = r.json()\n", 233 | "        data = json_data['data']\n", 234 | "        # dataset.append(data)\n", 235 | "        dataset += data\n", 236 | "    return dataset" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 |
"metadata": { 243 | "cell_id": "00012-30978274-c083-430f-9461-b91981687a58", 244 | "deepnote_cell_type": "code", 245 | "deepnote_to_be_reexecuted": false, 246 | "execution_millis": 1, 247 | "execution_start": 1634749501803, 248 | "source_hash": "fe3ddbb4", 249 | "tags": [] 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "NESTED_STATS_COLS = ['game', 'team', 'player']\n", 254 | "\n", 255 | "def unpack_nested_dict(row):\n", 256 | "    for col in NESTED_STATS_COLS:\n", 257 | "        col_val = row[col] # row['game']\n", 258 | "        if isinstance(col_val, dict):\n", 259 | "            for key, val in col_val.items():\n", 260 | "                new_col_key = f\"{col}_{key}\"\n", 261 | "                # game_id\n", 262 | "                # game_period\n", 263 | "                # game_status\n", 264 | "                row[new_col_key] = val\n", 265 | "    return row\n", 266 | "\n", 267 | "def get_second_played(val):  # convert a \"MM:SS\"-style string to total seconds\n", 268 | "    h, m, s = 0, 0, 0\n", 269 | "    if val:\n", 270 | "        time_string = val.split(\":\") # split \"H:MM:SS\" / \"MM:SS\" into its parts\n", 271 | "        if len(time_string) == 2:\n", 272 | "            m, s = time_string\n", 273 | "        if len(time_string) == 3:\n", 274 | "            h, m, s = time_string\n", 275 | "        if len(time_string) == 1:\n", 276 | "            m = time_string[0]\n", 277 | "    if f\"{h}\".isdigit():\n", 278 | "        h = int(h)\n", 279 | "    if f\"{m}\".isdigit():\n", 280 | "        m = int(m)\n", 281 | "    if f\"{s}\".isdigit():\n", 282 | "        s = int(s)\n", 283 | "    return datetime.timedelta(hours=h, minutes=m, seconds=s).total_seconds()\n", 284 | "\n", 285 | "def get_stats_df(stats_dataset):\n", 286 | "    if len(stats_dataset) == 0:\n", 287 | "        return pd.DataFrame()\n", 288 | "    df = pd.DataFrame(stats_dataset)\n", 289 | "    df = df.apply(unpack_nested_dict, axis=1)\n", 290 | "    df.drop(columns=NESTED_STATS_COLS, inplace=True)\n", 291 | "    if \"game_date\" in df.columns:\n", 292 | "        df['date'] = pd.to_datetime(df['game_date'])\n", 293 | "        df['year'] = df['date'].apply(lambda x: x.year)\n", 294 | "    if \"min\" in df.columns:\n", 295 | "        df['seconds'] = df['min'].apply(get_second_played)\n", 296 | "        df['did_play'] = df['seconds'].apply(lambda x: x > 0)\n", 297 | "    df.drop_duplicates(subset=['id'], inplace=True)\n", 298 | "    return df" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "cell_id": "00012-8825c3ec-c1d8-4867-9b96-e8ecd95dbbe9", 306 | "deepnote_cell_type": "code", 307 | "deepnote_to_be_reexecuted": false, 308 | "execution_millis": 7, 309 | "execution_start": 1634749372390, 310 | "source_hash": "5256c07f", 311 | "tags": [] 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "# player_id = player_df.sample(n=1)['id'].item()\n", 316 | "# player_id" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "cell_id": "00013-f184159c-3491-447d-b854-5ddbba5940a7", 324 | "deepnote_cell_type": "code", 325 | "deepnote_to_be_reexecuted": false, 326 | "execution_millis": 3, 327 | "execution_start": 1634749714102, 328 | "source_hash": "b6aefb90", 329 | "tags": [] 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "name = 'Michael Jordan'\n", 334 | "player = player_df[player_df[\"full_name\"] == name]\n", 335 | "player_id = 0\n", 336 | "\n", 337 | "if not player.empty:\n", 338 | "    player_id = player['id'].item()\n", 339 | "\n", 340 | "player_id" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "cell_id": "00011-662f1c28-2461-48bb-b976-6b9852b5cb7f", 348 | "deepnote_cell_type": "code", 349 | "deepnote_to_be_reexecuted": false, 350 | "execution_millis": 8181, 351 |
"execution_start": 1634749722803, 352 | "source_hash": "a573dd94", 353 | "tags": [] 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "reg_season_stats = get_stats(player_id=player_id, postseason=False)\n", 358 | "post_season_stats = get_stats(player_id=player_id, postseason=True)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "cell_id": "00013-37085edb-190d-4349-90e7-c2f9f2b16c18", 366 | "deepnote_cell_type": "code", 367 | "deepnote_to_be_reexecuted": false, 368 | "execution_millis": 36408, 369 | "execution_start": 1634749731021, 370 | "source_hash": "1b7daed3", 371 | "tags": [] 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "reg_season_df = get_stats_df(reg_season_stats)\n", 376 | "post_season_df = get_stats_df(post_season_stats)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": { 383 | "cell_id": "00014-cb51df0a-dc70-4e78-a818-ec97e7404a54", 384 | "deepnote_cell_type": "code", 385 | "deepnote_to_be_reexecuted": false, 386 | "execution_millis": 204, 387 | "execution_start": 1634749767443, 388 | "source_hash": "4c028ab8", 389 | "tags": [] 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "reg_season_df.head()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "cell_id": "00016-718161f6-0116-4be8-8494-4eb4dea76e80", 401 | "deepnote_cell_type": "code", 402 | "deepnote_to_be_reexecuted": false, 403 | "execution_millis": 10, 404 | "execution_start": 1634749810520, 405 | "source_hash": "c5f00695", 406 | "tags": [] 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "post_season_df.shape" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "cell_id": "00018-681c6d19-666d-4e0c-b4ad-e525d27f260a", 418 | "deepnote_cell_type": "code", 419 | "deepnote_to_be_reexecuted": false, 420 | "execution_millis": 9, 421 | "execution_start": 1634750145075, 422 | "source_hash": "cb04ba6c", 423 | "tags": [] 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "avg_pts_per_year = reg_season_df.groupby('year')['pts'].mean()\n", 428 | "# avg_pts_per_year" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "cell_id": "00019-290df566-fec7-4b24-a063-4d35d0e2240f", 436 | "deepnote_cell_type": "code", 437 | "deepnote_to_be_reexecuted": false, 438 | "execution_millis": 1, 439 | "execution_start": 1634750084879, 440 | "source_hash": "c574989f", 441 | "tags": [] 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "avg_pts_per_year_per_postseason = post_season_df.groupby('year')['pts'].mean()\n", 446 | "# avg_pts_per_year_per_postseason" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "cell_id": "00020-fd615d8f-285f-46d2-8537-e175c3e50bf5", 454 | "deepnote_cell_type": "code", 455 | "deepnote_to_be_reexecuted": false, 456 | "execution_millis": 3, 457 | "execution_start": 1634750134937, 458 | "source_hash": "b623e53d", 459 | "tags": [] 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "player_salary_df = salary_df.copy()[salary_df['player'] == name][['adj_salary', 'year_start']]\n", 464 | "player_salary_df.head(n=20)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "cell_id": "00021-0139f72a-d42f-4cdc-b9aa-3b5b3b10fba9", 472 | "deepnote_cell_type": "code", 473 | "deepnote_to_be_reexecuted": false, 
474 | "execution_millis": 3, 475 | "execution_start": 1634750199585, 476 | "source_hash": "b26c02e0", 477 | "tags": [] 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "mean_df = pd.DataFrame(avg_pts_per_year)\n", 482 | "mean_df.reset_index(drop=False, inplace=True)\n", 483 | "mean_df.head()" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": { 490 | "cell_id": "00022-b1d8c3af-6667-440d-a216-86ad40d5265f", 491 | "deepnote_cell_type": "code", 492 | "deepnote_to_be_reexecuted": false, 493 | "execution_millis": 48, 494 | "execution_start": 1634750305065, 495 | "source_hash": "b1c02bae", 496 | "tags": [] 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "merged_df = mean_df.merge(player_salary_df, left_on='year', right_on='year_start')\n", 501 | "merged_df.drop(columns=['year_start'], inplace=True)\n", 502 | "merged_df['adj_salary_$'] = merged_df['adj_salary'].apply(utils.float_to_dollars)\n", 503 | "merged_df.head(n=100)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": { 510 | "cell_id": "00023-5ddbc582-c278-4080-a907-52110cadf5ea", 511 | "deepnote_cell_type": "code", 512 | "tags": [] 513 | }, 514 | "outputs": [], 515 | "source": [] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": { 520 | "created_in_deepnote_cell": true, 521 | "deepnote_cell_type": "markdown", 522 | "tags": [] 523 | }, 524 | "source": [ 525 | "\n", 526 | "Created in deepnote.com \n", 527 | "Created in Deepnote" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "deepnote": { 533 | "is_reactive": false 534 | }, 535 | "deepnote_execution_queue": [], 536 | "deepnote_notebook_id": "16cecac8-66ad-4d39-8a6c-73c2d96fd581" 537 | }, 538 | "nbformat": 4, 539 | "nbformat_minor": 2 540 | } 541 | -------------------------------------------------------------------------------- /nbs_ref/temp.csv: -------------------------------------------------------------------------------- 1 | ,number,time,added_by 2 | 0,0,2021-10-19 16:21:04.142161,Justin 3 | 1,1,2021-10-19 16:21:04.142165,Justin 4 | 2,2,2021-10-19 16:21:04.142167,Justin 5 | 3,3,2021-10-19 16:21:04.142168,Justin 6 | 4,4,2021-10-19 16:21:04.142169,Justin 7 | 5,5,2021-10-19 16:21:04.142170,Justin 8 | 6,6,2021-10-19 16:21:04.142171,Justin 9 | 7,7,2021-10-19 16:21:04.142173,Justin 10 | 8,8,2021-10-19 16:21:04.142174,Justin 11 | 9,9,2021-10-19 16:21:04.142175,Justin 12 | -------------------------------------------------------------------------------- /nbs_ref/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Below are various simple utility methods we use in many of the notebooks. 
3 | To use them, just: 4 | 5 | import utils 6 | 7 | utils.float_to_dollars(32.00) 8 | 9 | """ 10 | from slugify import slugify 11 | 12 | 13 | def float_to_dollars(value: float) -> str: 14 |     """ 15 |     Take in a float (32.00) and return a dollar-formatted string ("$32.00"). 16 |     """ 17 |     return f"${value:,.2f}" 18 | 19 | 20 | def dollar_str_to_float(value: str) -> float: 21 |     return float(value.replace("$", "").replace(",", "_"))  # float() accepts "_" digit separators (PEP 515) 22 | 23 | 24 | def group_salary(value: float) -> str: 25 |     if value > .95: 26 |         return 'top' 27 |     elif value > .50:  # the elif already guarantees value <= .95 28 |         return 'mid' 29 |     return 'low' 30 | 31 | 32 | def to_snake_case(val): 33 |     # slugify() returns kebab-case ("like-this"); 34 |     # swap the hyphens to get snake_case ("like_this") 35 |     kebab_case = slugify(val) 36 |     return kebab_case.replace('-', '_') 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | pre-commit 3 | nbstripout 4 | matplotlib 5 | numpy 6 | requests 7 | requests-html 8 | openpyxl 9 | python-slugify 10 | -------------------------------------------------------------------------------- /start-here.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "cell_id": "00001-691e21a4-843f-476d-8cbb-9aa691d1d0d8", 7 | "deepnote_cell_type": "markdown", 8 | "tags": [] 9 | }, 10 | "source": [ 11 | "# Welcome\n", 12 | "\n", 13 | "This is an interactive notebook. It allows us to write & run code, include Markdown-based instructions, and create visualizations.\n", 14 | "\n", 15 | "Assuming you have [launched this code on Deepnote](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas), click `Run Notebook` above.\n", 16 | "\n", 17 | "What you'll find in this project is a collection of notebooks, each of which should work in isolation so you can learn Pandas.\n", 18 | "\n", 19 | "Below is a very basic example of how you can write code and let anyone collaborate with you." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "cell_id": "00000-4fa6d049-fec9-4534-a889-f8f1d2535af4", 27 | "deepnote_cell_type": "code", 28 | "deepnote_to_be_reexecuted": false, 29 | "execution_millis": 6838, 30 | "execution_start": 1633809511037, 31 | "output_cleared": false, 32 | "source_hash": "bcce9116", 33 | "tags": [] 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "name = input(\"Type your name then press enter:\")\n", 38 | "print(f\"Nice to meet you {name}. 
Are you ready to get started?\")\n", 39 | "\n", 40 | "while True:\n", 41 | "    number = input(\"Type a number then press enter:\")\n", 42 | "    if not number.isdigit():\n", 43 | "        print(\"Please enter a number\")\n", 44 | "        continue\n", 45 | "    number = int(number)\n", 46 | "    break\n", 47 | "\n", 48 | "print(f\"I enjoy {number} as well.\")\n", 49 | "\n", 50 | "import pandas as pd\n", 51 | "import random\n", 52 | "import datetime\n", 53 | "\n", 54 | "random_number = random.randint(0, number)\n", 55 | "random_data = [{\"number\": x * random_number, \"time\": datetime.datetime.now(), \"added_by\": name} for x in range(0, number)]\n", 56 | "\n", 57 | "df = pd.DataFrame(random_data)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "cell_id": "00002-ce4b4c81-ddce-49be-9e43-742db288db06", 64 | "deepnote_cell_type": "markdown", 65 | "tags": [] 66 | }, 67 | "source": [ 68 | "From your number, we have generated a random Pandas spreadsheet. In Pandas, though, it isn't called a spreadsheet; it's called a DataFrame. Let's take a look at its top 10 rows (if your number was > 10):" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "cell_id": "00004-82f5d27a-b832-4133-b493-2fb4857137da", 76 | "deepnote_cell_type": "code", 77 | "deepnote_to_be_reexecuted": false, 78 | "execution_millis": 23, 79 | "execution_start": 1633809520831, 80 | "output_cleared": false, 81 | "source_hash": "990bc731", 82 | "tags": [] 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "df.head(n=10)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "cell_id": "00004-49f828e9-9888-4d85-b1e0-d85d40201501", 93 | "deepnote_cell_type": "markdown", 94 | "tags": [] 95 | }, 96 | "source": [ 97 | "Above is an example of how this course will work. There are videos associated with each lesson, but the goal is to let you run the code anytime and dig into what is going on.\n", 98 | "\n", 99 | "Are you ready to begin? Open `Try-Pandas/course/1 - Pandas & Datasets.ipynb` to start the course!" 100 | ] 101 | } 102 | ], 103 | "metadata": { 104 | "deepnote": { 105 | "is_reactive": false 106 | }, 107 | "deepnote_execution_queue": [], 108 | "deepnote_notebook_id": "c9ce9944-34a5-4f65-b253-4c8d164ece75", 109 | "kernelspec": { 110 | "display_name": "Python 3 (ipykernel)", 111 | "language": "python", 112 | "name": "python3" 113 | }, 114 | "language_info": { 115 | "codemirror_mode": { 116 | "name": "ipython", 117 | "version": 3 118 | }, 119 | "file_extension": ".py", 120 | "mimetype": "text/x-python", 121 | "name": "python", 122 | "nbconvert_exporter": "python", 123 | "pygments_lexer": "ipython3", 124 | "version": "3.9.7" 125 | } 126 | }, 127 | "nbformat": 4, 128 | "nbformat_minor": 2 129 | } 130 | --------------------------------------------------------------------------------
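A closing note on the audit at the end of `nbs_ref/8 - Merge Datasets.ipynb`: after merging the salary table with the inflation table on `date`, recomputing `salary * multiplier` should reproduce the stored `adj_salary`, so a `delta` column that sums to (near) zero confirms the two adjustment paths agree. A minimal, self-contained sketch of that check with toy numbers (not course data):

```python
import pandas as pd

# Toy salary table; "adj_salary" stands in for the value computed earlier.
salaries = pd.DataFrame({
    "player": ["A", "B"],
    "salary": [1_000_000.0, 2_000_000.0],
    "adj_salary": [1_500_000.0, 2_400_000.0],
    "date": pd.to_datetime(["1995-12-31", "2000-12-31"]),
})

# Toy inflation table keyed on the same year-end dates.
inflation = pd.DataFrame({
    "date": pd.to_datetime(["1995-12-31", "2000-12-31"]),
    "multiplier": [1.5, 1.2],
})

merged = salaries.merge(inflation, on="date")
merged["adj_salary_audit"] = merged["salary"] * merged["multiplier"]
merged["delta"] = merged["adj_salary"] - merged["adj_salary_audit"]
print(merged["delta"].sum())  # 0.0 -> both adjustment paths agree
```

On real data the delta is not expected to be exactly zero; floating-point rounding leaves a small residue, which is why the notebook inspects `merged_df['delta'].sum()` rather than asserting strict equality.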
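`get_players_dataset` and `get_stats` in `nbs_ref/9 - Using an NBA Stats API.ipynb` repeat the same fetch-all-pages loop. A hedged sketch of how that shared pattern could be factored into a single helper; `fetch_paginated` is a hypothetical name, not part of the course code, and it assumes the balldontlie-style `meta.total_pages` response shape shown in the notebook:

```python
import time
import requests

def fetch_paginated(base_url, per_page=100, delay=0.25):
    """Collect every page of a balldontlie-style paginated endpoint."""
    sep = "&" if "?" in base_url else "?"  # the endpoint may already carry params
    r = requests.get(f"{base_url}{sep}per_page={per_page}")
    if r.status_code not in range(200, 300):
        return []
    total_pages = int(r.json()["meta"]["total_pages"])
    dataset = []
    # Mirror the notebook: walk pages 0..total_pages and let the caller
    # drop_duplicates() afterwards, since the first page can repeat.
    for page in range(0, total_pages + 1):
        time.sleep(delay)  # throttle requests to stay polite to the free API
        r = requests.get(f"{base_url}{sep}per_page={per_page}&page={page}")
        if r.status_code not in range(200, 300):
            continue  # skip a failed page rather than abort the whole run
        dataset += r.json()["data"]
    return dataset

# Usage (mirrors the notebook's two calls):
# players = fetch_paginated("https://www.balldontlie.io/api/v1/players")
# stats = fetch_paginated("https://www.balldontlie.io/api/v1/stats?player_ids[]=1")
```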
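One detail in `nbs_ref/utils.py` that is easy to misread: `dollar_str_to_float` replaces commas with underscores because `float()` has accepted `_` as a digit separator in strings since Python 3.6 (PEP 515). A quick check:

```python
# "$1,000,000.50" -> strip "$" -> swap "," for "_" -> "1_000_000.50"
print(float("1_000_000.50"))  # 1000000.5
```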