├── .gitattributes
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── course
│   ├── 1 - Pandas & Datasets.ipynb
│   ├── 2 - Cleaning Data with Python & Pandas.ipynb
│   ├── 3 - Basic Analysis in Pandas DataFrames.ipynb
│   ├── 4 - Cleaning Real Data.ipynb
│   ├── 5 - Merge Datasets.ipynb
│   ├── 6 - NBA Player Details.ipynb
│   ├── appendix
│   │   ├── Appendix A - Scrape & Build NBA Salary Dataset.ipynb
│   │   ├── Appendix B - Inflation Rate Dataset.ipynb
│   │   ├── Appendix C - The NBA API Experiments.ipynb
│   │   ├── Appendix D - NBA Player Detail.ipynb
│   │   └── appendix-b-united-states-inflation-rate.csv
│   ├── datasets
│   │   ├── inflation-rate.csv
│   │   └── nba-historical-salaries.csv
│   ├── example.csv
│   ├── samples
│   │   ├── 1.csv
│   │   ├── 2.csv
│   │   ├── 4-adj-salaries-cumlative-per-year.csv
│   │   ├── 4-player-salaries-cleaned.csv
│   │   ├── 4-player-salaries-per-year.csv
│   │   ├── 5-player-adj-salaries-audit.csv
│   │   └── players
│   │       ├── Caleb Swanigan
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Charlie Ward
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Chris Mihm
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Chris Wilcox
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Darvin Ham
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Devin Harris
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Eric Gordon
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Gary Trent Jr
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Gerald Wilkins
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Jahidi White
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Jason Smith
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Jermaine O'Neal
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Ken Norman
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Kevin Garnett
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Ledell Eackles
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Luke Harangody
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Michael Beasley
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Michael Jordan
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Shaquille O'Neal
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Steve Scheffler
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Toby Bailey
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       ├── Tony Farmer
│   │       │   ├── salary_over_time.png
│   │       │   └── stats.xlsx
│   │       └── Tristan Thompson
│   │           ├── salary_over_time.png
│   │           └── stats.xlsx
│   └── utils.py
├── nbs_ref
│   ├── 1 - DataFrame.ipynb
│   ├── 2 - Import & Export.ipynb
│   ├── 3 - Rename Columns.ipynb
│   ├── 4 - Clean Rows.ipynb
│   ├── 5 - Basic Analysis.ipynb
│   ├── 6 - Grouping & Plots.ipynb
│   ├── 7 - Clean Real Data.ipynb
│   ├── 8 - Merge Datasets.ipynb
│   ├── 9 - Using an NBA Stats API.ipynb
│   ├── example.csv
│   ├── temp.csv
│   └── utils.py
├── requirements.txt
└── start-here.ipynb
/.gitattributes:
--------------------------------------------------------------------------------
1 | docs/** filter= diff=
2 | bin/**
3 | include/**
4 | lib/**
5 |
6 | *.ipynb filter=nbstripout
7 | *.zpln filter=nbstripout
8 | *.ipynb diff=ipynb
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | bin/
3 | etc/
4 | share/
5 | pyvenv.cfg
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 |
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 |
106 | # SageMath parsed files
107 | *.sage.py
108 |
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 |
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 |
122 | # Rope project settings
123 | .ropeproject
124 |
125 | # mkdocs documentation
126 | /site
127 |
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 |
133 | # Pyre type checker
134 | .pyre/
135 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v3.2.0
4 | hooks:
5 | - id: trailing-whitespace
6 | - id: end-of-file-fixer
7 | - id: check-yaml
8 | - repo: https://github.com/kynan/nbstripout
9 | rev: 0.5.0
10 | hooks:
11 | - id: nbstripout
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Coding For Entrepreneurs
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://www.codingforentrepreneurs.com/projects/try-pandas)
2 |
3 | # Try Pandas
4 | [Pandas](https://pandas.pydata.org/) is a great tool for doing analysis on spreadsheets.
5 |
6 | It's easy to say that but let's actually learn why by doing something real.
7 |
8 | We're going to be analyzing [NBA](https://www.nba.com/stats/) data to help understand why Pandas should be a tool in your data science toolkit.
9 |
10 | But more importantly, doing something practical will help you better understand the need for a tool like Pandas.
11 |
12 | To help us work with Pandas in a practical way, we've teamed up with [Deepnote](https://deepnote.com/referral?token=cfe). [Deepnote](https://deepnote.com/referral?token=cfe) is a service that makes it easy to run interactive notebooks (also known as Jupyter Notebooks). These notebooks allow us to run Python & Pandas in a highly visual and highly interactive manner.
13 |
14 | What's better, notebooks, especially on [Deepnote](https://deepnote.com/referral?token=cfe), allow non-technical team members to participate in a code-heavy document (as we'll see).
15 |
16 | To get started, sign up for Deepnote using this [link](https://deepnote.com/referral?token=cfe) (This link will unlock pro features).
17 |
18 | Once you [sign up](https://deepnote.com/referral?token=cfe), you can automagically copy all the code in this repo with the following button:
19 |
20 | [](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas)
21 |
--------------------------------------------------------------------------------
/course/1 - Pandas & Datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "cell_id": "00000-15d3f3d1-7d9d-48b0-9022-0136024f5fa7",
7 | "deepnote_cell_type": "code",
8 | "tags": []
9 | },
10 | "source": [
11 | "# 1 - Pandas & Datasets\n",
12 | "\n",
13 | "Pandas helps us manage datasets and very often using flat files (eg. `csv`, `xlsx`, `tsv`, etc). In this one, we're going to create our first dataset with random data."
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "### Create Random Data\n",
21 | "Below is a simple Python function to generate random data with no external dependencies. "
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import random"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "items = []\n",
40 | "\n",
41 | "random_number = random.randint(0, 50_000)\n",
42 | "\n",
43 | "def float_to_dollars(value):\n",
44 | " # in the future, this will be stored in\n",
45 | " # utils.py in the courses/ directory\n",
46 | " return f\"${value:,.2f}\" \n",
47 | "\n",
48 | "\n",
49 | "for x in range(0, random_number):\n",
50 | " dollars = random.randint(30_000, 50_000_000)\n",
51 | " data = {\n",
52 | " \"Player Name\": f\"Player-{x}\",\n",
53 | " \"Player Salary\": float_to_dollars(dollars)\n",
54 | " }\n",
55 | " items.append(data)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "There's a few questions I want to ask about this data:\n",
63 | "- How do we save this data? How do we load saved data?\n",
64 | "- How do we clean this data?\n",
65 | "- How do we analyze this data?\n",
66 | "\n",
67 | "The answer, of course, is Pandas. So let's see why."
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### Initialize a DataFrame\n",
75 | "A table of data in Pandas is called a DataFrame. At it's core, a Dataframe is just rows and columns. There are many ways to initialize it. Let's use the data from above to start our first one:"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "import pandas as pd\n",
85 | "\n",
86 | "df = pd.DataFrame(items)"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Pandas uses a common [numpy](https://numpy.org/) convention when importing:\n",
94 | "```python\n",
95 | "import pandas as pd\n",
96 | "```\n",
97 | "So in Python projects that use Pandas, you will typically see this import somewhere. You usually won't do `import pandas` or `from pandas import DataFrame`. As with most things in software, there's nothing technically stopping you from doing that; it's just not the common practice.\n",
98 | "\n",
99 | "The variable `df` is very often used for instances of `DataFrame`.\n",
100 | "\n",
101 | "Since a `DataFrame` is a table with columns and rows, you can easily initialize it with a list of data. \n",
102 | "\n",
103 | "Let's take a look at this data:"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "df.head()"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "Tables in Pandas can be massive so we use `df.head()` to get a glimpse of the first 5 rows. Use `df.head(n=20)` to change this value. You can also use `df.tail(n=5)` to see the end of this table."
120 | ]
121 | },
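{
"cell_type": "markdown",
"metadata": {},
"source": [
"Earlier we said there are many ways to initialize a DataFrame. As a minimal sketch, here are two more (all values below are illustrative):\n",
"\n",
"```python\n",
"# from a dict of columns (each list is one column of values)\n",
"df_from_dict = pd.DataFrame({\n",
"    'Player Name': ['Player-0', 'Player-1'],\n",
"    'Player Salary': ['$1,000,000.00', '$2,000,000.00'],\n",
"})\n",
"\n",
"# from a list of rows, naming the columns explicitly\n",
"df_from_rows = pd.DataFrame(\n",
"    [['Player-0', '$1,000,000.00'], ['Player-1', '$2,000,000.00']],\n",
"    columns=['Player Name', 'Player Salary'],\n",
")\n",
"```"
]
},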
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "### Exporting a DataFrame (Writing)\n",
127 | "There are many ways to save DataFrames. You can save to:\n",
128 | "\n",
129 | "- CSV (Comma Separated Values)\n",
130 | "- TSV (Tab Separated Values)\n",
131 | "- Excel (`xlsx`)\n",
132 | "- JSON (JavaScript Object Notion)\n",
133 | "- HDF (HDF5 files)\n",
134 | "- HTML (reading/writing HTML tables `
`)\n",
135 | "- Pickle\n",
136 | "- SQL\n",
137 | "- And much [more](https://pandas.pydata.org/docs/reference/io.html)\n",
138 | "\n",
139 | "Throughout this course we'll use a mixture of storage options but mostly using `csv` files as they are lightweight and easy to use in many situations. \n",
140 | "\n",
141 | "So how do we save this?"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "df.to_csv(\"example.csv\", index=False)"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "Here are a few other ways to export:\n",
158 | "\n",
159 | "\n",
160 | "```python\n",
161 | "#TSV\n",
162 | "df.to_csv('example.tsv', sep='\\t', index=False)\n",
163 | "\n",
164 | "#Excel\n",
165 | "df.to_excel('example.xlsx', sheet_name='example', index=False)\n",
166 | "\n",
167 | "#JSON\n",
168 | "df.to_json('example.json', index=False)\n",
169 | "\n",
170 | "#HDF\n",
171 | "df.to_hdf('example.h5', key='example', index=False)\n",
172 | "\n",
173 | "#HTML: \n",
174 | "\n",
175 | "df.to_html('example.html', index=False)\n",
176 | "\n",
177 | "#Pickle\n",
178 | "df.to_pickle('example.pkl', index=False)\n",
179 | "\n",
180 | "\n",
181 | "#SQL\n",
182 | "from sqlalchemy import create_engine\n",
183 | "engine = create_engine('sqlite://', echo=False)\n",
184 | "df.to_sql('example_table', con=engine, index=True)\n",
185 | "```\n",
186 | "\n",
187 | "Now that we have saved our `example.csv` file, how do we load it in? That's just as simple, and it's usually a `read_` directly in Pandas.\n",
188 | "\n",
189 | "> A quick note. There are many reasons these different file types exist. One of them, especially in dealing with `csv` files, has to do with date type. More on storing data types later."
190 | ]
191 | },
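{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make that note concrete, here is a minimal sketch (the file name is illustrative) of how a `csv` round trip drops type information, since `csv` stores everything as plain text:\n",
"\n",
"```python\n",
"typed_df = pd.DataFrame({'when': pd.to_datetime(['2021-01-01']), 'amount': [1.5]})\n",
"typed_df.to_csv('types-example.csv', index=False)\n",
"\n",
"round_trip = pd.read_csv('types-example.csv')\n",
"print(typed_df.dtypes)   # when: datetime64[ns], amount: float64\n",
"print(round_trip.dtypes) # when: object, amount: float64\n",
"```"
]
},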
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "### Importing Data (Reading)\n",
197 | "\n",
198 | "Importing data is just as easy as exporting data but instead of using a DataFrame class, we use built in methods for reading. First, here are examples:\n",
199 | "\n",
200 | "```python\n",
201 | "#CSV\n",
202 | "df = pd.read_csv('example.csv')\n",
203 | "\n",
204 | "#TSV\n",
205 | "df = pd.read_csv('example.tsv', sep='\\t')\n",
206 | "\n",
207 | "#Excel\n",
208 | "df = pd.read_excel('example.xlsx', sheet_name='example')\n",
209 | "\n",
210 | "#JSON\n",
211 | "df = pd.read_json('example.json')\n",
212 | "\n",
213 | "#HDF\n",
214 | "df = pd.read_hdf('example.h5', key='example')\n",
215 | "\n",
216 | "#HTML\n",
217 | "df = pd.read_html('example.html')\n",
218 | "\n",
219 | "#Pickle\n",
220 | "df = pd.read_pickle('example.pkl')\n",
221 | "\n",
222 | "#SQL\n",
223 | "from sqlalchemy import create_engine\n",
224 | "engine = create_engine('sqlite://')\n",
225 | "df = pd.read_sql('SELECT * from example_table', con=engine)\n",
226 | "```\n"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "new_df = pd.read_csv('example.csv')\n",
236 | "new_df.head()"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {},
242 | "source": [
243 | "Now that we can export and import data, how do we clean it up? "
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# Export to samples dir\n",
253 | "# df.to_csv(\"samples/1.csv\", index=False)"
254 | ]
255 | }
256 | ],
257 | "metadata": {
258 | "deepnote": {
259 | "is_reactive": false
260 | },
261 | "deepnote_execution_queue": [],
262 | "deepnote_notebook_id": "e609e7b1-ff5b-43c7-8bff-b115fd3b7749",
263 | "kernelspec": {
264 | "display_name": "Python 3 (ipykernel)",
265 | "language": "python",
266 | "name": "python3"
267 | },
268 | "language_info": {
269 | "codemirror_mode": {
270 | "name": "ipython",
271 | "version": 3
272 | },
273 | "file_extension": ".py",
274 | "mimetype": "text/x-python",
275 | "name": "python",
276 | "nbconvert_exporter": "python",
277 | "pygments_lexer": "ipython3",
278 | "version": "3.9.7"
279 | }
280 | },
281 | "nbformat": 4,
282 | "nbformat_minor": 2
283 | }
284 |
--------------------------------------------------------------------------------
/course/2 - Cleaning Data with Python & Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "cell_id": "00000-15d3f3d1-7d9d-48b0-9022-0136024f5fa7",
7 | "deepnote_cell_type": "code",
8 | "tags": []
9 | },
10 | "source": [
11 | "# 2 - Cleaning Data with Python & Pandas\n",
12 | "\n",
13 | "\n",
14 | "### Cleaning Data\n",
15 | "\n",
16 | "It's true that we made this data but let's look at it as if we didn't. \n",
17 | "\n",
18 | "The `Player Salary` column has valid values for US Dollars but there's a key issue with them: they're strings (`str`). In this section, we'll convert this data into a `float` data type. \n",
19 | "\n",
20 | "The next issue is the column names. `Player Name` and `Player Salary` work but I would prefer to name them a bit more pythonic like `name` and `salary` respectively. \n",
21 | "\n",
22 | "Let's start by importing our sample data from `1 - Pandas & Datasets`"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import pandas as pd\n",
32 | "import random\n",
33 | "\n",
34 | "# utils.py was created by us\n",
35 | "import utils"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# read sample data\n",
45 | "df = pd.read_csv(\"samples/1.csv\") "
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "> Are you missing the sample data? Be sure to [launched this code on Deepnote](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas)"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "Now, lets __change the column names__:"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "column_name_mapping = {\n",
69 | " \"Player Name\": \"name\",\n",
70 | " \"Player Salary\": \"salary\"\n",
71 | "}\n",
72 | "\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "# we're using the first DataFrame from the top `df`.\n",
82 | "renamed_df = df.rename(columns=column_name_mapping)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "renamed_df.head()"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "The mapping is pretty simple just use a `key`/`value` pair with the way you want to rename it.\n",
99 | "\n",
100 | "Going forward we'll use the convention `df` instead of `renamed_df` so let's make a copy:"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "df = renamed_df.copy()"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "Now, let's convert a Dollar `string` into a `float`:"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "salary_example = \"$30,707,056.00\"\n",
126 | "salary_replacements = salary_example.replace(\"$\", \"\").replace(\",\", \"_\")\n",
127 | "salary_replacements"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "As you see, I replaced commas `,` with underscores `_`. As you may know, you can write large values in Python using underscores to make it more human readable just like `100000000000` becomes `100_000_000_000`"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "salary_example_as_float = float(salary_replacements)\n",
144 | "salary_example_as_float"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "Now that we have a `float` value, we can do further analysis. \n",
152 | "\n",
153 | "But this is just one hard-coded value. How do we do this in our `DataFrame`? There's actually a few ways to do this. We'll do it by adding a column to our dataset.\n",
154 | "\n",
155 | "Before we can make changes to any given column, let's look at all values in any given column"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "df['salary']"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "This shows us:\n",
172 | "- How to grab data via column name (our renamed column of course)\n",
173 | "- An example of Pandas `Series`\n",
174 | "- DataFrame Index Values (based on our data).\n",
175 | "\n",
176 | "All of the above we'll continue to look at in future videos. For now, we need to get *just* the list of values from the column we're getting data from. We'll do that with:"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "list(df['salary'].values)"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "So how would we convert all this data in pure python? Perhaps something like:"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "values = list(df['salary'].values)\n",
202 | "new_values = []\n",
203 | "for val in values:\n",
204 | " new_val = float(val.replace(\"$\", \"\").replace(\",\", \"_\"))\n",
205 | " # you can also use new_val = utils.float_to_dollars(val)\n",
206 | " new_values.append(new_val)\n",
207 | "\n",
208 | "print(new_values)"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "Let's bare something in mind here: the position (or index) of each value should correspond to it's counterpoint in our table values (ie `new_values[312]` should be the same as `values[312]`). Let's test that here: "
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "random_index = random.randint(0, len(values))\n",
225 | "new_value_via_index = new_values[random_index]\n",
226 | "new_value_in_dollars = utils.float_to_dollars(new_value_via_index)\n",
227 | "\n",
228 | "assert new_value_in_dollars == values[random_index]"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "Now, let's add these values as a new column in our DataFrame"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "df['salary_raw_py'] = new_values\n",
245 | "df.head()"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "Now we can add new columns to a Pandas DataFrame using a familiar method (much like adding a new key to a Python dictionary `dict()`). In this case, the length of the values we added matches the length of all the rows in our DataFrame. We know this because the data *came from the dataframe* in the first place.\n",
253 | "\n",
254 | "Let's try to add arbitrary data. "
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "import datetime\n",
264 | "\n",
265 | "this_year = datetime.datetime.now().year # notice this \n",
266 | "df['year'] = this_year"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "df.head()"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "So we now see two properties of a DataFrame that are pretty cool. You can add a new column with 1 value or with matching number of row values.\n",
283 | "\n",
284 | "How about data that was 1/2 the number of rows?"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "rows_length = df.shape[0]\n",
294 | "# column_length = df.shape [1]\n",
295 | "half_rows = int(rows_length * 0.5)\n",
296 | "try:\n",
297 | " df['is_new'] = [True for x in range(0, half_rows)]\n",
298 | "except Exception as e:\n",
299 | " print(e)"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "Now we see that you can:\n",
307 | "- Add a value for all rows from 1 value\n",
308 | "- Add a value fro all rows from a corresponding index value in another list\n",
309 | "\n",
310 | "Everything we did above technically works but it adds a lot of uncessary steps that we can skip thanks to Pandas awesomeness."
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "def dollar_str_to_float(val):\n",
320 | " # in the future, this will be stored in\n",
321 | " # utils.py in the courses/ directory\n",
322 | " return float(val.replace(\"$\", \"\").replace(\",\", \"_\"))\n",
323 | "\n",
324 | "df['salary_as_float'] = df['salary'].apply(dollar_str_to_float)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "Let's break this down:\n",
332 | "- `df['salary_via_apply']` is declaring our new column\n",
333 | "- `df['salary']` is a reference to the values in a pre-existing column on this dataframe\n",
334 | "- `.apply()` will run a function on *all* values in the referenced column. \n",
335 | "- `dollar_str_to_float` is a function that we pass the values to in order to get the correct result.\n",
336 | "- The original `df['salary']` remains unchanged."
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "df.head()"
346 | ]
347 | },
348 | {
349 | "cell_type": "markdown",
350 | "metadata": {},
351 | "source": [
352 | "You can also use a lambda to simplify this further:\n",
353 | "\n",
354 | "```python\n",
355 | "df['salary_via_apply_lambda'] = df['salary'].apply(lambda x: float(x.replace('$', '').replace(',', '')))\n",
356 | "```"
357 | ]
358 | },
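{
"cell_type": "markdown",
"metadata": {},
"source": [
"For string-heavy columns like this one, a vectorized sketch using Pandas' built-in string methods is also worth knowing (the `salary_vectorized` column name is illustrative):\n",
"\n",
"```python\n",
"df['salary_vectorized'] = (\n",
"    df['salary']\n",
"    .str.replace('$', '', regex=False)\n",
"    .str.replace(',', '', regex=False)\n",
"    .astype(float)\n",
")\n",
"```"
]
},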
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "# Export to samples dir\n",
366 | "# df.to_csv(\"samples/2.csv\", index=False)"
367 | ]
368 | }
369 | ],
370 | "metadata": {
371 | "deepnote": {
372 | "is_reactive": false
373 | },
374 | "deepnote_execution_queue": [],
375 | "deepnote_notebook_id": "e609e7b1-ff5b-43c7-8bff-b115fd3b7749",
376 | "kernelspec": {
377 | "display_name": "Python 3 (ipykernel)",
378 | "language": "python",
379 | "name": "python3"
380 | },
381 | "language_info": {
382 | "codemirror_mode": {
383 | "name": "ipython",
384 | "version": 3
385 | },
386 | "file_extension": ".py",
387 | "mimetype": "text/x-python",
388 | "name": "python",
389 | "nbconvert_exporter": "python",
390 | "pygments_lexer": "ipython3",
391 | "version": "3.9.7"
392 | }
393 | },
394 | "nbformat": 4,
395 | "nbformat_minor": 2
396 | }
397 |
--------------------------------------------------------------------------------
/course/3 - Basic Analysis in Pandas DataFrames.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "cell_id": "00000-15d3f3d1-7d9d-48b0-9022-0136024f5fa7",
7 | "deepnote_cell_type": "code",
8 | "tags": []
9 | },
10 | "source": [
11 | "# 3 - Basic Analysis in Pandas DataFrames\n",
12 | "\n",
13 | "At this point, we've only be working with auto-generated data. Analyzing auto-generate data is a lot like running on a treadmill; no matter how hard to you try you'll always be stuck in the same place(s).\n",
14 | "\n",
15 | "I use auto-generated data to show you some of the fundamentals of Pandas. In the next one, we'll go into real data from NBA.com. In this one, we'll cover how to do some basic analysis on your data by using a few built-in methods that Pandas offers.\n",
16 | "\n",
17 | "Let's start by importing our sample data from `2 - Cleaning Data with Python & Pandas`"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import pandas as pd\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "import random\n",
29 | "import numpy as np\n",
30 | "\n",
31 | "# utils.py was created by us\n",
32 | "import utils"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "# read sample data\n",
42 | "df = pd.read_csv(\"samples/2.csv\") "
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "> Are you missing the sample data? Be sure to [launched this code on Deepnote](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas)"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "### Analyze Data\n",
57 | "Let's take a basic look at how we can analyze this data."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "df2 = df.copy()[['name', 'salary_as_float']]"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "The above command does 2 things: copies the dataframe `df` and selects only some of the columns (in this case `name` and `salary_as_float`. Creating a copy means we won't accidentally modify a previous dataframe. \n"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "df2.head()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "df2.shape"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "Using `.shape` will give us the size of our table in the layout (`row_length`, `column_length`). This `.shape` call matches closely with `numpy`. Something we'll have to revisit another time."
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "n_rows = df2.shape[0]\n",
108 | "n_columns = df2.shape[1]"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "Let's do some statistics:"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "avg_salary = df2['salary_as_float'].mean()\n",
125 | "most_common_salary = df2['salary_as_float'].mode() # returns a series\n",
126 | "top_salary = df2['salary_as_float'].max()\n",
127 | "bottom_salary = df2['salary_as_float'].min()\n",
128 | "\n",
129 | "print(\"Average Salary\\t\\t\\t\", utils.float_to_dollars(avg_salary))\n",
130 | "\n",
131 | "print(\"Top Salary\\t\\t\\t\", utils.float_to_dollars(top_salary))\n",
132 | "\n",
133 | "print(\"Bottom Salary\\t\\t\\t\", utils.float_to_dollars(bottom_salary))\n",
134 | "\n",
135 | "print(\"Top 3 Most Common Salaries\\t\", \", \".join(most_common_salary.apply(utils.float_to_dollars).values[:3]))"
136 | ]
137 | },
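{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you just want a quick statistical summary, `describe()` rolls most of these up into one call; a minimal sketch:\n",
"\n",
"```python\n",
"df2['salary_as_float'].describe()  # count, mean, std, min, quartiles, max\n",
"```"
]
},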
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "df3 = df2.copy()\n",
145 | "df3['salary_normalized'] = (df3['salary_as_float'] - df3['salary_as_float'].min()) / (df3['salary_as_float'].max() - df3['salary_as_float'].min())"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "Normalizing data is incredibly common. It convers a set of data (in this case `df['salary_as_float']`) and convers all numbers to be within the range of `0` and `1`. Data normalization is a common pre-processing practice when performing machine learning. We're going to use this normalized data as a way to parse our groups based on percentage values. "
153 | ]
154 | },
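{
"cell_type": "markdown",
"metadata": {},
"source": [
"Written out, the normalization above is the standard min-max formula:\n",
"\n",
"$$x' = \\frac{x - \\min(x)}{\\max(x) - \\min(x)}$$"
]
},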
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "def group_salary(val):\n",
162 | " # in the future, this will be stored in\n",
163 | " # utils.py in the courses/ directory\n",
164 | " if val > .95:\n",
165 | " return 'top'\n",
166 | " elif val < .95 and val > .50:\n",
167 | " return 'mid'\n",
168 | " return 'low'\n",
169 | "\n",
170 | "df3['salary_group'] = df3['salary_normalized'].apply(group_salary)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "df3['salary_group'].value_counts()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "df3.groupby('salary_group')['salary_group'].value_counts().plot(kind='bar', title='People in Group')"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "As you can see from the chart, the distribution of data falls into 3 categories based on arbitrary splitting done in the `group_salary` method above. "
196 | ]
197 | },
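{
"cell_type": "markdown",
"metadata": {},
"source": [
"To look at each group beyond raw counts, a minimal sketch using `groupby` with `describe`:\n",
"\n",
"```python\n",
"df3.groupby('salary_group')['salary_as_float'].describe()\n",
"```"
]
},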
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "sal_group = df3[['salary_as_float','salary_group']].groupby('salary_group').agg([np.sum])\n",
205 | "sal_group.plot(kind = \"bar\", legend = True, title='Average Salary per Group (Normalized)', color='green')"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "This chart shows the sum total salary of each group. With Random data, this is not that interesting because there's nothing to be learned from it. With our NBA dataset, it's this chart may look vastly different."
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": []
221 | }
222 | ],
223 | "metadata": {
224 | "deepnote": {
225 | "is_reactive": false
226 | },
227 | "deepnote_execution_queue": [],
228 | "deepnote_notebook_id": "e609e7b1-ff5b-43c7-8bff-b115fd3b7749",
229 | "kernelspec": {
230 | "display_name": "Python 3 (ipykernel)",
231 | "language": "python",
232 | "name": "python3"
233 | },
234 | "language_info": {
235 | "codemirror_mode": {
236 | "name": "ipython",
237 | "version": 3
238 | },
239 | "file_extension": ".py",
240 | "mimetype": "text/x-python",
241 | "name": "python",
242 | "nbconvert_exporter": "python",
243 | "pygments_lexer": "ipython3",
244 | "version": "3.9.7"
245 | }
246 | },
247 | "nbformat": 4,
248 | "nbformat_minor": 2
249 | }
250 |
--------------------------------------------------------------------------------
/course/4 - Cleaning Real Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "ee709b50",
6 | "metadata": {},
7 | "source": [
8 | "# 4 - Cleaning Real Data\n",
9 | "\n",
10 | "Now it's time for the real stuff. Let's use a load in a real dataset and discover our next steps together.\n",
11 | "\n",
12 | "In *Appendix A - Scrape & Build NBA Salary Dataset*, we create a NBA Player salary dataset by web scraping [hoopshype.com](hoopshype.com). We won't cover web scraping here but you can run that notebook if you want to learn more."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "id": "2ae03fdd",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import datetime\n",
23 | "import pathlib\n",
24 | "import pandas as pd\n",
25 | "\n",
26 | "# import local utils.py\n",
27 | "import utils"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "id": "4b445d4c",
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "PERFROM_SCRAPE = True\n",
38 | "BASE_DIR = pathlib.Path().resolve()\n",
39 | "DATASET_PATH = BASE_DIR / 'datasets'\n",
40 | "INPUT_PATH = DATASET_PATH / 'nba-historical-salaries.csv'\n",
41 | "print(f'Dataset *{INPUT_PATH.name}* exists:', INPUT_PATH.exists())"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "cb544e0f",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "df = pd.read_csv(INPUT_PATH)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "df2d24f7",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "df.head()"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "id": "3cb16ea1",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "df.shape"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "id": "7b947270",
77 | "metadata": {},
78 | "source": [
79 | "The above commands tell us a lot about this data already:\n",
80 | "- Finanical data\n",
81 | "- Columns with dollar strings need to be cleaned (`$`)\n",
82 | "- Rename columns for consistency\n",
83 | "- There's 14,549 records each with 5 data points.\n",
84 | "- `adj_salary` is given data. Does this mean adjusted in today's dollars? Is this accurate?\n",
85 | "\n",
86 | "After this assessment, let's get to work"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "75685eef",
92 | "metadata": {},
93 | "source": [
94 | "### Column consistency\n",
95 | "\n",
96 | "_How you do anything, is how you do everything._\n",
97 | "\n",
98 | "Let's start with the mundane task of committing to a consistent naming convention for our columns across our entire project here. \n",
99 | "\n",
100 | "Before we do, let's see the columns: "
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "id": "2efef30b",
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "df.columns"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "id": "1506e8b2",
116 | "metadata": {},
117 | "source": [
118 | "If you're a seasoned programmer, you will notice the issue. If you're new to programming you might miss it. If you look at each column name you will see a subtle shift in how each column casing is done.\n",
119 | "\n",
120 | "Casing types? Yes, seriously. Here are a few options:\n",
121 | "\n",
122 | "- `PascalCase` -> `ThisIsPascalCase`\n",
123 | "- `camelCase` -> `thisIsCamelCase`\n",
124 | "- `snake_case` -> `this_is_snake_case`\n",
125 | "- `kebab-case` -> `this-is-kebab-case` (aka `slugified-string`, `spinal-case`)\n",
126 | "\n",
127 | "\n",
128 | "Since I use Python and create a lot of web applications, I tend to use `snake_case` or `kebab-case`. If you're a SQL database person, you'd probably use `PascalCase`. If you're from JavaScript, you'd probably use a lot of `camelCase`.\n",
129 | "\n",
130 | "Whatever format you use, just be consistent. Let's rename our columns using `snake_case`:"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "8e10f1d0",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "# %pip install python-slugify"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "id": "03e0ea33",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "from slugify import slugify\n",
151 | "\n",
152 | "def to_snake_case(val):\n",
153 | " # in the future, this will be stored in\n",
154 | " # utils.py in the courses/ directory\n",
155 | " kebab_case = slugify(val)\n",
156 | " return kebab_case.replace('-', '_')"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "id": "913dfbae",
162 | "metadata": {},
163 | "source": [
164 | "I like using the `python-slugify` package to consistently and reliably convert any string into a url-ready slug (aka `kebab-casing`). Once we have a `slug`/`kebab-case` we can just switch out the dashes (`-`) for underscores (`_`)"
165 | ]
166 | },
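{
"cell_type": "markdown",
"id": "a1b2c3d4",
"metadata": {},
"source": [
"A quick sanity check of the helper (assuming `python-slugify` is installed):\n",
"\n",
"```python\n",
"to_snake_case('Player Salary')  # 'player_salary'\n",
"```"
]
},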
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "id": "90a5cbc9",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "old_columns = df.columns\n",
175 | "new_columns = [to_snake_case(x) for x in old_columns]"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "ad1366b0",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "new_column_mapping = dict(zip(old_columns, new_columns))\n",
186 | "new_column_mapping"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "2f12d515",
192 | "metadata": {},
193 | "source": [
194 | "> `zip` is a cool built in python feature that combines two lists of the same length. Once you use `dict` around them, it will turn the left side list into keys and the right side list into values associated by their indices. I remember `zip` like a zipper on your pants, backpacks, luggage, etc; each size has \"teeth\" that corresponds to the other side. "
195 | ]
196 | },
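{
"cell_type": "markdown",
"id": "b2c3d4e5",
"metadata": {},
"source": [
"A tiny standalone sketch of that `zip` + `dict` pattern (the values here are illustrative):\n",
"\n",
"```python\n",
"old = ['Player Name', 'Player Salary']\n",
"new = ['player_name', 'player_salary']\n",
"dict(zip(old, new))  # {'Player Name': 'player_name', 'Player Salary': 'player_salary'}\n",
"```"
]
},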
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "id": "eb4e2e18",
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "df.rename(columns=new_column_mapping, inplace=True)\n",
205 | "df.head()"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "id": "999c2d41",
211 | "metadata": {},
212 | "source": [
213 | "## Cleaning Rows\n",
214 | "\n",
215 | "Now that we've renamed our columns, let's clean up our rows. In `utils.py` we have the function `dollar_str_to_float` which converts dollar strings into floats"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "id": "4fad499a",
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "def clean_row(row_series):\n",
226 | " row_series['salary'] = utils.dollar_str_to_float(row_series['salary'])\n",
227 | " row_series['adj_salary'] = utils.dollar_str_to_float(row_series['adj_salary'])\n",
228 | " return row_series\n",
229 | "\n",
230 | "df_cleaned = df.copy().apply(clean_row, axis=1)\n",
231 | "df_cleaned.head()"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "936bf0ca",
237 | "metadata": {},
238 | "source": [
239 | "I hope that your alarm bells are going off. We never covered `df.apply` we only covered `df['my_col'].apply`. What gives?\n",
240 | "\n",
241 | "When you run `.apply` on an entire DataFrame, you can modify each row as you see fit instead of just an entire column. Another way to write this would be to write:\n",
242 | "\n",
243 | "```python\n",
244 | "df_cleaned = df.copy().apply(clean_row, axis=1)\n",
245 | "df_cleaned['salary'] = df_cleaned['salary'].apply(utils.dollar_str_to_float)\n",
246 | "df_cleaned['adj_salary'] = df_cleaned['adj_salary'].apply(utils.dollar_str_to_float)\n",
247 | "```\n",
248 | "\n",
249 | "And that would be perfectly acceptable. But there's a major difference. And it's this:"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "id": "f8ca56e2",
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "def clean_row_2(row_series):\n",
260 | " dollar_cols = ['salary', 'adj_salary']\n",
261 | " for col in dollar_cols:\n",
262 | " row_series[col] = utils.dollar_str_to_float(row_series[col])\n",
263 | " return row_series\n",
264 | "\n",
265 | "df_cleaned_2 = df.copy().apply(clean_row_2, axis=1)\n",
266 | "df_cleaned_2.head()"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "id": "7c7d6a87",
272 | "metadata": {},
273 | "source": [
274 | "`clean_row_2` gives us a way to reduce complexity by iterating over the columns we want to adjust. "
275 | ]
276 | },
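{
"cell_type": "markdown",
"id": "c3d4e5f6",
"metadata": {},
"source": [
"Row-wise `.apply(..., axis=1)` calls our function once per row, which can be slow on large frames. A minimal column-wise sketch that does the same cleaning (`df_cleaned_3` is an illustrative name):\n",
"\n",
"```python\n",
"df_cleaned_3 = df.copy()\n",
"for col in ['salary', 'adj_salary']:\n",
"    df_cleaned_3[col] = df_cleaned_3[col].apply(utils.dollar_str_to_float)\n",
"```"
]
},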
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "id": "15d48802",
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "df_cleaned_2['adj_salary'].dtype"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "id": "3d326000",
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "players_per_year = df_cleaned_2['year_start'].value_counts(sort=False)\n",
295 | "players_per_year"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "id": "fdeaa9f0",
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "players_per_year.plot(title='Number of Players Per Year')"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "id": "388e551e",
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "adj_salary_df = df_cleaned_2.copy()[['year_start', 'adj_salary']]\n",
316 | "adj_salaries_cumlative = adj_salary_df.groupby(\"year_start\").sum()\n",
317 | "\n",
318 | "adj_salaries_cumlative.plot(title='Adjusted Cumaltive Salaries Over Time')"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "id": "dbe57cb2",
324 | "metadata": {},
325 | "source": [
326 | "Look at this two charts! The second appears to be out-pacing the first.\n",
327 | "\n",
328 | "- upward trend of number of players and salaries\n",
329 | "- What happend in 2019?\n",
330 | "- 2020 seams to be trending towards a massive year for player payments"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "id": "36dc0131",
336 | "metadata": {},
337 | "source": [
338 | "The above dataset leaves me with a lot of questions:\n",
339 | "\n",
340 | "- Are these adjust salary numbers correct (they are from hypehoops.com)\n",
341 | "- Are the per-player salaries going up or just the top 5% of players?\n",
342 | "- How does a players' salary correlate to wins / losses / other stats?\n",
343 | "- How does a team (full of players) and their salaries correlate to wins / losses / other stats?\n",
344 | "- Do the audience metrics support these numbers? (In person, online, etc) In other words, is there really this much economic value being generated?\n",
345 | "\n",
346 | "Answers to these questions will inevitably leads to more questions which hopefully means more and better data analysis.\n"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "id": "4f826a51",
353 | "metadata": {},
354 | "outputs": [],
355 | "source": []
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "id": "e824c752",
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "# Export to samples dir\n",
365 | "\n",
366 | "# df_cleaned_2.to_csv('samples/4-player-salaries-cleaned.csv', index=False)\n",
367 | "\n",
368 | "# players_per_year.rename(columns={\"year_start\": \"players\"}, inplace=True)\n",
369 | "# players_per_year.to_csv('samples/4-player-salaries-per-year.csv', index_label='year', index=True)\n",
370 | "\n",
371 | "# adj_salaries_cumlative['adj_salary_$'] = adj_salaries_cumlative['adj_salary'].apply(utils.float_to_dollars)\n",
372 | "# adj_salaries_cumlative.rename(columns={\"year_start\": \"year\"}, inplace=True)\n",
373 | "# adj_salaries_cumlative.to_csv(\"samples/4-adj-salaries-cumlative-per-year.csv\", index_label=\"year\")"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": null,
379 | "id": "1819325b",
380 | "metadata": {},
381 | "outputs": [],
382 | "source": []
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "id": "abb71da5",
388 | "metadata": {},
389 | "outputs": [],
390 | "source": []
391 | }
392 | ],
393 | "metadata": {
394 | "kernelspec": {
395 | "display_name": "Python 3 (ipykernel)",
396 | "language": "python",
397 | "name": "python3"
398 | },
399 | "language_info": {
400 | "codemirror_mode": {
401 | "name": "ipython",
402 | "version": 3
403 | },
404 | "file_extension": ".py",
405 | "mimetype": "text/x-python",
406 | "name": "python",
407 | "nbconvert_exporter": "python",
408 | "pygments_lexer": "ipython3",
409 | "version": "3.9.7"
410 | }
411 | },
412 | "nbformat": 4,
413 | "nbformat_minor": 5
414 | }
415 |
--------------------------------------------------------------------------------
/course/5 - Merge Datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "7a03add6",
6 | "metadata": {},
7 | "source": [
8 | "# 5 - Merge Datasets\n",
9 | "\n",
10 | "Merging datasets is a very common practice to enrich or validate the values we have. It's easy to do but it's better used when done practically.\n",
11 | "\n",
12 | "\n",
13 | "In *4 - Cleaning Real Data* we used data from [hoopshype.com](hoopshype.com) that included Actual Salaries and Adjusted Salaries. In this one, we're going to create our own Adjusted Salaries using the dataset from *Appendix B - Inflation Rate Dataset*"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "id": "787c61a1",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import datetime\n",
24 | "import pandas as pd\n",
25 | "import pathlib\n",
26 | "import utils"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "id": "520356b4",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "BASE_DIR = pathlib.Path().resolve()\n",
37 | "COURSES_DIR = BASE_DIR / 'course'\n",
38 | "APPENDIX_DIR = BASE_DIR / 'appendix'\n",
39 | "DATASET_PATH = BASE_DIR / 'datasets'\n",
40 | "SAMPLES_DIR = BASE_DIR / 'samples'\n",
41 | "INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'\n",
42 | "INFLATION_DATA_PATH = DATASET_PATH / 'inflation-rate.csv'\n",
43 | "print(f'Dataset *{INPUT_PATH.name}* exists:', INPUT_PATH.exists())"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "id": "1af83091",
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Import Dataset from 4 - Cleaning Real Data\n",
54 | "df = pd.read_csv(INPUT_PATH)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "cbcd06a4",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "df.head()"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "b4104781",
70 | "metadata": {},
71 | "source": [
72 | "Going forward, we are going to be doing a lot of analysis in 2020 dollars (2020 has the most up to date data as of October 2021).\n",
73 | "\n",
74 | "We're going to assume a few things about this scraped data:\n",
75 | "- Player names are correct (`player` column)\n",
76 | "- Salary (`salary` column) listed is their actual salary\n",
77 | "- Start Year is accurate (`year_start` column)\n",
78 | "\n",
79 | "Given these assumptions, we're going to create our own Adjust Salary column to illustrate how to merge data."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "id": "2caa64ee",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "inflation_df = pd.read_csv(INFLATION_DATA_PATH)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "id": "f08375f9",
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "inflation_df.head()"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "id": "31f3176c",
105 | "metadata": {},
106 | "source": [
107 | "*Appendix B - Inflation Rate Dataset* shows exactly where and how the dataset for `inflation_df` is created. "
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "id": "11a310a5",
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "inflation_df.set_index('date', inplace=True)"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "id": "4d423b23",
123 | "metadata": {},
124 | "source": [
125 | "Typically, the DataFrame index is auto-incrementing integers (0, 1, 2, 3, 4, ...) but it can be a time series index (ie based in dates).\n",
126 | "\n",
127 | "Setting our index to a date-like string (ie `YYYY-MM-DD`) will result in time series data.\n",
128 | "\n",
129 | "The nice thing about this is we can take a slice this data in a cool way:"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "id": "2b14d26d",
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "year_start = 2000\n",
140 | "year_end = 2005\n",
141 | "inflation_df[f\"{year_start}\": f\"{year_end}\"]"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "id": "628cdfaa",
147 | "metadata": {},
148 | "source": [
149 | "Now we see a subset of our dataset. You can treat this as a new dataframe if you need or we can use it when enriching our data. We're not going to use this type of slicing in this guide but it is nice to see it in action."
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "id": "009773cf",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "year_start = 2000\n",
160 | "year_end = 2001\n",
161 | "inflation_df[f\"{year_start}\": f\"{year_end}\"]"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "id": "91567b12",
167 | "metadata": {},
168 | "source": [
169 | "This slice should help show us something interesting: for the `year_start` and `year_end` we choose, it has 2 new values that are related both the `inflation_rate_percent` and `multiplier`.\n",
170 | "\n",
171 | "Now we *can* use an apply here to enrich our data:\n",
172 | "```python\n",
173 | "def merge_data_via_lookup(row):\n",
174 | " year_start = row['year_start']\n",
175 | " year_end = row['year_end']\n",
176 | " new_data = inflation_df[f\"{year_start}\": f\"{year_end}\"]\n",
177 | " row['multiplier'] = new_data['multiplier'].values[0]\n",
178 | " return row\n",
179 | " \n",
180 | "df.apply(merge_data_via_lookup, axis=1)\n",
181 | "```\n",
182 | "\n",
183 | "Technically speaking, this would work but it's not efficient and it can lead to confusion. Let's use the built-in `merge` function instead.\n",
184 | "\n",
185 | "Since `year_start` from `df` and the index (ie the `date` column) on `inflation_df` are correlated let's try a merge:"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "id": "8c30a745",
191 | "metadata": {},
192 | "source": [
193 | "First, let's move the date column out of the index in `inflation_df`:"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "id": "ac3a4dc4",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "inflation_df.reset_index(inplace=True, drop=False)\n",
204 | "inflation_df.head()"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "fea042bf",
210 | "metadata": {},
211 | "source": [
212 | "In this case, `reset_index` will preserve the original index (`date`) as a new column because of `drop=False`. "
213 | ]
214 | },
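{
"cell_type": "markdown",
"id": "c3d42e5f",
"metadata": {},
"source": [
"For contrast, a quick sketch of what `drop=True` would have done here (left commented out because it discards the labels):\n",
"```python\n",
"# drop=True would throw the `date` labels away instead of keeping them as a column\n",
"# inflation_df.reset_index(inplace=True, drop=True)\n",
"```"
]
},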
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "id": "648f5664",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "try:\n",
223 | " df.merge(inflation_df, left_on=\"year_start\", right_on=\"date\")\n",
224 | "except Exception as e:\n",
225 | " print(e)"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "id": "00908037",
231 | "metadata": {},
232 | "source": [
233 | "This merge failed because the data types do not match up. `year_start` is an integer and `date` is an object. Let's change that:"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "id": "f7f90a18",
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "# df['date'] = pd.to_datetime(df['year_start'])  # (commented out: bare ints would be parsed as epoch timestamps, not years)\n",
244 | "df['date'] = df['year_start'].apply(lambda x: datetime.datetime.strptime(f\"{x}-12-31\", \"%Y-%m-%d\"))\n",
245 | "df.head()"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "id": "523cd4f1",
251 | "metadata": {},
252 | "source": [
253 | "Above I used `f\"{x}-12-31\"` to match how the `inflation_df` represents the date for the year (as opposed to the start of the year `f\"{x}-01-01\"`)."
254 | ]
255 | },
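{
"cell_type": "markdown",
"id": "d4e53f60",
"metadata": {},
"source": [
"A vectorized sketch of the same conversion, assuming the same end-of-year convention:\n",
"```python\n",
"# build the YYYY-12-31 strings column-wise, then parse them in a single call\n",
"df['date'] = pd.to_datetime(df['year_start'].astype(str) + \"-12-31\")\n",
"```"
]
},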
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "id": "f5c47b51",
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "inflation_df['date'] = inflation_df['date'].apply(lambda x: datetime.datetime.strptime(f\"{x}\", \"%Y-%m-%d\"))\n",
264 | "inflation_df.head()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "id": "19da6a26",
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "print(inflation_df['date'].dtype, df['date'].dtype)"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "id": "628999f9",
280 | "metadata": {},
281 | "source": [
282 | "Now that `inflation_df['date']` and `df['date']` have the same data type, we can use `merge` on them:"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "id": "3db43fb1",
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "merged_df = df.merge(inflation_df, left_on=\"date\", right_on='date')\n",
293 | "merged_df.head()"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "id": "dcf0c5d5",
299 | "metadata": {},
300 | "source": [
301 | "A merge is a fast way to enrich our data based on corresponding values in two dataframes. The reason we do this is simple:"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "04463e5b",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "merged_df['adj_salary_audit'] = merged_df['salary'] * merged_df['multiplier']\n",
312 | "merged_df.head()"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "id": "b33d52fb",
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "merged_df['audit_delta'] = merged_df['adj_salary_audit'] - merged_df['adj_salary']"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "id": "faf29124",
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "total_adjusted = merged_df['adj_salary'].sum()\n",
333 | "total_adjusted_usd = utils.float_to_dollars(total_adjusted)\n",
334 | "total_adjusted_audit = merged_df['adj_salary_audit'].sum()\n",
335 | "total_adjusted_audit_usd = utils.float_to_dollars(total_adjusted_audit)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "id": "9ab1fc65",
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "audit_delta_sum = utils.float_to_dollars(merged_df['audit_delta'].sum())"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "id": "ecb8514a",
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "difference_perc = ((total_adjusted_audit - total_adjusted) / total_adjusted_audit) * 100\n",
356 | "print(f\"Difference between our internal audit and their numbers is {difference_perc:.4f}% which totals to {audit_delta_sum}\")"
357 | ]
358 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "id": "06bd25ee",
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "print(\"Total Adjusted Salary (usd)\", total_adjusted_usd)\n",
377 | "print(\"Total Adjusted Salary Audit (usd)\", total_adjusted_audit_usd)\n",
378 | "\n",
379 | "print(\"Delta Total\", audit_delta_sum)\n",
380 | "print(f\"Delta Percent Difference {difference_perc:.4f}%\")"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "id": "c5cb52b3",
386 | "metadata": {},
387 | "source": [
388 | "This shows us that our adjusted salary number is about $410 million higher, but that's under a 1% difference.\n",
389 | "\n",
390 | "Since this data is good enough for future pandas lessons, we're not going to dig any deeper into improving the adjusted salaries. But there are a few questions that come to mind on how we could:\n",
391 | "\n",
392 | "- With this data, we used `year_start` and not `year_end` for our inflation rate multiplier. Perhaps `year_end` would yield closer results.\n",
393 | "- The source datasets might *both* be incorrect; how would we check for that?\n",
394 | "- Does a difference of over `$410 million` skew future results, given the total sum is over `$68 billion`?"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": null,
400 | "id": "47ed0b72",
401 | "metadata": {},
402 | "outputs": [],
403 | "source": [
404 | "# Export to samples dir\n",
405 | "\n",
406 | "# merged_df.to_csv('samples/5-player-adj-salaries-audit.csv', index=False)"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "id": "baf1e7ee",
413 | "metadata": {},
414 | "outputs": [],
415 | "source": []
416 | }
417 | ],
418 | "metadata": {
419 | "kernelspec": {
420 | "display_name": "Python 3 (ipykernel)",
421 | "language": "python",
422 | "name": "python3"
423 | },
424 | "language_info": {
425 | "codemirror_mode": {
426 | "name": "ipython",
427 | "version": 3
428 | },
429 | "file_extension": ".py",
430 | "mimetype": "text/x-python",
431 | "name": "python",
432 | "nbconvert_exporter": "python",
433 | "pygments_lexer": "ipython3",
434 | "version": "3.9.7"
435 | }
436 | },
437 | "nbformat": 4,
438 | "nbformat_minor": 5
439 | }
440 |
--------------------------------------------------------------------------------
/course/appendix/Appendix A - Scrape & Build NBA Salary Dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "08553e5b",
6 | "metadata": {},
7 | "source": [
8 | "# Appendix A - Scrape & Build NBA Salary Dataset\n",
9 | "The goal of this notebook is to prepare our course with a pre-existing dataset. The data cleaning is done in the course itself; this is meant only to create the dataset. "
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "id": "d279ba03",
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# %pip install requests requests-html matplotlib pandas"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "id": "5624667e",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import datetime\n",
30 | "from decimal import Decimal\n",
31 | "import matplotlib.pyplot as plt\n",
32 | "import requests\n",
33 | "from requests_html import HTML\n",
34 | "import pandas as pd\n",
35 | "import pathlib\n",
36 | "import time"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "id": "2a3e439e",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "PERFORM_SCRAPE = True\n",
47 | "BASE_DIR = pathlib.Path().resolve().parent.parent\n",
48 | "COURSES_DIR = BASE_DIR / 'course'\n",
49 | "DATASET_PATH = COURSES_DIR / 'datasets'\n",
50 | "OUTPUT_PATH = DATASET_PATH / 'nba-historical-salaries.csv'"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 6,
56 | "id": "c973adc3",
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "True"
63 | ]
64 | },
65 | "execution_count": 6,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "COURSES_DIR.exists()"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "id": "1554825f",
77 | "metadata": {},
78 | "source": [
79 | "For this dataset, we use `hoopshype.com`'s record of player salaries."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "id": "8cc7326e",
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "base_url = 'https://hoopshype.com/salaries/players/'"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "id": "7d701732",
95 | "metadata": {},
96 | "source": [
97 | "`hoopshype.com`'s salary data starts in the 1990-1991 season."
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 4,
103 | "id": "be7e83d9",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "year_start = 1990"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "id": "d48e19f3",
113 | "metadata": {},
114 | "source": [
115 | "End scraping at last year's season (the current season might not be available yet)."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 5,
121 | "id": "25aac766",
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "text/plain": [
127 | "2020"
128 | ]
129 | },
130 | "execution_count": 5,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "year_end = datetime.datetime.now().year - 1\n",
137 | "year_end"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 7,
143 | "id": "64accc20",
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "1990 https://hoopshype.com/salaries/players/1990-1991/\n",
151 | "1991 https://hoopshype.com/salaries/players/1991-1992/\n",
152 | "1992 https://hoopshype.com/salaries/players/1992-1993/\n",
153 | "1993 https://hoopshype.com/salaries/players/1993-1994/\n",
154 | "1994 https://hoopshype.com/salaries/players/1994-1995/\n",
155 | "1995 https://hoopshype.com/salaries/players/1995-1996/\n",
156 | "1996 https://hoopshype.com/salaries/players/1996-1997/\n",
157 | "1997 https://hoopshype.com/salaries/players/1997-1998/\n",
158 | "1998 https://hoopshype.com/salaries/players/1998-1999/\n",
159 | "1999 https://hoopshype.com/salaries/players/1999-2000/\n",
160 | "2000 https://hoopshype.com/salaries/players/2000-2001/\n",
161 | "2001 https://hoopshype.com/salaries/players/2001-2002/\n",
162 | "2002 https://hoopshype.com/salaries/players/2002-2003/\n",
163 | "2003 https://hoopshype.com/salaries/players/2003-2004/\n",
164 | "2004 https://hoopshype.com/salaries/players/2004-2005/\n",
165 | "2005 https://hoopshype.com/salaries/players/2005-2006/\n",
166 | "2006 https://hoopshype.com/salaries/players/2006-2007/\n",
167 | "2007 https://hoopshype.com/salaries/players/2007-2008/\n",
168 | "2008 https://hoopshype.com/salaries/players/2008-2009/\n",
169 | "2009 https://hoopshype.com/salaries/players/2009-2010/\n",
170 | "2010 https://hoopshype.com/salaries/players/2010-2011/\n",
171 | "2011 https://hoopshype.com/salaries/players/2011-2012/\n",
172 | "2012 https://hoopshype.com/salaries/players/2012-2013/\n",
173 | "2013 https://hoopshype.com/salaries/players/2013-2014/\n",
174 | "2014 https://hoopshype.com/salaries/players/2014-2015/\n",
175 | "2015 https://hoopshype.com/salaries/players/2015-2016/\n",
176 | "2016 https://hoopshype.com/salaries/players/2016-2017/\n",
177 | "2017 https://hoopshype.com/salaries/players/2017-2018/\n",
178 | "2018 https://hoopshype.com/salaries/players/2018-2019/\n",
179 | "2019 https://hoopshype.com/salaries/players/2019-2020/\n",
180 | "2020 https://hoopshype.com/salaries/players/2020-2021/\n"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "dfs = []\n",
186 | "if PERFORM_SCRAPE:\n",
187 | " for year in range(year_start, year_end+1):\n",
188 | " # NBA season spans 2 different calendar years\n",
189 | " year_range = f\"{year}-{year+1}\"\n",
190 | " # the lookup salary url is based on the above range\n",
191 | " url = f\"{base_url}{year_range}/\"\n",
192 | " # print year and url for manual review\n",
193 | " print(year, url)\n",
194 | " # perform lookup\n",
195 | " r = requests.get(url)\n",
196 | " # Convert response html text as a parsable object\n",
197 | " html = HTML(html=r.text)\n",
198 | "        # Find the first data table; it contains the player salaries\n",
199 | " table = html.find('table', first=True)\n",
200 | " # table_data list holder\n",
201 | " table_data = []\n",
202 | " # iterate the table element and append all column values in each row\n",
203 | " for el in table.element.getchildren():\n",
204 | " for tr in el.getchildren():\n",
205 | " row_data = []\n",
206 | " for col in tr.getchildren():\n",
207 | " row_data.append(col.text_content().strip())\n",
208 | " table_data.append(row_data)\n",
209 | " # create the initial dataframe\n",
210 | " init_df = pd.DataFrame(table_data)\n",
211 | " # use the first row as the header\n",
212 | " new_header = init_df.iloc[0]\n",
213 | " # use everything after the first row as our dataset\n",
214 | " init_df = init_df[1:]\n",
215 | " # update header\n",
216 | " init_df.columns = new_header\n",
217 | "\n",
218 | "        # attempt to rename columns, if they're available;\n",
219 | "        # otherwise, move to the next year lookup\n",
220 | "        try:\n",
221 | "            renamed_cols = {\n",
222 | "                \"Player\": 'player',\n",
223 | "                f\"{new_header[2]}\": \"salary\",\n",
224 | "                f\"{new_header[3]}\": \"adj_salary\"\n",
225 | "            }\n",
226 | "            init_df = init_df.rename(columns=renamed_cols)\n",
227 | "        except Exception:\n",
228 | "            continue\n",
229 | "\n",
230 | "        # keep only the player and salary columns\n",
231 | "        try:\n",
232 | "            df = init_df.copy()[['player', 'salary', 'adj_salary']]\n",
233 | "        except Exception:\n",
234 | "            continue\n",
235 | " # update dataset with year values \n",
236 | " df['year-start'] = year\n",
237 | " df['year-end'] = year + 1\n",
238 | " # append this dataset to our group of datasets\n",
239 | " dfs.append(df)\n",
240 | " # slow down lookups to ensure our scraping doesn't overload\n",
241 | " # hoopshype.com\n",
242 | " time.sleep(1.2)"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "id": "f86901e5",
248 | "metadata": {},
249 | "source": [
250 | "Convert our list of dataframes (ie season salaries) into our entire dataset via pandas concat."
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 12,
256 | "id": "e5417732",
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "data": {
261 | "text/plain": [
262 | "(14549, 5)"
263 | ]
264 | },
265 | "execution_count": 12,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "dataset_df = pd.concat(dfs) #[['player', 'year-start', 'year-end', 'salary', 'adj_salary']]\n",
272 | "dataset_df.reset_index(drop=True, inplace=True)\n",
273 | "dataset_df.shape"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "id": "455f44e8",
279 | "metadata": {},
280 | "source": [
281 | "Store the file in our course datasets directory."
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 14,
287 | "id": "262b15f4",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "dataset_df.to_csv(OUTPUT_PATH, index=False)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "id": "5b5697c6",
298 | "metadata": {},
299 | "outputs": [],
300 | "source": []
301 | }
302 | ],
303 | "metadata": {
304 | "kernelspec": {
305 | "display_name": "Python 3 (ipykernel)",
306 | "language": "python",
307 | "name": "python3"
308 | },
309 | "language_info": {
310 | "codemirror_mode": {
311 | "name": "ipython",
312 | "version": 3
313 | },
314 | "file_extension": ".py",
315 | "mimetype": "text/x-python",
316 | "name": "python",
317 | "nbconvert_exporter": "python",
318 | "pygments_lexer": "ipython3",
319 | "version": "3.9.7"
320 | }
321 | },
322 | "nbformat": 4,
323 | "nbformat_minor": 5
324 | }
325 |
--------------------------------------------------------------------------------
/course/appendix/Appendix C - The NBA API Experiments.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "0214eb00",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "# %pip install nba_api"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "eef82f84",
16 | "metadata": {},
17 | "source": [
18 | "## NBA API Experiments\n",
19 | "While I was experimenting with the `nba_api` I created this notebook. It's a bit of a mixed bag of tests, but certainly a worthy inclusion of things you might try yourself.\n",
20 | "\n",
21 | "There is no real order in this notebook -- feel free to submit a [pull request](https://github.com/codingforentrepreneurs/Try-Pandas/pulls) if you find ways to improve it.\n",
22 | "\n",
23 | "\n",
24 | "#### _NBA Shot Chart explainer_\n",
25 | "An interesting project (perhaps a future video?):\n",
26 | "- https://www.youtube.com/watch?v=a3u-3gEYvxM\n",
27 | "- https://github.com/hkair/nba-shotcharts"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "id": "e40478dd",
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import pandas as pd\n",
38 | "import pathlib"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "id": "39fe9277",
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "from nba_api.stats.endpoints import commonplayerinfo, playercareerstats\n",
49 | "from nba_api.stats.static import players"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "9da954c0",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "BASE_DIR = pathlib.Path().resolve().parent\n",
60 | "SAMPLES_DIR = BASE_DIR / \"samples\"\n",
61 | "salary_df = pd.read_csv(SAMPLES_DIR / '5-player-adj-salaries-audit.csv')"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "id": "7b5f24e0",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "def split_player_name(row):\n",
72 | " name = row['player']\n",
73 | " name_list = name.split()\n",
74 | " row['first_name'] = None\n",
75 | " row['last_name'] = None\n",
76 | " if len(name_list) == 2:\n",
77 | " row['first_name'] = name_list[0]\n",
78 | " row['last_name'] = name_list[1]\n",
79 | " if len(name_list) > 2 :\n",
80 | " row['first_name'] = \" \".join(name_list[:-1])\n",
81 | " row['last_name'] = name_list[-1]\n",
82 | " return row\n",
83 | "\n",
84 | "salary_df = salary_df.apply(split_player_name, axis=1)\n",
85 | "# salary_df.sort_values(\"player\", inplace=True)\n",
86 | "salary_df.head()"
87 | ]
88 | },
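{
"cell_type": "markdown",
"id": "e5f64071",
"metadata": {},
"source": [
"A rough vectorized sketch of the same split (assumption: everything before the final space is the first name; single-word names come out slightly differently than in the `apply` version):\n",
"```python\n",
"# split on the last space only; expand=True returns two columns\n",
"parts = salary_df['player'].str.rsplit(\" \", n=1, expand=True)\n",
"salary_df['first_name'] = parts[0]\n",
"salary_df['last_name'] = parts[1]\n",
"```"
]
},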
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "id": "ad2c98dc",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "career = playercareerstats.PlayerCareerStats(player_id=3)\n",
97 | "career"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "3eb2925f",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "def season_id_to_season_end(val):\n",
108 | "    season_start, season_end = val.split(\"-\")  # SEASON_ID looks like \"1996-97\"\n",
109 | "    if season_start.startswith(\"1\") and season_end != \"00\":  # guard the 1999-00 rollover\n",
110 | "        season_end = f\"19{season_end}\"\n",
111 | "    else:\n",
112 | "        season_end = f\"20{season_end}\"\n",
113 | "    return season_end\n",
114 | "\n",
115 | "# Charles Barkley: player_id = 3\n",
116 | "def get_season_data(player_id=1):\n",
117 | " career = playercareerstats.PlayerCareerStats(player_id=player_id)\n",
118 | " player_df = career.get_data_frames()[0]\n",
119 | " player_df['season_start'] = player_df['SEASON_ID'].apply(lambda x: x.split(\"-\")[0])\n",
120 | " player_df['season_end'] = player_df['SEASON_ID'].apply(season_id_to_season_end)\n",
121 | " # player_df_final = player_df.copy()[['PLAYER_ID', 'TEAM_ABBREVIATION', 'season_start', 'season_end']]\n",
122 | " return player_df"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "id": "360d220f",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "player_ = get_season_data(player_id=3)\n",
133 | "player_.head()"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "4ed52b9f",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "nba_players = players.get_players()\n",
144 | "\n",
145 | "nba_players_df = pd.DataFrame(nba_players)\n",
146 | "nba_players_df.head()"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "id": "6fedf048",
153 | "metadata": {},
154 | "outputs": [],
155 | "source": []
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "id": "95d97685",
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# this is not efficient; perhaps there's a better method to enrich all the data.\n",
165 | "\n",
166 | "# all_player_teams = []\n",
167 | "# for p_id in nba_players_df['id'].values[:10]:\n",
168 | "# all_player_teams.append(get_season_data(player_id=p_id))\n",
169 | "\n",
170 | "# all_player_teams_df = pd.concat(all_player_teams)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "id": "541a07cb",
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "id": "42f075f6",
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "salary_df.merge(nba_players_df, left_on=\"player\", right_on=\"full_name\")"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "id": "e404bfc0",
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "from nba_api.stats.static import teams\n",
199 | "\n",
200 | "nba_teams = teams.get_teams()\n",
201 | "# Select the dictionary for the Celtics, which contains their team ID\n",
202 | "celtics = [team for team in nba_teams if team['abbreviation'] == 'BOS'][0]\n",
203 | "celtics_id = celtics['id']"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "id": "4aed8df9",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "from nba_api.stats.endpoints import leaguegamefinder\n",
214 | "\n",
215 | "# Query for games where the Celtics were playing\n",
216 | "gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=celtics_id)\n",
217 | "# The first DataFrame of those returned is what we want.\n",
218 | "games = gamefinder.get_data_frames()[0]\n",
219 | "games.head()"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "7770f675",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": []
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "id": "250ec48a",
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "player_info = commonplayerinfo.CommonPlayerInfo(player_id=51)\n",
238 | "player_info_df = pd.concat(player_info.get_data_frames()[:1])\n",
239 | "player_info_df.head()"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "b8476bc2",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "from nba_api.stats.endpoints import playercareerstats\n",
250 | "# Anthony Davis\n",
251 | "career = playercareerstats.PlayerCareerStats(player_id=51)\n",
252 | "player_df = career.get_data_frames()[0]\n",
253 | "player_df['season_start'] = player_df['SEASON_ID'].apply(lambda x: x.split(\"-\")[0])\n",
254 | "\n",
255 | "def season_end(val):\n",
256 | " season_start, season_end = val.split(\"-\")\n",
257 | " if f\"{season_end}\".startswith(\"9\"):\n",
258 | " season_end = f\"19{season_end}\"\n",
259 | " else:\n",
260 | " season_end = f\"20{season_end}\"\n",
261 | " return season_end\n",
262 | "\n",
263 | "player_df['season_end'] = player_df['SEASON_ID'].apply(season_end)\n",
264 | "player_df_final = player_df.copy()[['PLAYER_ID', 'TEAM_ABBREVIATION', 'season_start', 'season_end']]\n",
265 | "player_df_final.tail(n=100)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "id": "13b8a211",
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "player_df.columns"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "id": "c14ebaac",
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "player_games = player_df.copy()[['SEASON_ID', 'GAME_ID', \"TEAM_ID\", 'TEAM_NAME', \"WL\", \"MIN\", \"PTS\"]]\n",
286 | "player_games.head()"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "id": "c7e5b1e6",
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "players = players.get_players()"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "id": "07a4cd9c",
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "players_df = pd.DataFrame(players)"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "id": "04dc2b72",
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "players_df.head()"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "id": "57cd0d44",
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "player_info.common_player_info.get_data_frame()"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "id": "8fcf9929",
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "# def add_common_info(row):\n",
339 | "# player_id = row['id']\n",
340 | "# player_info = commonplayerinfo.CommonPlayerInfo(player_id=player_id)\n",
341 | "# print(player_info)\n",
342 | "# return row\n",
343 | "\n",
344 | "# players_df.apply(add_common_info , axis=1)\n",
345 | "# players_df.head()"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "id": "79aff263",
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "from nba_api.stats.static import teams\n",
356 | "\n",
357 | "\n",
358 | "teams = teams.get_teams()"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "id": "d0cdec29",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "teams_df = pd.DataFrame(teams)\n",
369 | "teams_df.head()"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "id": "730be027",
376 | "metadata": {},
377 | "outputs": [],
378 | "source": []
379 | }
380 | ],
381 | "metadata": {
382 | "kernelspec": {
383 | "display_name": "Python 3 (ipykernel)",
384 | "language": "python",
385 | "name": "python3"
386 | },
387 | "language_info": {
388 | "codemirror_mode": {
389 | "name": "ipython",
390 | "version": 3
391 | },
392 | "file_extension": ".py",
393 | "mimetype": "text/x-python",
394 | "name": "python",
395 | "nbconvert_exporter": "python",
396 | "pygments_lexer": "ipython3",
397 | "version": "3.9.7"
398 | }
399 | },
400 | "nbformat": 4,
401 | "nbformat_minor": 5
402 | }
403 |
--------------------------------------------------------------------------------
/course/appendix/Appendix D - NBA Player Detail.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "62e34320",
6 | "metadata": {},
7 | "source": [
8 | "# 6 - NBA Player Detail\n",
9 | "\n",
10 | "Now we're going to lean heavily on the [nba_api](https://github.com/swar/nba_api) library, as it's proven to be a great API for extracting data from https://stats.nba.com."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "id": "c4ac00b3",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# %pip install nba_api"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "f79bb047",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import utils"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "id": "a6e59df0",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import pandas as pd\n",
41 | "import pathlib"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "e4f6c115",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "from nba_api.stats.endpoints import commonplayerinfo, playercareerstats\n",
52 | "from nba_api.stats.static import players"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "id": "a462b5bd",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "BASE_DIR = pathlib.Path().resolve()\n",
63 | "SAMPLES_DIR = BASE_DIR / 'samples'\n",
64 | "SAMPLE_PLAYERS_DIR = SAMPLES_DIR / 'players'\n",
65 | "SAMPLE_PLAYERS_DIR.mkdir(exist_ok=True, parents=True)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "c54afdfd",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "salary_df = pd.read_csv('samples/5-player-adj-salaries-audit.csv')\n",
76 | "salary_df.columns = [f\"{x}\".upper() for x in salary_df.columns]"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "id": "bf3a739f",
82 | "metadata": {},
83 | "source": [
84 | "I converted our column names to uppercase to match the column naming in the `nba_api` library."
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "id": "33ec3ad5",
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "random_row = salary_df.sample(n=1)\n",
95 | "name = random_row['PLAYER'].item() # .item() will get the value\n",
96 | "random_row"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "id": "47bb9f3d",
102 | "metadata": {},
103 | "source": [
104 | "Using `.sample(n=1)` will return a random sample of our data. This sample can be as large as you'd like but I chose to return `1` row (`n=1`) to ultimately get `1` player's name for use in the API."
105 | ]
106 | },
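{
"cell_type": "markdown",
"id": "f6075182",
"metadata": {},
"source": [
"If you want the same \"random\" row on every run, `sample` also accepts a seed; a quick reproducibility sketch (the `42` is arbitrary):\n",
"```python\n",
"salary_df.sample(n=1, random_state=42)\n",
"```"
]
},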
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "id": "90c49f0d",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "# name = 'Michael Jordan'\n",
115 | "name"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "d88d5d14",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "player_results = players.find_players_by_full_name(name) \n",
126 | "player_df = pd.DataFrame(player_results)\n",
127 | "player_df.head()"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "id": "2afa3bb1",
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "player_df"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "id": "c532bdbc",
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "player_id = player_df.iloc[0]['id'].item()\n",
148 | "# player_id = player_df.loc[0]['id'].item()"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "id": "07f6e6ac",
154 | "metadata": {},
155 | "source": [
156 | "`.iloc` is much like using an index value in a standard Python list. `iloc[23]` will yield the 24th element in the DataFrame. `iloc[0]` will return the 1st element at the 0th position. `loc[0]` *may* yield the same result if the index has not been shuffled."
157 | ]
158 | },
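{
"cell_type": "markdown",
"id": "07186293",
"metadata": {},
"source": [
"A tiny throwaway example of the difference (hypothetical frame, not our data):\n",
"```python\n",
"demo = pd.DataFrame({\"val\": [10, 20, 30]}, index=[2, 0, 1])\n",
"demo.iloc[0]  # first row by position -> val 10\n",
"demo.loc[0]   # row labeled 0 -> val 20\n",
"```"
]
},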
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "id": "887546a2",
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "career = playercareerstats.PlayerCareerStats(player_id=player_id)\n",
167 | "career_df = career.get_data_frames()[0]\n",
168 | "career_df.head()"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "49841b9c",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "def season_id_to_season_end(val):\n",
179 | "    season_start, season_end = val.split(\"-\")  # SEASON_ID looks like \"1996-97\"\n",
180 | "    if season_start.startswith(\"1\") and season_end != \"00\":  # guard the 1999-00 rollover\n",
181 | "        season_end = f\"19{season_end}\"\n",
182 | "    else:\n",
183 | "        season_end = f\"20{season_end}\"\n",
184 | "    return season_end"
185 | ]
186 | },
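{
"cell_type": "markdown",
"id": "182973a4",
"metadata": {},
"source": [
"A quick sanity check of the helper on a few `SEASON_ID` shapes, including the century rollover:\n",
"```python\n",
"season_id_to_season_end(\"1996-97\")  # -> \"1997\"\n",
"season_id_to_season_end(\"1999-00\")  # -> \"2000\"\n",
"season_id_to_season_end(\"2018-19\")  # -> \"2019\"\n",
"```"
]
},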
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "1342c025",
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "career_df['YEAR_START'] = career_df['SEASON_ID'].apply(lambda x: x.split(\"-\")[0])\n",
195 | "career_df['YEAR_END'] = career_df['SEASON_ID'].apply(season_id_to_season_end)"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "id": "8c0a6c4b",
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "career_df.head()"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "e3e8a5a9",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "year_start = career_df['YEAR_START'].min()\n",
216 | "year_end = career_df['YEAR_END'].max()\n",
217 | "year_start, year_end"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "10884871",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "key_stats = ['GP',\n",
228 | " 'GS',\n",
229 | " 'MIN',\n",
230 | " 'FGM',\n",
231 | " 'FGA',\n",
232 | " 'FG_PCT',\n",
233 | " 'FG3M',\n",
234 | " 'FG3A',\n",
235 | " 'FG3_PCT',\n",
236 | " 'FTM',\n",
237 | " 'FTA',\n",
238 | " 'FT_PCT',\n",
239 | " 'OREB',\n",
240 | " 'DREB',\n",
241 | " 'REB',\n",
242 | " 'AST',\n",
243 | " 'STL',\n",
244 | " 'BLK',\n",
245 | " 'TOV',\n",
246 | " 'PF',\n",
247 | " 'PTS']\n",
248 | " \n",
249 | "key_stats_labels = [\n",
250 | " \"Games Played\",\n",
251 | " \"Games Started\",\n",
252 | " \"Minutes\",\n",
253 | " \"Field Goals Made\",\n",
254 | " \"Field Goals Attempted\",\n",
255 | " \"Field Goal Percentage\",\n",
256 | " \"3-point Field Goals Made\",\n",
257 | "    \"3-point Field Goals Attempted\",\n",
258 | "    \"3-point Field Goal Percentage\",\n",
259 | "    'Free Throws Made',\n",
260 | "    'Free Throws Attempted',\n",
261 | "    'Free Throw Percentage',\n",
262 | "    'Offensive Rebounds',\n",
263 | "    'Defensive Rebounds',\n",
264 | "    'Rebounds',\n",
265 | " 'Assists',\n",
266 | " 'Steals',\n",
267 | " 'Blocks',\n",
268 | " 'Turnovers',\n",
269 | " 'Personal Fouls',\n",
270 | " 'Points'\n",
271 | "]\n",
272 | " \n",
273 | "key_stats_mapping = dict(zip(key_stats, key_stats_labels))\n",
274 | "stats_without_perc = [x for x in key_stats if \"pct\" not in x.lower()]\n",
275 | "stats_with_perc = [x for x in key_stats if \"pct\" in x.lower()]"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "id": "cbd694ec",
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "career_df"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "id": "bb1e311c",
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "career_stats_columns = stats_without_perc + [\"SEASON_ID\"]\n",
296 | "career_stats = career_df.copy()[career_stats_columns]\n",
297 | "career_stats.head()"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "id": "69e70a1b",
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "# career_stats = career_stats.astype(int)\n",
308 | "career_stats.set_index(\"SEASON_ID\", inplace=True, drop=True)\n",
309 | "totals_df = career_stats.rename(columns=key_stats_mapping)\n",
310 | "totals = totals_df.sum()"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "id": "0c8cf184",
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "totals_df"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "id": "849b1797",
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "career_stats_perc = career_df.copy()[stats_with_perc]\n",
331 | "averages_df = career_stats_perc.rename(columns=key_stats_mapping)\n",
332 | "averages = averages_df.mean()"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "id": "550cfc04",
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "totals"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "id": "3111002f",
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "averages"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "id": "16bf9466",
359 | "metadata": {},
360 | "outputs": [],
361 | "source": []
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": null,
366 | "id": "794a21fb",
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "earnings = salary_df.copy()[salary_df['PLAYER'] == name][['SALARY', 'ADJ_SALARY', 'YEAR_START']]\n",
371 | "earnings.set_index(\"YEAR_START\", inplace=True, drop=True)\n",
372 | "earnings.tail(n=15)"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "id": "4ec4bdfd",
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "total_earnings = earnings.sum().apply(utils.float_to_dollars)\n",
383 | "total_earnings"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "id": "8ad188d4",
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "name"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "id": "d74c9382",
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "total_games = career_stats.GP.sum()\n",
404 | "total_games"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "id": "9f255d71",
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "earnings_per_game = earnings['SALARY'].sum() / total_games\n",
415 | "utils.float_to_dollars(earnings_per_game)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "id": "757bb983",
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "plot = earnings.plot()"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "id": "1d8583e6",
432 | "metadata": {},
433 | "outputs": [],
434 | "source": []
435 | },
436 | {
437 | "cell_type": "markdown",
438 | "id": "7fea57c7",
439 | "metadata": {},
440 | "source": [
441 | "### Export Player Data"
442 | ]
443 | },
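{
"cell_type": "markdown",
"id": "2938a4b5",
"metadata": {},
"source": [
"One assumption before exporting: writing `.xlsx` files via `pd.ExcelWriter` / `to_excel` relies on an Excel engine such as `openpyxl`, so you may need to install it first:\n",
"```python\n",
"# %pip install openpyxl\n",
"```"
]
},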
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "id": "218616fd",
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "PLAYER_DIR = SAMPLE_PLAYERS_DIR / f\"{name}\"\n",
452 | "PLAYER_DIR.mkdir(exist_ok=True, parents=True)"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "id": "3b2d2aea",
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "# Earnings plot\n",
463 | "figure = plot.figure\n",
464 | "figure.savefig(PLAYER_DIR / \"salary_over_time.png\")"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "id": "42d5c8db",
471 | "metadata": {},
472 | "outputs": [],
473 | "source": [
474 | "stats_output = PLAYER_DIR / \"stats.xlsx\""
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "id": "91cde06a",
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "totals_df"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "id": "784a63ba",
491 | "metadata": {},
492 | "outputs": [],
493 | "source": [
494 | "with pd.ExcelWriter(stats_output) as writer:\n",
495 | " totals_df.to_excel(writer, sheet_name='Career Stats')\n",
496 | " totals.to_excel(writer, sheet_name='Career Totals')\n",
497 | " averages.to_excel(writer, sheet_name='Career Averages')\n",
498 | " earnings.to_excel(writer, sheet_name='Yearly Earnings')\n",
499 | " total_earnings.to_excel(writer, sheet_name='Total Earnings')"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "id": "09a59565",
506 | "metadata": {},
507 | "outputs": [],
508 | "source": [
509 | "name"
510 | ]
511 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "id": "c358192a",
526 | "metadata": {},
527 | "outputs": [],
528 | "source": [
529 | "import requests\n",
530 | "\n",
531 | "# alternative endpoint (unused): url = \"https://stats.nba.com/stats/playerindex?College=&Country=&DraftPick=&DraftRound=&DraftYear=&Height=&Historical=1&LeagueID=00&Season=2021-22&SeasonType=Regular%20Season&TeamID=0\"\n",
532 | "url = \"https://stats.nba.com/stats/playercareerstats?LeagueID=&PerMode=Totals&PlayerID=2544\"\n",
533 | "\n",
534 | "r = requests.get(url, stream=True)\n",
535 | "r.json()"
536 | ]
537 | },
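{
"cell_type": "markdown",
"id": "3a49b5c6",
"metadata": {},
"source": [
"Heads up: `stats.nba.com` is known to stall or reject bare scripted requests. If the call above hangs, browser-like headers usually help; a hedged sketch (the exact header set may need tweaking):\n",
"```python\n",
"headers = {\n",
"    \"User-Agent\": \"Mozilla/5.0\",\n",
"    \"Referer\": \"https://www.nba.com/\",\n",
"}\n",
"r = requests.get(url, headers=headers, stream=True)\n",
"```"
]
},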
538 | {
539 | "cell_type": "code",
540 | "execution_count": null,
541 | "id": "2c89cf45",
542 | "metadata": {},
543 | "outputs": [],
544 | "source": []
545 | }
546 | ],
547 | "metadata": {
548 | "kernelspec": {
549 | "display_name": "Python 3 (ipykernel)",
550 | "language": "python",
551 | "name": "python3"
552 | },
553 | "language_info": {
554 | "codemirror_mode": {
555 | "name": "ipython",
556 | "version": 3
557 | },
558 | "file_extension": ".py",
559 | "mimetype": "text/x-python",
560 | "name": "python",
561 | "nbconvert_exporter": "python",
562 | "pygments_lexer": "ipython3",
563 | "version": "3.9.7"
564 | }
565 | },
566 | "nbformat": 4,
567 | "nbformat_minor": 5
568 | }
569 |
--------------------------------------------------------------------------------
/course/appendix/appendix-b-united-states-inflation-rate.csv:
--------------------------------------------------------------------------------
1 | Macrotrends Data Download
2 |
3 | U.S. Inflation Rate 1960-2021
4 |
5 |
6 | DISCLAIMER AND TERMS OF USE: HISTORICAL DATA IS PROVIDED "AS IS" AND SOLELY
7 | FOR INFORMATIONAL PURPOSES - NOT FOR TRADING PURPOSES OR ADVICE.
8 | NEITHER MACROTRENDS LLC NOR ANY OF OUR INFORMATION PROVIDERS WILL BE LIABLE
9 | FOR ANY DAMAGES RELATING TO YOUR USE OF THE DATA PROVIDED.
10 |
11 |
12 | ATTRIBUTION: Proper attribution requires clear indication of the data source as "www.macrotrends.net".
13 | A "dofollow" backlink to the originating page is also required if the data is displayed on a web page.
14 |
15 |
16 | , , ,
17 | date, Inflation Rate (%), Annual Change,
18 | 1960-12-31,1.458
19 | 1961-12-31,1.0707,-0.39
20 | 1962-12-31,1.1988,0.13
21 | 1963-12-31,1.2397,0.04
22 | 1964-12-31,1.2789,0.04
23 | 1965-12-31,1.5852,0.31
24 | 1966-12-31,3.0151,1.43
25 | 1967-12-31,2.7728,-0.24
26 | 1968-12-31,4.2718,1.5
27 | 1969-12-31,5.4624,1.19
28 | 1970-12-31,5.8383,0.38
29 | 1971-12-31,4.2928,-1.55
30 | 1972-12-31,3.2723,-1.02
31 | 1973-12-31,6.1778,2.91
32 | 1974-12-31,11.0548,4.88
33 | 1975-12-31,9.1431,-1.91
34 | 1976-12-31,5.7448,-3.4
35 | 1977-12-31,6.5017,0.76
36 | 1978-12-31,7.631,1.13
37 | 1979-12-31,11.2545,3.62
38 | 1980-12-31,13.5492,2.29
39 | 1981-12-31,10.3347,-3.21
40 | 1982-12-31,6.1314,-4.2
41 | 1983-12-31,3.2124,-2.92
42 | 1984-12-31,4.3005,1.09
43 | 1985-12-31,3.5456,-0.75
44 | 1986-12-31,1.898,-1.65
45 | 1987-12-31,3.6646,1.77
46 | 1988-12-31,4.0777,0.41
47 | 1989-12-31,4.827,0.75
48 | 1990-12-31,5.398,0.57
49 | 1991-12-31,4.235,-1.16
50 | 1992-12-31,3.0288,-1.21
51 | 1993-12-31,2.9517,-0.08
52 | 1994-12-31,2.6074,-0.34
53 | 1995-12-31,2.8054,0.2
54 | 1996-12-31,2.9312,0.13
55 | 1997-12-31,2.3377,-0.59
56 | 1998-12-31,1.5523,-0.79
57 | 1999-12-31,2.188,0.64
58 | 2000-12-31,3.3769,1.19
59 | 2001-12-31,2.8262,-0.55
60 | 2002-12-31,1.586,-1.24
61 | 2003-12-31,2.2701,0.68
62 | 2004-12-31,2.6772,0.41
63 | 2005-12-31,3.3927,0.72
64 | 2006-12-31,3.2259,-0.17
65 | 2007-12-31,2.8527,-0.37
66 | 2008-12-31,3.8391,0.99
67 | 2009-12-31,-0.3555,-4.19
68 | 2010-12-31,1.64,2
69 | 2011-12-31,3.1568,1.52
70 | 2012-12-31,2.0693,-1.09
71 | 2013-12-31,1.4648,-0.6
72 | 2014-12-31,1.6222,0.16
73 | 2015-12-31,0.1186,-1.5
74 | 2016-12-31,1.2616,1.14
75 | 2017-12-31,2.1301,0.87
76 | 2018-12-31,2.4426,0.31
77 | 2019-12-31,1.8122,-0.63
78 | 2020-12-31,1.2336,-0.58
79 |
--------------------------------------------------------------------------------
/course/datasets/inflation-rate.csv:
--------------------------------------------------------------------------------
1 | date,inflation_rate_percent,multiplier
2 | 1960-12-31,1.01458,8.751003401396263
3 | 1961-12-31,1.010707,8.65829899406679
4 | 1962-12-31,1.011988,8.555732868439929
5 | 1963-12-31,1.012397,8.450966239963108
6 | 1964-12-31,1.012789,8.344251606171778
7 | 1965-12-31,1.015852,8.214042602831697
8 | 1966-12-31,1.030151,7.97362969393001
9 | 1967-12-31,1.027728,7.758501951810214
10 | 1968-12-31,1.042718,7.440652172313332
11 | 1969-12-31,1.054624,7.055265357429124
12 | 1970-12-31,1.058383,6.666079630369272
13 | 1971-12-31,1.042928,6.391696867251884
14 | 1972-12-31,1.032723,6.189168699885534
15 | 1973-12-31,1.061778,5.829060971206343
16 | 1974-12-31,1.110548,5.2488149735142855
17 | 1975-12-31,1.091431,4.809112965926647
18 | 1976-12-31,1.057448,4.5478481834819755
19 | 1977-12-31,1.065017,4.270211821484517
20 | 1978-12-31,1.07631,3.967455307006825
21 | 1979-12-31,1.112545,3.5661077143008386
22 | 1980-12-31,1.135492,3.1405837419381557
23 | 1981-12-31,1.103347,2.8464152636823705
24 | 1982-12-31,1.061314,2.681972784380845
25 | 1983-12-31,1.032124,2.59849861487655
26 | 1984-12-31,1.043005,2.491357773813692
27 | 1985-12-31,1.035456,2.4060489038778017
28 | 1986-12-31,1.01898,2.3612327070970984
29 | 1987-12-31,1.036646,2.2777618464713094
30 | 1988-12-31,1.040777,2.1885205442388806
31 | 1989-12-31,1.04827,2.08774508880239
32 | 1990-12-31,1.05398,1.9808204034254824
33 | 1991-12-31,1.04235,1.900340963616331
34 | 1992-12-31,1.030288,1.8444754899759395
35 | 1993-12-31,1.029517,1.7915930382654586
36 | 1994-12-31,1.026074,1.7460661105002742
37 | 1995-12-31,1.028054,1.6984186730466238
38 | 1996-12-31,1.029312,1.6500523388891057
39 | 1997-12-31,1.023377,1.6123601946194852
40 | 1998-12-31,1.015523,1.5877141085130386
41 | 1999-12-31,1.02188,1.553718742428699
42 | 2000-12-31,1.033769,1.502965113510561
43 | 2001-12-31,1.028262,1.4616557973654194
44 | 2002-12-31,1.01586,1.4388358606160483
45 | 2003-12-31,1.022701,1.4068978720232481
46 | 2004-12-31,1.026772,1.37021448970487
47 | 2005-12-31,1.033927,1.3252526432764307
48 | 2006-12-31,1.032259,1.2838373346964576
49 | 2007-12-31,1.028527,1.2482291030730917
50 | 2008-12-31,1.038391,1.20208004795216
51 | 2009-12-31,0.996445,1.206368688640276
52 | 2010-12-31,1.0164,1.186903471704325
53 | 2011-12-31,1.031568,1.1505819022151957
54 | 2012-12-31,1.020693,1.127255602042138
55 | 2013-12-31,1.014648,1.1109819386054456
56 | 2014-12-31,1.016222,1.0932472812096623
57 | 2015-12-31,1.001186,1.091952225869781
58 | 2016-12-31,1.012616,1.0783477901492577
59 | 2017-12-31,1.021301,1.0558569806053826
60 | 2018-12-31,1.024426,1.030681552992
61 | 2019-12-31,1.018122,1.012336
62 | 2020-12-31,1.012336,1.0
63 |
--------------------------------------------------------------------------------
/course/samples/4-adj-salaries-cumlative-per-year.csv:
--------------------------------------------------------------------------------
1 | year,adj_salary,adj_salary_$
2 | 1990,582539682.0,"$582,539,682.00"
3 | 1991,700377392.0,"$700,377,392.00"
4 | 1992,788801985.0,"$788,801,985.00"
5 | 1993,893209929.0,"$893,209,929.00"
6 | 1994,1042891125.0,"$1,042,891,125.00"
7 | 1995,1224345459.0,"$1,224,345,459.00"
8 | 1996,1321922576.0,"$1,321,922,576.00"
9 | 1997,1525445588.0,"$1,525,445,588.00"
10 | 1998,1691690974.0,"$1,691,690,974.00"
11 | 1999,2038382935.0,"$2,038,382,935.00"
12 | 2000,2248609728.0,"$2,248,609,728.00"
13 | 2001,2257291832.0,"$2,257,291,832.00"
14 | 2002,2392844658.0,"$2,392,844,658.00"
15 | 2003,2347269535.0,"$2,347,269,535.00"
16 | 2004,2410317680.0,"$2,410,317,680.00"
17 | 2005,2514965375.0,"$2,514,965,375.00"
18 | 2006,2464069199.0,"$2,464,069,199.00"
19 | 2007,2553671099.0,"$2,553,671,099.00"
20 | 2008,2543013922.0,"$2,543,013,922.00"
21 | 2009,2525568731.0,"$2,525,568,731.00"
22 | 2010,2396025966.0,"$2,396,025,966.00"
23 | 2011,2302629043.0,"$2,302,629,043.00"
24 | 2012,2371940974.0,"$2,371,940,974.00"
25 | 2013,2338442773.0,"$2,338,442,773.00"
26 | 2014,2382200105.0,"$2,382,200,105.00"
27 | 2015,2543053337.0,"$2,543,053,337.00"
28 | 2016,3172547897.0,"$3,172,547,897.00"
29 | 2017,3532646079.0,"$3,532,646,079.00"
30 | 2018,3687776555.0,"$3,687,776,555.00"
31 | 2019,3557605726.0,"$3,557,605,726.00"
32 | 2020,3905172153.0,"$3,905,172,153.00"
33 |
--------------------------------------------------------------------------------
/course/samples/4-player-salaries-per-year.csv:
--------------------------------------------------------------------------------
1 | year,players
2 | 1990,352
3 | 1991,383
4 | 1992,401
5 | 1993,385
6 | 1994,418
7 | 1995,451
8 | 1996,415
9 | 1997,444
10 | 1998,426
11 | 1999,516
12 | 2000,455
13 | 2001,450
14 | 2002,451
15 | 2003,454
16 | 2004,470
17 | 2005,479
18 | 2006,495
19 | 2007,469
20 | 2008,460
21 | 2009,456
22 | 2010,459
23 | 2011,463
24 | 2012,494
25 | 2013,492
26 | 2014,513
27 | 2015,500
28 | 2016,545
29 | 2017,586
30 | 2018,576
31 | 2019,513
32 | 2020,578
33 |
--------------------------------------------------------------------------------
/course/samples/players/Caleb Swanigan/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Caleb Swanigan/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Caleb Swanigan/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Caleb Swanigan/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Charlie Ward/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Charlie Ward/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Charlie Ward/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Charlie Ward/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Chris Mihm/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Mihm/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Chris Mihm/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Mihm/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Chris Wilcox/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Wilcox/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Chris Wilcox/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Chris Wilcox/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Darvin Ham/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Darvin Ham/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Darvin Ham/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Darvin Ham/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Devin Harris/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Devin Harris/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Devin Harris/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Devin Harris/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Eric Gordon/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Eric Gordon/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Eric Gordon/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Eric Gordon/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Gary Trent Jr/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gary Trent Jr/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Gary Trent Jr/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gary Trent Jr/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Gerald Wilkins/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gerald Wilkins/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Gerald Wilkins/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Gerald Wilkins/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Jahidi White/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jahidi White/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Jahidi White/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jahidi White/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Jason Smith/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jason Smith/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Jason Smith/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jason Smith/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Jermaine O'Neal/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jermaine O'Neal/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Jermaine O'Neal/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Jermaine O'Neal/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Ken Norman/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ken Norman/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Ken Norman/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ken Norman/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Kevin Garnett/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Kevin Garnett/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Kevin Garnett/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Kevin Garnett/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Ledell Eackles/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ledell Eackles/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Ledell Eackles/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Ledell Eackles/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Luke Harangody/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Luke Harangody/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Luke Harangody/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Luke Harangody/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Michael Beasley/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Beasley/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Michael Beasley/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Beasley/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Michael Jordan/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Jordan/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Michael Jordan/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Michael Jordan/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Shaquille O'Neal/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Shaquille O'Neal/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Shaquille O'Neal/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Shaquille O'Neal/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Steve Scheffler/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Steve Scheffler/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Steve Scheffler/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Steve Scheffler/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Toby Bailey/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Toby Bailey/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Toby Bailey/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Toby Bailey/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Tony Farmer/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tony Farmer/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Tony Farmer/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tony Farmer/stats.xlsx
--------------------------------------------------------------------------------
/course/samples/players/Tristan Thompson/salary_over_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tristan Thompson/salary_over_time.png
--------------------------------------------------------------------------------
/course/samples/players/Tristan Thompson/stats.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingforentrepreneurs/Try-Pandas/ea7407e4e88e6bd05c3cac8e0cf1bae96777b56a/course/samples/players/Tristan Thompson/stats.xlsx
--------------------------------------------------------------------------------
/course/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Below are simple utility functions used across many of the notebooks.
3 | To use them:
4 |
5 | import utils
6 |
7 | utils.float_to_dollars(32.00)
8 |
9 | """
10 | from slugify import slugify
11 |
12 |
13 | def float_to_dollars(value:float) -> str:
14 | """
15 | Take in a float (32.00)
16 | """
17 | return f"${value:,.2f}"
18 |
19 |
20 | def dollar_str_to_float(value:str) -> float:
21 |     return float(value.replace("$", "").replace(",", "_"))  # float() accepts "_" digit separators
22 |
23 |
24 | def group_salary(value:float) -> str:
25 | if value > .95:
26 | return 'top'
27 |     elif value > .50:  # .50 < value <= .95 lands here, including the value == .95 boundary
28 | return 'mid'
29 | return 'low'
30 |
31 |
32 | def to_snake_case(val):
33 |     # slugify gives kebab-case (e.g. "Player Name" -> "player-name");
34 |     # swapping dashes for underscores yields snake_case ("player_name")
35 | kebab_case = slugify(val)
36 | return kebab_case.replace('-', '_')
37 |
--------------------------------------------------------------------------------
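A quick usage sketch for these helpers (a minimal, illustrative example; it assumes utils.py is importable from the working directory and that python-slugify is installed):

    import utils

    utils.float_to_dollars(23564932.0)           # '$23,564,932.00'
    utils.dollar_str_to_float('$23,564,932.00')  # 23564932.0 -- float() accepts '_' separators
    utils.group_salary(0.97)                     # 'top'
    utils.to_snake_case('Player Salary')         # 'player_salary'
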
/nbs_ref/1 - DataFrame.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-404671cc-c8d9-4153-b6c0-fef41a4c8ad4",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 5,
11 | "execution_start": 1634660464202,
12 | "source_hash": "9d6c0093",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import datetime\n",
18 | "number = 10\n",
19 | "data = [{\"number\": x, \"time\": datetime.datetime.now(), \"added_by\": \"Justin\"} for x in range(0, number)]\n",
20 | "data"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "cell_id": "00001-b9f74741-0730-4321-abaf-4f5a0f3a33ac",
28 | "deepnote_cell_type": "code",
29 | "deepnote_to_be_reexecuted": false,
30 | "execution_millis": 0,
31 | "execution_start": 1634660404069,
32 | "source_hash": "9b82ee11",
33 | "tags": []
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import pandas as pd"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "cell_id": "00002-a3425b6e-ca8e-42b9-8794-babe70bdf7cd",
45 | "deepnote_cell_type": "code",
46 | "deepnote_to_be_reexecuted": false,
47 | "execution_millis": 1,
48 | "execution_start": 1634660494571,
49 | "source_hash": "68b98649",
50 | "tags": []
51 | },
52 | "outputs": [],
53 | "source": [
54 | "df = pd.DataFrame(data)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "cell_id": "00003-d30403ec-6d18-4950-9406-8a09845ea4a1",
62 | "deepnote_cell_type": "code",
63 | "deepnote_to_be_reexecuted": false,
64 | "execution_millis": 24,
65 | "execution_start": 1634660495073,
66 | "source_hash": "c085b6ba",
67 | "tags": []
68 | },
69 | "outputs": [],
70 | "source": [
71 | "df.head()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "cell_id": "00004-bae5cad7-e47a-4649-ae30-da286f06f94d",
79 | "deepnote_cell_type": "code",
80 | "deepnote_to_be_reexecuted": false,
81 | "execution_millis": 20,
82 | "execution_start": 1634660531873,
83 | "source_hash": "c6672ebc",
84 | "tags": []
85 | },
86 | "outputs": [],
87 | "source": [
88 | "df.to_csv(\"temp.csv\")"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "cell_id": "00005-39181e7e-fd4e-4d05-a413-b3dde31d113f",
96 | "deepnote_cell_type": "code",
97 | "tags": []
98 | },
99 | "outputs": [],
100 | "source": []
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {
105 | "created_in_deepnote_cell": true,
106 | "deepnote_cell_type": "markdown",
107 | "tags": []
108 | },
109 | "source": [
110 | "\n",
111 | "
\n",
112 | "Created in Deepnote"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "deepnote": {
118 | "is_reactive": false
119 | },
120 | "deepnote_execution_queue": [],
121 | "deepnote_notebook_id": "6d02c57e-af66-4ca3-a9e8-3e4d1a4b40ec"
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 2
125 | }
126 |
--------------------------------------------------------------------------------
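The notebook above builds its DataFrame row-wise from a list of dicts; the column-wise form is equivalent. A minimal sketch on toy data:

    import pandas as pd

    # row-wise, as in the notebook: one dict per row
    rows = [{"number": x, "added_by": "Justin"} for x in range(3)]
    df_rows = pd.DataFrame(rows)

    # column-wise: one list per column, producing the same frame
    df_cols = pd.DataFrame({"number": [0, 1, 2], "added_by": ["Justin"] * 3})
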
/nbs_ref/2 - Import & Export.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-73e17520-ba02-468e-b566-ca0b1d88df64",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 4,
11 | "execution_start": 1634662754163,
12 | "source_hash": "fb25d99a",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import random\n",
18 | "import pandas as pd"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "cell_id": "00001-d427eb7c-182d-487d-92ec-57a6bf0ace8d",
26 | "deepnote_cell_type": "code",
27 | "deepnote_to_be_reexecuted": false,
28 | "execution_millis": 2,
29 | "execution_start": 1634662872515,
30 | "source_hash": "10381e97",
31 | "tags": []
32 | },
33 | "outputs": [],
34 | "source": [
35 | "items = []\n",
36 | "number = random.randint(0, 10_000)\n",
37 | "\n",
38 | "for x in range(0, number):\n",
39 | " dollars = random.randint(200_000, 50_000_000)\n",
40 | " data = {\n",
41 | " \"Player Name\": f\"Player-{x}\",\n",
42 | " \"Player Salary\": f\"${dollars:,.2f}\"\n",
43 | " }\n",
44 | " items.append(data)"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "cell_id": "00002-7c63a5ab-bbe9-4098-9d10-68ccb0a0c6c4",
52 | "deepnote_cell_type": "code",
53 | "deepnote_to_be_reexecuted": false,
54 | "execution_millis": 9,
55 | "execution_start": 1634663034475,
56 | "source_hash": "77d806e5",
57 | "tags": []
58 | },
59 | "outputs": [],
60 | "source": [
61 | "items[df.shape[0]-1]\n",
62 | "items[-1]"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {
69 | "cell_id": "00002-b0c3ddbd-1856-4c13-b20a-2e9213a9de33",
70 | "deepnote_cell_type": "code",
71 | "deepnote_to_be_reexecuted": false,
72 | "execution_millis": 10,
73 | "execution_start": 1634663045778,
74 | "source_hash": "ee80ac5f",
75 | "tags": []
76 | },
77 | "outputs": [],
78 | "source": [
79 | "df = pd.DataFrame(items)\n",
80 | "df.tail(n=5)"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {
87 | "cell_id": "00003-8693a73e-9587-4404-807f-dac4a386fd65",
88 | "deepnote_cell_type": "code",
89 | "deepnote_to_be_reexecuted": false,
90 | "execution_millis": 2,
91 | "execution_start": 1634662954296,
92 | "source_hash": "14f60b8f",
93 | "tags": []
94 | },
95 | "outputs": [],
96 | "source": [
97 | "df.shape"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {
104 | "cell_id": "00003-37cc71ed-0226-47af-a776-9f18083dc3a4",
105 | "deepnote_cell_type": "code",
106 | "deepnote_to_be_reexecuted": false,
107 | "execution_millis": 11,
108 | "execution_start": 1634663125029,
109 | "source_hash": "9a72dbc",
110 | "tags": []
111 | },
112 | "outputs": [],
113 | "source": [
114 | "df.to_csv(\"example.csv\", index=False)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "cell_id": "00004-01213fba-c9c9-4809-bd53-d712180c865f",
122 | "deepnote_cell_type": "code",
123 | "deepnote_to_be_reexecuted": false,
124 | "execution_millis": 16,
125 | "execution_start": 1634663135503,
126 | "source_hash": "e261385d",
127 | "tags": []
128 | },
129 | "outputs": [],
130 | "source": [
131 | "df2 = pd.read_csv(\"example.csv\")\n",
132 | "df2.to_csv(\"example.csv\", index=False)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "cell_id": "00005-5bb5b873-84a6-4c13-8283-80324940e936",
140 | "deepnote_cell_type": "code",
141 | "deepnote_to_be_reexecuted": false,
142 | "execution_millis": 2,
143 | "execution_start": 1634663135575,
144 | "source_hash": "4e0cbe0d",
145 | "tags": []
146 | },
147 | "outputs": [],
148 | "source": [
149 | "df2.head()"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "cell_id": "00008-f54b4824-92df-447c-8b68-9fe9cb57db7e",
157 | "deepnote_cell_type": "code",
158 | "tags": []
159 | },
160 | "outputs": [],
161 | "source": []
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {
166 | "created_in_deepnote_cell": true,
167 | "deepnote_cell_type": "markdown",
168 | "tags": []
169 | },
170 | "source": [
171 | "\n",
172 | "
\n",
173 | "Created in Deepnote"
174 | ]
175 | }
176 | ],
177 | "metadata": {
178 | "deepnote": {
179 | "is_reactive": false
180 | },
181 | "deepnote_execution_queue": [],
182 | "deepnote_notebook_id": "7d195f65-2af9-44d5-b3b3-a1ec93a8da0c"
183 | },
184 | "nbformat": 4,
185 | "nbformat_minor": 2
186 | }
187 |
--------------------------------------------------------------------------------
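Why the notebook passes index=False on export: without it, to_csv writes the row index as an extra column, which reappears as "Unnamed: 0" on the next read_csv. A minimal sketch:

    import pandas as pd

    df = pd.DataFrame({"Player Name": ["Player-0"], "Player Salary": ["$1,000,000.00"]})
    df.to_csv("example.csv", index=False)  # keep the row index out of the file
    df2 = pd.read_csv("example.csv")       # round-trips without an "Unnamed: 0" column
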
/nbs_ref/3 - Rename Columns.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-80b53f9e-8a99-42ce-8537-563e35eae81e",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 3,
11 | "execution_start": 1634664323897,
12 | "source_hash": "3296bc83",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import pathlib\n",
18 | "import pandas as pd "
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "cell_id": "00001-619fdf6d-e70c-463c-9ef0-cade76400689",
26 | "deepnote_cell_type": "code",
27 | "deepnote_to_be_reexecuted": false,
28 | "execution_millis": 8,
29 | "execution_start": 1634664325242,
30 | "source_hash": "cc9f5d6c",
31 | "tags": []
32 | },
33 | "outputs": [],
34 | "source": [
35 | "BASE_DIR = pathlib.Path().resolve().parent\n",
36 | "COURSE_DIR = BASE_DIR / \"course\"\n",
37 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "cell_id": "00002-fb8339de-56d3-4821-9ccb-c43a97be82f2",
45 | "deepnote_cell_type": "code",
46 | "deepnote_to_be_reexecuted": false,
47 | "execution_millis": 10,
48 | "execution_start": 1634664551676,
49 | "source_hash": "b2e8acb5",
50 | "tags": []
51 | },
52 | "outputs": [],
53 | "source": [
54 | "df = pd.read_csv(SAMPLES_DIR / \"1.csv\")\n",
55 | "df.head()"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {
62 | "cell_id": "00003-eec93174-c12b-41f5-a12e-113ea58e2387",
63 | "deepnote_cell_type": "code",
64 | "deepnote_to_be_reexecuted": false,
65 | "execution_millis": 3,
66 | "execution_start": 1634664554225,
67 | "source_hash": "c5e12177",
68 | "tags": []
69 | },
70 | "outputs": [],
71 | "source": [
72 | "columns = df.columns\n",
73 | "auto_changed = [x.lower().replace(\" \", \"_\") for x in list(columns)]\n",
74 | "mapped_columns = dict(zip(columns, auto_changed))\n",
75 | "mapped_columns"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "cell_id": "00004-b288a341-abc8-44bf-8528-77c75d9a4717",
83 | "deepnote_cell_type": "code",
84 | "deepnote_to_be_reexecuted": false,
85 | "execution_millis": 6,
86 | "execution_start": 1634664554594,
87 | "source_hash": "53315aff",
88 | "tags": []
89 | },
90 | "outputs": [],
91 | "source": [
92 | "custom_mapped_columns = {'Player Name': 'name', 'Player Salary': 'salary'}"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {
99 | "cell_id": "00003-1bc88210-86e1-408f-8ca0-4784e53b59f2",
100 | "deepnote_cell_type": "code",
101 | "deepnote_to_be_reexecuted": false,
102 | "execution_millis": 5,
103 | "execution_start": 1634664558540,
104 | "source_hash": "805d5ae3",
105 | "tags": []
106 | },
107 | "outputs": [],
108 | "source": [
109 | "new_df = df.rename(columns=custom_mapped_columns)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "cell_id": "00004-17e8390b-3ee1-4304-aff1-d86311f55a43",
117 | "deepnote_cell_type": "code",
118 | "deepnote_to_be_reexecuted": false,
119 | "execution_millis": 18,
120 | "execution_start": 1634664559234,
121 | "source_hash": "e0dec228",
122 | "tags": []
123 | },
124 | "outputs": [],
125 | "source": [
126 | "new_df.head()"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "cell_id": "00007-0ca575d4-3aab-46aa-aed3-dfde855c2558",
134 | "deepnote_cell_type": "code",
135 | "deepnote_to_be_reexecuted": false,
136 | "execution_millis": 5,
137 | "execution_start": 1634664565357,
138 | "source_hash": "c085b6ba",
139 | "tags": []
140 | },
141 | "outputs": [],
142 | "source": [
143 | "df.head()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "cell_id": "00008-78290255-8993-4195-93b1-e7c651c18689",
151 | "deepnote_cell_type": "code",
152 | "deepnote_to_be_reexecuted": false,
153 | "execution_millis": 4,
154 | "execution_start": 1634664605275,
155 | "source_hash": "a7cdfe1a",
156 | "tags": []
157 | },
158 | "outputs": [],
159 | "source": [
160 | "df = new_df.copy()"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {
167 | "cell_id": "00009-f639a375-19ca-406a-8f95-83b6b275ef8c",
168 | "deepnote_cell_type": "code",
169 | "deepnote_to_be_reexecuted": false,
170 | "execution_millis": 4,
171 | "execution_start": 1634664608527,
172 | "source_hash": "c085b6ba",
173 | "tags": []
174 | },
175 | "outputs": [],
176 | "source": [
177 | "df.head()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {
184 | "cell_id": "00010-31d4f407-4738-4bbe-8ce6-d5d86f6a79ad",
185 | "deepnote_cell_type": "code",
186 | "tags": []
187 | },
188 | "outputs": [],
189 | "source": []
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {
194 | "created_in_deepnote_cell": true,
195 | "deepnote_cell_type": "markdown",
196 | "tags": []
197 | },
198 | "source": [
199 | "\n",
200 | "
\n",
201 | "Created in Deepnote"
202 | ]
203 | }
204 | ],
205 | "metadata": {
206 | "deepnote": {
207 | "is_reactive": false
208 | },
209 | "deepnote_execution_queue": [],
210 | "deepnote_notebook_id": "6728ff69-a810-43ce-9c54-6d8830c0bc28"
211 | },
212 | "nbformat": 4,
213 | "nbformat_minor": 2
214 | }
215 |
--------------------------------------------------------------------------------
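The zip of old and new column names above can also be written as a single dict comprehension; a minimal sketch:

    import pandas as pd

    df = pd.DataFrame({"Player Name": ["A"], "Player Salary": ["$1.00"]})
    mapped = {col: col.lower().replace(" ", "_") for col in df.columns}
    df = df.rename(columns=mapped)  # columns become 'player_name', 'player_salary'
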
/nbs_ref/4 - Clean Rows.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-349f686e-3dfe-4eec-a63b-ef2ae23c894f",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 19,
11 | "execution_start": 1634664703106,
12 | "source_hash": "3296bc83",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import pathlib\n",
18 | "import pandas as pd "
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "cell_id": "00001-05cb6c26-afaa-4d70-a56a-1192c0ce5297",
26 | "deepnote_cell_type": "code",
27 | "deepnote_to_be_reexecuted": false,
28 | "execution_millis": 377930,
29 | "execution_start": 1634664703216,
30 | "source_hash": "cc9f5d6c",
31 | "tags": []
32 | },
33 | "outputs": [],
34 | "source": [
35 | "BASE_DIR = pathlib.Path().resolve().parent\n",
36 | "COURSE_DIR = BASE_DIR / \"course\"\n",
37 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "cell_id": "00002-97e9fae1-78cb-4530-abdc-108b358f86d0",
45 | "deepnote_cell_type": "code",
46 | "deepnote_to_be_reexecuted": false,
47 | "execution_millis": 54,
48 | "execution_start": 1634664703263,
49 | "source_hash": "ed8fc3d1",
50 | "tags": []
51 | },
52 | "outputs": [],
53 | "source": [
54 | "df = pd.read_csv(SAMPLES_DIR / \"1.csv\")\n"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "cell_id": "00004-286f55fa-610e-47e2-a42f-6c2230a095e9",
62 | "deepnote_cell_type": "code",
63 | "deepnote_to_be_reexecuted": false,
64 | "execution_millis": 148804,
65 | "execution_start": 1634664703398,
66 | "source_hash": "53315aff",
67 | "tags": []
68 | },
69 | "outputs": [],
70 | "source": [
71 | "custom_mapped_columns = {'Player Name': 'name', 'Player Salary': 'salary'}"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "cell_id": "00005-93b6dcf8-51e7-471b-9f27-ca88fa93f463",
79 | "deepnote_cell_type": "code",
80 | "deepnote_to_be_reexecuted": false,
81 | "execution_millis": 144901,
82 | "execution_start": 1634664703441,
83 | "source_hash": "805d5ae3",
84 | "tags": []
85 | },
86 | "outputs": [],
87 | "source": [
88 | "new_df = df.rename(columns=custom_mapped_columns)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "cell_id": "00008-c49d10d4-b635-41e0-9aa9-3432117382e0",
96 | "deepnote_cell_type": "code",
97 | "deepnote_to_be_reexecuted": false,
98 | "execution_millis": 98364,
99 | "execution_start": 1634664703639,
100 | "source_hash": "a7cdfe1a",
101 | "tags": []
102 | },
103 | "outputs": [],
104 | "source": [
105 | "df = new_df.copy()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {
112 | "cell_id": "00009-b67e1732-661c-4924-abe5-0fb426055b53",
113 | "deepnote_cell_type": "code",
114 | "deepnote_to_be_reexecuted": false,
115 | "execution_millis": 72,
116 | "execution_start": 1634664810180,
117 | "source_hash": "838c0c28",
118 | "tags": []
119 | },
120 | "outputs": [],
121 | "source": [
122 | "df.tail()"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {
129 | "cell_id": "00010-ae0b342b-d7f3-46b2-8668-f256c3b685c1",
130 | "deepnote_cell_type": "code",
131 | "deepnote_to_be_reexecuted": false,
132 | "execution_millis": 0,
133 | "execution_start": 1634664863379,
134 | "source_hash": "89b06b8e",
135 | "tags": []
136 | },
137 | "outputs": [],
138 | "source": [
139 | "my_salary_list = list(df['salary'].values)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {
146 | "cell_id": "00008-31c5accf-8a7d-4bdc-b97a-c8a210f16c2d",
147 | "deepnote_cell_type": "code",
148 | "deepnote_to_be_reexecuted": false,
149 | "execution_millis": 11,
150 | "execution_start": 1634664863791,
151 | "source_hash": "263ee8a5",
152 | "tags": []
153 | },
154 | "outputs": [],
155 | "source": [
156 | "# float(my_salary_list[0])"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "cell_id": "00009-905b7c10-33fb-4e9d-91a2-b7599107dcaa",
164 | "deepnote_cell_type": "code",
165 | "deepnote_to_be_reexecuted": false,
166 | "execution_millis": 36,
167 | "execution_start": 1634664931801,
168 | "source_hash": "eccf32d0",
169 | "tags": []
170 | },
171 | "outputs": [],
172 | "source": [
173 | "current_str = '$23,564,932.00'.replace(\"$\", \"\").replace(\",\", \"_\")\n",
174 | "current_dollars = float(current_str)\n",
175 | "\n",
176 | "current_dollars * 32"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "cell_id": "00010-0abc1f6e-6f69-4400-9d45-8f403624e6ea",
184 | "deepnote_cell_type": "code",
185 | "deepnote_to_be_reexecuted": false,
186 | "execution_millis": 152,
187 | "execution_start": 1634665011941,
188 | "source_hash": "6dc07231",
189 | "tags": []
190 | },
191 | "outputs": [],
192 | "source": [
193 | "my_salary_list_cleaned = [float(x.replace(\"$\", \"\").replace(\",\", \"_\")) for x in my_salary_list]\n",
194 | "my_salary_list_cleaned[0]"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "cell_id": "00011-788b27c2-271b-4c6c-99d3-41d2064ac321",
202 | "deepnote_cell_type": "code",
203 | "deepnote_to_be_reexecuted": false,
204 | "execution_millis": 19,
205 | "execution_start": 1634665032102,
206 | "source_hash": "b2382e4e",
207 | "tags": []
208 | },
209 | "outputs": [],
210 | "source": [
211 | "df['salary_cleaned'] = my_salary_list_cleaned"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {
218 | "cell_id": "00012-ede91513-304c-4cc7-ba13-832758f63350",
219 | "deepnote_cell_type": "code",
220 | "deepnote_to_be_reexecuted": false,
221 | "execution_millis": 15,
222 | "execution_start": 1634665037154,
223 | "source_hash": "c085b6ba",
224 | "tags": []
225 | },
226 | "outputs": [],
227 | "source": [
228 | "df.head()"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {
235 | "cell_id": "00013-c82a8240-dc20-4bab-9ea6-7483496b83ca",
236 | "deepnote_cell_type": "code",
237 | "deepnote_to_be_reexecuted": false,
238 | "execution_millis": 93,
239 | "execution_start": 1634665204365,
240 | "source_hash": "3272c222",
241 | "tags": []
242 | },
243 | "outputs": [],
244 | "source": [
245 | "def clean_salary_data(val):\n",
246 | " new_val = float(val.replace(\"$\", \"\").replace(\",\", \"_\"))\n",
247 | " return new_val\n",
248 | "\n",
249 | "df['salary_cleaned_2'] = df['salary'].apply(clean_salary_data)"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "cell_id": "00014-3a1907ec-5e2b-4f75-9a0a-ae7a7ed0b8aa",
257 | "deepnote_cell_type": "code",
258 | "deepnote_to_be_reexecuted": false,
259 | "execution_millis": 52,
260 | "execution_start": 1634665205940,
261 | "source_hash": "c085b6ba",
262 | "tags": []
263 | },
264 | "outputs": [],
265 | "source": [
266 | "df.head()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {
273 | "cell_id": "00015-e4db563d-51a1-4641-9b90-ec2c197a076e",
274 | "deepnote_cell_type": "code",
275 | "deepnote_to_be_reexecuted": false,
276 | "execution_millis": 8,
277 | "execution_start": 1634665297961,
278 | "source_hash": "d4fbbd00",
279 | "tags": []
280 | },
281 | "outputs": [],
282 | "source": [
283 | "rows_length = df.shape[0] # (row_length, col_length)\n",
284 | "new_rows = int(rows_length / 2.0)\n",
285 | "new_rows"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {
292 | "cell_id": "00016-4c33d2d5-39a1-4fe4-ab92-40217746486d",
293 | "deepnote_cell_type": "code",
294 | "deepnote_to_be_reexecuted": false,
295 | "execution_millis": 2,
296 | "execution_start": 1634665349160,
297 | "source_hash": "962d0c10",
298 | "tags": []
299 | },
300 | "outputs": [],
301 | "source": [
302 | "new_col_data = [True for x in range(0, new_rows)]\n",
303 | "df['new_data'] = new_col_data + new_col_data"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {
310 | "cell_id": "00017-ab5d82a6-e83a-4e03-8859-4aa28bb192c1",
311 | "deepnote_cell_type": "code",
312 | "deepnote_to_be_reexecuted": false,
313 | "execution_millis": 22,
314 | "execution_start": 1634665353581,
315 | "source_hash": "c085b6ba",
316 | "tags": []
317 | },
318 | "outputs": [],
319 | "source": [
320 | "df.head()"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {
327 | "cell_id": "00018-2025550d-f4c6-4f94-b1e9-0a68076f0d83",
328 | "deepnote_cell_type": "code",
329 | "deepnote_to_be_reexecuted": false,
330 | "execution_millis": 1,
331 | "execution_start": 1634665562850,
332 | "source_hash": "ba524046",
333 | "tags": []
334 | },
335 | "outputs": [],
336 | "source": [
337 | "def apply_on_df(row):\n",
338 | " row['random_data'] = 12 \n",
339 | " return row\n",
340 | "\n",
341 | "# df.apply(apply_on_df, axis=1)\n"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "metadata": {
348 | "cell_id": "00019-23a40cd4-d1bb-478e-acb8-951f3467964a",
349 | "deepnote_cell_type": "code",
350 | "deepnote_to_be_reexecuted": false,
351 | "execution_millis": 41,
352 | "execution_start": 1634665517660,
353 | "source_hash": "c085b6ba",
354 | "tags": []
355 | },
356 | "outputs": [],
357 | "source": [
358 | "df.head()"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "cell_id": "00020-83834316-225d-407e-9663-bfb6fc0a4158",
366 | "deepnote_cell_type": "code",
367 | "deepnote_to_be_reexecuted": false,
368 | "execution_millis": 22,
369 | "execution_start": 1634665590996,
370 | "source_hash": "a0af127d",
371 | "tags": []
372 | },
373 | "outputs": [],
374 | "source": [
375 | "df['half_salary'] = df['salary_cleaned'] * 0.5"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {
382 | "cell_id": "00021-a77b18f8-30cb-4db2-afcb-f70d0710d12a",
383 | "deepnote_cell_type": "code",
384 | "deepnote_to_be_reexecuted": false,
385 | "execution_millis": 19,
386 | "execution_start": 1634665638969,
387 | "source_hash": "22926cbe",
388 | "tags": []
389 | },
390 | "outputs": [],
391 | "source": [
392 | "df['half_salary_again'] = df['salary_cleaned'].apply(lambda x: x * 0.5)"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {
399 | "cell_id": "00021-e898b7bb-71c3-4cf9-b30e-2e2c0bf7cb5f",
400 | "deepnote_cell_type": "code",
401 | "deepnote_to_be_reexecuted": false,
402 | "execution_millis": 74,
403 | "execution_start": 1634665642531,
404 | "source_hash": "c085b6ba",
405 | "tags": []
406 | },
407 | "outputs": [],
408 | "source": [
409 | "df.head()"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {
416 | "cell_id": "00022-09f6345e-c840-43f2-897e-7b8c8c30b2a3",
417 | "deepnote_cell_type": "code",
418 | "tags": []
419 | },
420 | "outputs": [],
421 | "source": []
422 | },
423 | {
424 | "cell_type": "markdown",
425 | "metadata": {
426 | "created_in_deepnote_cell": true,
427 | "deepnote_cell_type": "markdown",
428 | "tags": []
429 | },
430 | "source": [
431 | "\n",
432 | "
\n",
433 | "Created in Deepnote"
434 | ]
435 | }
436 | ],
437 | "metadata": {
438 | "deepnote": {
439 | "is_reactive": false
440 | },
441 | "deepnote_execution_queue": [],
442 | "deepnote_notebook_id": "2f18222c-6554-4fca-aaf0-222952069583"
443 | },
444 | "nbformat": 4,
445 | "nbformat_minor": 2
446 | }
447 |
--------------------------------------------------------------------------------
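The per-value .apply cleanup in the notebook above has a vectorized equivalent via pandas string methods; a minimal sketch on toy data:

    import pandas as pd

    s = pd.Series(["$23,564,932.00", "$1,200,000.00"])
    cleaned = s.str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
    # 0    23564932.0
    # 1     1200000.0
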
/nbs_ref/5 - Basic Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-d1d3b693-8fc2-4852-b977-bf59db96cdf7",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 1,
11 | "execution_start": 1634670769787,
12 | "source_hash": "120b09e9",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import pathlib\n",
18 | "import pandas as pd\n",
19 | "\n",
20 | "BASE_DIR = pathlib.Path().resolve().parent\n",
21 | "COURSE_DIR = BASE_DIR / \"course\"\n",
22 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {
29 | "cell_id": "00001-4da4045c-083d-4950-a801-f30394e6b103",
30 | "deepnote_cell_type": "code",
31 | "deepnote_to_be_reexecuted": false,
32 | "execution_millis": 64,
33 | "execution_start": 1634670843243,
34 | "source_hash": "811c99cf",
35 | "tags": []
36 | },
37 | "outputs": [],
38 | "source": [
39 | "init_df = pd.read_csv(SAMPLES_DIR / '2.csv')\n",
40 | "init_df.head()"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "cell_id": "00002-c7f63dad-4bd5-49fb-befc-dd9d01183da4",
48 | "deepnote_cell_type": "code",
49 | "deepnote_to_be_reexecuted": false,
50 | "execution_millis": 34,
51 | "execution_start": 1634670926698,
52 | "source_hash": "2c5513cb",
53 | "tags": []
54 | },
55 | "outputs": [],
56 | "source": [
57 | "columns = ['name', 'salary_as_float']\n",
58 | "df = init_df.copy()[columns]\n",
59 | "df.head()"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {
66 | "cell_id": "00003-7151714e-0b8b-46ac-a35e-f057309c0bd2",
67 | "deepnote_cell_type": "code",
68 | "deepnote_to_be_reexecuted": false,
69 | "execution_millis": 2,
70 | "execution_start": 1634670959996,
71 | "source_hash": "889a1265",
72 | "tags": []
73 | },
74 | "outputs": [],
75 | "source": [
76 | "n_rows = df.shape[0] # (rows, cols)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {
83 | "cell_id": "00004-4a2dced2-1c95-42cc-9c4d-be27b5ab38f4",
84 | "deepnote_cell_type": "code",
85 | "deepnote_to_be_reexecuted": false,
86 | "execution_millis": 5,
87 | "execution_start": 1634671018731,
88 | "source_hash": "cd46aaa8",
89 | "tags": []
90 | },
91 | "outputs": [],
92 | "source": [
93 | "salaries = list(df['salary_as_float'].values)\n",
94 | "sum_salaries = sum(salaries)\n",
95 | "avg_salaries = sum_salaries / n_rows\n",
96 | "print(avg_salaries)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {
103 | "cell_id": "00005-ac5763c8-8738-4ebb-823b-252a21900f4d",
104 | "deepnote_cell_type": "code",
105 | "deepnote_to_be_reexecuted": false,
106 | "execution_millis": 8,
107 | "execution_start": 1634671056500,
108 | "source_hash": "1e1d38cd",
109 | "tags": []
110 | },
111 | "outputs": [],
112 | "source": [
113 | "avg = df['salary_as_float'].mean()\n",
114 | "avg"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "cell_id": "00006-4f6b6acb-26b3-499f-a66a-12c9123cc087",
122 | "deepnote_cell_type": "code",
123 | "deepnote_to_be_reexecuted": false,
124 | "execution_millis": 7,
125 | "execution_start": 1634671094965,
126 | "source_hash": "3e7b4aaa",
127 | "tags": []
128 | },
129 | "outputs": [],
130 | "source": [
131 | "df_sum = df['salary_as_float'].sum() # / n_rows\n",
132 | "df_sum"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "cell_id": "00007-a33f5ea6-757b-44f9-abf0-25c4d3d8e645",
140 | "deepnote_cell_type": "code",
141 | "deepnote_to_be_reexecuted": false,
142 | "execution_millis": 9,
143 | "execution_start": 1634671153935,
144 | "source_hash": "8a870ad4",
145 | "tags": []
146 | },
147 | "outputs": [],
148 | "source": [
149 | "df_mode = df['salary_as_float'].mode()\n",
150 | "df_mode"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "cell_id": "00008-b944d33a-73af-458a-9183-e1ec7eca7685",
158 | "deepnote_cell_type": "code",
159 | "deepnote_to_be_reexecuted": false,
160 | "execution_millis": 3,
161 | "execution_start": 1634671186003,
162 | "source_hash": "e2c1864b",
163 | "tags": []
164 | },
165 | "outputs": [],
166 | "source": [
167 | "top_salary = df['salary_as_float'].max()\n",
168 | "bottom_salary = df['salary_as_float'].min()\n",
169 | "print(top_salary, bottom_salary)"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {
175 | "created_in_deepnote_cell": true,
176 | "deepnote_cell_type": "markdown",
177 | "tags": []
178 | },
179 | "source": [
180 | "\n",
181 | "
\n",
182 | "Created in Deepnote"
183 | ]
184 | }
185 | ],
186 | "metadata": {
187 | "deepnote": {
188 | "is_reactive": false
189 | },
190 | "deepnote_execution_queue": [],
191 | "deepnote_notebook_id": "cf920c62-4d9f-44fd-8086-4b610a57be2c"
192 | },
193 | "nbformat": 4,
194 | "nbformat_minor": 2
195 | }
196 |
--------------------------------------------------------------------------------
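The individual aggregates above (mean, sum, mode, max, min) can be cross-checked with a single describe() call; a minimal sketch on toy data:

    import pandas as pd

    s = pd.Series([1.0, 2.0, 2.0, 10.0])
    print(s.mean(), s.sum(), s.max(), s.min())  # 3.75 15.0 10.0 1.0
    print(s.describe())                         # count/mean/std/min/quartiles/max at once
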
/nbs_ref/6 - Grouping & Plots.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-6ff3fb28-b769-44e1-acda-797876f4de99",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 37,
11 | "execution_start": 1634673068073,
12 | "source_hash": "1477e9f2",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import pathlib\n",
18 | "import pandas as pd\n",
19 | "\n",
20 | "BASE_DIR = pathlib.Path().resolve().parent\n",
21 | "COURSE_DIR = BASE_DIR / \"course\"\n",
22 | "SAMPLES_DIR = COURSE_DIR / 'samples'\n",
23 | "init_df = pd.read_csv(SAMPLES_DIR / '2.csv')\n",
24 | "df = init_df.copy()[['name', 'salary_as_float']]"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {
31 | "cell_id": "00001-5487d78d-d59f-4be1-a7ee-3f88ddabcede",
32 | "deepnote_cell_type": "code",
33 | "deepnote_to_be_reexecuted": false,
34 | "execution_millis": 11,
35 | "execution_start": 1634673069101,
36 | "source_hash": "c085b6ba",
37 | "tags": []
38 | },
39 | "outputs": [],
40 | "source": [
41 | "df.head()"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "cell_id": "00002-338a599e-31af-4acf-81c3-d1bbcdfd4e0a",
49 | "deepnote_cell_type": "code",
50 | "deepnote_to_be_reexecuted": false,
51 | "execution_millis": 10,
52 | "execution_start": 1634673168311,
53 | "source_hash": "9c98dc87",
54 | "tags": []
55 | },
56 | "outputs": [],
57 | "source": [
58 | "df['salary_norm'] = (df['salary_as_float'] - df['salary_as_float'].min()) / (df['salary_as_float'].max() - df['salary_as_float'].min())"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "cell_id": "00003-e4cc0c7c-bd5e-4626-8b09-7faece38b632",
66 | "deepnote_cell_type": "code",
67 | "deepnote_to_be_reexecuted": false,
68 | "execution_millis": 10,
69 | "execution_start": 1634673185917,
70 | "source_hash": "c085b6ba",
71 | "tags": []
72 | },
73 | "outputs": [],
74 | "source": [
75 | "df.head()"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "cell_id": "00004-d7c94965-1fa7-458a-b619-9ece1eeb0199",
83 | "deepnote_cell_type": "code",
84 | "deepnote_to_be_reexecuted": false,
85 | "execution_millis": 5,
86 | "execution_start": 1634673269717,
87 | "source_hash": "17f60c52",
88 | "tags": []
89 | },
90 | "outputs": [],
91 | "source": [
92 | "def group_our_row(val):\n",
93 | " if val > .95:\n",
94 | " return \"top\"\n",
95 | " elif val <= .95 and val > .5:\n",
96 | " return \"mid\"\n",
97 | " return \"low\"\n",
98 | "\n",
99 | "df['group'] = df['salary_norm'].apply(group_our_row)"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "cell_id": "00005-72c969b5-62b6-4f97-aa81-94dc40d354af",
107 | "deepnote_cell_type": "code",
108 | "deepnote_to_be_reexecuted": false,
109 | "execution_millis": 13,
110 | "execution_start": 1634673275779,
111 | "source_hash": "c085b6ba",
112 | "tags": []
113 | },
114 | "outputs": [],
115 | "source": [
116 | "df.head()"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "cell_id": "00006-6279936a-01f8-45c5-a8f5-3202c2f88fc7",
124 | "deepnote_cell_type": "code",
125 | "deepnote_to_be_reexecuted": false,
126 | "execution_millis": 7,
127 | "execution_start": 1634673339360,
128 | "source_hash": "f3b0fec6",
129 | "tags": []
130 | },
131 | "outputs": [],
132 | "source": [
133 | "df['group'].value_counts()"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "cell_id": "00007-bbc32c6f-2cba-428c-bcfa-3f0da950ba4f",
141 | "deepnote_cell_type": "code",
142 | "deepnote_to_be_reexecuted": false,
143 | "execution_millis": 17,
144 | "execution_start": 1634673389735,
145 | "source_hash": "8d05d690",
146 | "tags": []
147 | },
148 | "outputs": [],
149 | "source": [
150 | "df.groupby(\"group\")['salary_as_float'].mean()"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "cell_id": "00008-53df8c13-34f7-4dec-8f52-693110591029",
158 | "deepnote_cell_type": "code",
159 | "deepnote_to_be_reexecuted": false,
160 | "execution_millis": 13,
161 | "execution_start": 1634673438596,
162 | "source_hash": "c45ddc98",
163 | "tags": []
164 | },
165 | "outputs": [],
166 | "source": [
167 | "df.groupby(\"group\")['salary_as_float'].mean().apply(lambda x: f\"${x:,.2f}\")"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "cell_id": "00009-e320a2f4-7264-4506-bdbb-91e61a293ceb",
175 | "deepnote_cell_type": "code",
176 | "deepnote_to_be_reexecuted": false,
177 | "execution_millis": 17,
178 | "execution_start": 1634673460506,
179 | "source_hash": "d18ad3de",
180 | "tags": []
181 | },
182 | "outputs": [],
183 | "source": [
184 | "df.groupby(\"group\")['salary_as_float'].sum().apply(lambda x: f\"${x:,.2f}\")"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "cell_id": "00010-e8b9efa3-22bd-452a-bd30-2b2a290c7c9d",
192 | "deepnote_cell_type": "code",
193 | "deepnote_to_be_reexecuted": false,
194 | "execution_millis": 34,
195 | "execution_start": 1634673501615,
196 | "source_hash": "c085b6ba",
197 | "tags": []
198 | },
199 | "outputs": [],
200 | "source": [
201 | "df.head()"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "cell_id": "00011-4abbbef9-1ef7-4457-93e3-8ee48fecbd0b",
209 | "deepnote_cell_type": "code",
210 | "deepnote_to_be_reexecuted": false,
211 | "execution_millis": 11,
212 | "execution_start": 1634673632965,
213 | "source_hash": "d6663e40",
214 | "tags": []
215 | },
216 | "outputs": [],
217 | "source": [
218 | "mean_group_data = df.groupby(\"group\")['salary_as_float'].mean()\n",
219 | "type(mean_group_data)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "cell_id": "00012-94072016-e962-48ff-8077-2c7a58f14d43",
227 | "deepnote_cell_type": "code",
228 | "deepnote_to_be_reexecuted": false,
229 | "execution_millis": 174,
230 | "execution_start": 1634673740679,
231 | "source_hash": "1e4998d2",
232 | "tags": []
233 | },
234 | "outputs": [],
235 | "source": [
236 | "mean_group_data.plot(kind='bar')"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "cell_id": "00013-c3058928-6583-4b3c-8cf2-bf4963d552b8",
244 | "deepnote_cell_type": "code",
245 | "tags": []
246 | },
247 | "outputs": [],
248 | "source": []
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {
253 | "created_in_deepnote_cell": true,
254 | "deepnote_cell_type": "markdown",
255 | "tags": []
256 | },
257 | "source": [
258 | "\n",
259 | "
\n",
260 | "Created in Deepnote"
261 | ]
262 | }
263 | ],
264 | "metadata": {
265 | "deepnote": {
266 | "is_reactive": false
267 | },
268 | "deepnote_execution_queue": [],
269 | "deepnote_notebook_id": "cf991719-f636-434d-85ce-17792dadcb6e"
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 2
273 | }
274 |
--------------------------------------------------------------------------------
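The salary_norm column above is standard min-max scaling to [0, 1]: subtract the minimum, then divide by the range. A minimal sketch on toy data:

    import pandas as pd

    df = pd.DataFrame({"salary_as_float": [100_000.0, 500_000.0, 1_000_000.0]})
    s = df["salary_as_float"]
    df["salary_norm"] = (s - s.min()) / (s.max() - s.min())  # 0.0, ~0.444, 1.0
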
/nbs_ref/7 - Clean Real Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-6184a4d0-7989-48ea-a1d7-229bbb45c9c4",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 43,
11 | "execution_start": 1634674284536,
12 | "source_hash": "771ec97",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import pathlib\n",
18 | "import pandas as pd\n",
19 | "import utils\n",
20 | "\n",
21 | "BASE_DIR = pathlib.Path().resolve().parent\n",
22 | "COURSE_DIR = BASE_DIR / \"course\"\n",
23 | "DATASET_DIR = COURSE_DIR / \"datasets\"\n",
24 | "INPUT_PATH = DATASET_DIR / 'nba-historical-salaries.csv' # appendix a"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {
31 | "cell_id": "00001-b13827b3-ba63-4a27-98cd-75ea460dc229",
32 | "deepnote_cell_type": "code",
33 | "deepnote_to_be_reexecuted": false,
34 | "execution_millis": 45,
35 | "execution_start": 1634674284580,
36 | "source_hash": "cdb9e8e8",
37 | "tags": []
38 | },
39 | "outputs": [],
40 | "source": [
41 | "df = pd.read_csv(INPUT_PATH)\n",
42 | "df.head()"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {
49 | "cell_id": "00002-98de7395-c20b-4112-9c50-f0343a582927",
50 | "deepnote_cell_type": "code",
51 | "deepnote_to_be_reexecuted": false,
52 | "execution_millis": 2719,
53 | "execution_start": 1634674463776,
54 | "source_hash": "451b0cc8",
55 | "tags": []
56 | },
57 | "outputs": [],
58 | "source": [
59 | "def clean_row(row):\n",
60 | " # this is a pandas series\n",
61 | " cols = ['salary', 'adj_salary']\n",
62 | " for col in cols:\n",
63 | " row[col] = utils.dollar_str_to_float(row[col])\n",
64 | " return row\n",
65 | "\n",
66 | "df_cleaned = df.copy().apply(clean_row, axis=1)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "cell_id": "00003-eb652a22-ff35-4416-bbd0-866a2241418f",
74 | "deepnote_cell_type": "code",
75 | "deepnote_to_be_reexecuted": false,
76 | "execution_millis": 46,
77 | "execution_start": 1634674405676,
78 | "source_hash": "79cfa8a3",
79 | "tags": []
80 | },
81 | "outputs": [],
82 | "source": [
83 | "df_cleaned.head()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "cell_id": "00004-cafe7973-4639-45eb-8b62-baa02fb76924",
91 | "deepnote_cell_type": "code",
92 | "deepnote_to_be_reexecuted": false,
93 | "execution_millis": 2,
94 | "execution_start": 1634674498860,
95 | "source_hash": "fa21f23c",
96 | "tags": []
97 | },
98 | "outputs": [],
99 | "source": [
100 | "df_cleaned['salary'].dtype"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "cell_id": "00005-02c9e2fc-11f1-4403-b922-ab088a21921d",
108 | "deepnote_cell_type": "code",
109 | "deepnote_to_be_reexecuted": false,
110 | "execution_millis": 0,
111 | "execution_start": 1634674907005,
112 | "source_hash": "8a728a42",
113 | "tags": []
114 | },
115 | "outputs": [],
116 | "source": [
117 | "player_per_year = df_cleaned.groupby('year-start')['year-end'].value_counts()"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {
124 | "cell_id": "00006-e7a1d785-6db7-41fe-a8d4-599239a49b3d",
125 | "deepnote_cell_type": "code",
126 | "deepnote_to_be_reexecuted": false,
127 | "execution_millis": 332,
128 | "execution_start": 1634674923778,
129 | "source_hash": "7ac234a2",
130 | "tags": []
131 | },
132 | "outputs": [],
133 | "source": [
134 | "player_per_year.plot(title='# of Players per Year')"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "cell_id": "00007-d3746b2c-d81a-44ee-bfce-4e5d281dc32c",
142 | "deepnote_cell_type": "code",
143 | "deepnote_to_be_reexecuted": false,
144 | "execution_millis": 16,
145 | "execution_start": 1634675071018,
146 | "source_hash": "39dba7f8",
147 | "tags": []
148 | },
149 | "outputs": [],
150 | "source": [
151 | "adj_salaries = df_cleaned.groupby('year-start')['adj_salary'].mean()\n",
152 | "adj_salaries"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {
159 | "cell_id": "00008-3ca443e2-a570-4d3d-ae29-b544ee0051e3",
160 | "deepnote_cell_type": "code",
161 | "deepnote_to_be_reexecuted": false,
162 | "execution_millis": 235,
163 | "execution_start": 1634675106167,
164 | "source_hash": "bec8be5d",
165 | "tags": []
166 | },
167 | "outputs": [],
168 | "source": [
169 | "adj_salaries.plot(title='Adj Average Salary over Time')"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {
176 | "cell_id": "00009-7171f951-b87c-4a04-92b6-21ee1e66530c",
177 | "deepnote_cell_type": "code",
178 | "deepnote_to_be_reexecuted": false,
179 | "execution_millis": 269,
180 | "execution_start": 1634675166491,
181 | "source_hash": "d4097160",
182 | "tags": []
183 | },
184 | "outputs": [],
185 | "source": [
186 | "adj_salaries_sum = df_cleaned.groupby('year-start')['adj_salary'].sum()\n",
187 | "adj_salaries_sum.plot()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "cell_id": "00010-a3c81557-18a9-4f76-b9c2-03b1784e10ae",
195 | "deepnote_cell_type": "code",
196 | "tags": []
197 | },
198 | "outputs": [],
199 | "source": []
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {
204 | "created_in_deepnote_cell": true,
205 | "deepnote_cell_type": "markdown",
206 | "tags": []
207 | },
208 | "source": [
209 | "\n",
210 | "
\n",
211 | "Created in Deepnote"
212 | ]
213 | }
214 | ],
215 | "metadata": {
216 | "deepnote": {
217 | "is_reactive": false
218 | },
219 | "deepnote_execution_queue": [],
220 | "deepnote_notebook_id": "07b99e53-6ee8-425b-9587-4e24c7c22fc9"
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 2
224 | }
225 |
--------------------------------------------------------------------------------
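The per-season aggregates in the notebook above each reduce to one groupby over year-start; a minimal sketch on toy data:

    import pandas as pd

    df = pd.DataFrame({"year-start": [1990, 1990, 1991], "adj_salary": [1.0, 3.0, 2.0]})
    df.groupby("year-start")["adj_salary"].mean()
    # year-start
    # 1990    2.0
    # 1991    2.0
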
/nbs_ref/8 - Merge Datasets.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "cell_id": "00000-b6e94cba-c4cd-42da-8208-cfa28665606a",
8 | "deepnote_cell_type": "code",
9 | "deepnote_to_be_reexecuted": false,
10 | "execution_millis": 8,
11 | "execution_start": 1634678578677,
12 | "source_hash": "59e619dd",
13 | "tags": []
14 | },
15 | "outputs": [],
16 | "source": [
17 | "import datetime\n",
18 | "import pathlib\n",
19 | "import pandas as pd\n",
20 | "import utils\n",
21 | "\n",
22 | "BASE_DIR = pathlib.Path().resolve().parent\n",
23 | "COURSE_DIR = BASE_DIR / \"course\"\n",
24 | "DATASET_DIR = COURSE_DIR / \"datasets\"\n",
25 | "SAMPLES_DIR = COURSE_DIR / \"samples\"\n",
26 | "INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'\n",
27 | "INFLATION_DATA_INPUT_PATH = DATASET_DIR / 'inflation-rate.csv' # appendix b"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "cell_id": "00001-1c6dfc81-739b-46f9-b4eb-9b196fd0ea2e",
35 | "deepnote_cell_type": "code",
36 | "deepnote_to_be_reexecuted": false,
37 | "execution_millis": 30,
38 | "execution_start": 1634676577391,
39 | "source_hash": "1249c42",
40 | "tags": []
41 | },
42 | "outputs": [],
43 | "source": [
44 | "df = pd.read_csv(INPUT_PATH)\n",
45 | "inflation_df = pd.read_csv(INFLATION_DATA_INPUT_PATH)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {
52 | "cell_id": "00002-d5211440-03f4-45a1-9fe1-e51e60032257",
53 | "deepnote_cell_type": "code",
54 | "deepnote_to_be_reexecuted": false,
55 | "execution_millis": 3,
56 | "execution_start": 1634676577495,
57 | "source_hash": "1d1d0047",
58 | "tags": []
59 | },
60 | "outputs": [],
61 | "source": [
62 | "inflation_df.head()"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {
69 | "cell_id": "00003-7be75291-9b75-4f89-8575-395e572cf507",
70 | "deepnote_cell_type": "code",
71 | "deepnote_to_be_reexecuted": false,
72 | "execution_millis": 21,
73 | "execution_start": 1634676577496,
74 | "source_hash": "c085b6ba",
75 | "tags": []
76 | },
77 | "outputs": [],
78 | "source": [
79 | "df.head()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "cell_id": "00003-bfa3f414-9661-4507-9161-8254062d455b",
87 | "deepnote_cell_type": "code",
88 | "deepnote_to_be_reexecuted": false,
89 | "execution_millis": 2,
90 | "execution_start": 1634676788998,
91 | "source_hash": "c65297e4",
92 | "tags": []
93 | },
94 | "outputs": [],
95 | "source": [
96 | "og_salary = df.iloc[0]['salary'] # \n",
97 | "adj_salary = df.iloc[0]['adj_salary']\n",
98 | "year_start =df.iloc[0]['year_start']\n",
99 | "year_end =df.iloc[0]['year_end']"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "cell_id": "00004-0b860d70-f703-4c2b-9a76-ffdbb053e963",
107 | "deepnote_cell_type": "code",
108 | "deepnote_to_be_reexecuted": false,
109 | "execution_millis": 1,
110 | "execution_start": 1634676683383,
111 | "source_hash": "87e09398",
112 | "tags": []
113 | },
114 | "outputs": [],
115 | "source": [
116 | "inflation_df.set_index(\"date\", inplace=True)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "cell_id": "00005-94068da7-b63e-47de-bff3-e11e1650af85",
124 | "deepnote_cell_type": "code",
125 | "deepnote_to_be_reexecuted": false,
126 | "execution_millis": 2,
127 | "execution_start": 1634676921931,
128 | "source_hash": "88a8f6df",
129 | "tags": []
130 | },
131 | "outputs": [],
132 | "source": [
133 | "multiplier = float(inflation_df[f'{year_start-1}': f'{year_end-1}']['multiplier'])"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "cell_id": "00006-ab40009d-b5fb-4476-87d2-6ef1d80f7033",
141 | "deepnote_cell_type": "code",
142 | "deepnote_to_be_reexecuted": false,
143 | "execution_millis": 4,
144 | "execution_start": 1634676923367,
145 | "source_hash": "8be8d4a1",
146 | "tags": []
147 | },
148 | "outputs": [],
149 | "source": [
150 | "(multiplier * og_salary) - adj_salary"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "cell_id": "00011-073931b9-f5de-42db-8816-1b38ab49b020",
158 | "deepnote_cell_type": "code",
159 | "deepnote_to_be_reexecuted": false,
160 | "execution_millis": 0,
161 | "execution_start": 1634678420331,
162 | "source_hash": "7e1a403f",
163 | "tags": []
164 | },
165 | "outputs": [],
166 | "source": [
167 | "def cal_adj_salary_2(row):\n",
168 | " og_salary = row['salary'] # \n",
169 | " adj_salary = row['adj_salary']\n",
170 | " year_start =row['year_start']\n",
171 | " year_end =row['year_end']\n",
172 | " multiplier = float(inflation_df[f'{year_start-1}': f'{year_end-1}']['multiplier'])\n",
173 | " row['adj_salary_2'] = (multiplier * og_salary)\n",
174 | " return row\n",
175 | "\n",
176 | "# df.apply(cal_adj_salary_2, axis=1)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "cell_id": "00009-27db00c0-6f60-45de-8e91-b5858052beb4",
184 | "deepnote_cell_type": "code",
185 | "deepnote_to_be_reexecuted": false,
186 | "execution_millis": 3,
187 | "execution_start": 1634678445812,
188 | "source_hash": "65afa563",
189 | "tags": []
190 | },
191 | "outputs": [],
192 | "source": [
193 | "inflation_df.reset_index(inplace=True, drop=False) "
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "cell_id": "00010-ff3f5c94-531f-4293-8ab8-c06ffc0fe23d",
201 | "deepnote_cell_type": "code",
202 | "deepnote_to_be_reexecuted": false,
203 | "execution_millis": 5,
204 | "execution_start": 1634678455068,
205 | "source_hash": "1d36096",
206 | "tags": []
207 | },
208 | "outputs": [],
209 | "source": [
210 | "inflation_df.columns\n"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {
217 | "cell_id": "00011-bf991f67-55b0-4e01-99e5-a97d51e63c13",
218 | "deepnote_cell_type": "code",
219 | "deepnote_to_be_reexecuted": false,
220 | "execution_millis": 3,
221 | "execution_start": 1634678503692,
222 | "source_hash": "e7234db2",
223 | "tags": []
224 | },
225 | "outputs": [],
226 | "source": [
227 | "df.merge(inflation_df, left_on='year_start', right_on='date')"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "cell_id": "00012-948e47fa-6cd4-4576-ad0c-305e8261329d",
235 | "deepnote_cell_type": "code",
236 | "deepnote_to_be_reexecuted": false,
237 | "execution_millis": 2,
238 | "execution_start": 1634678672675,
239 | "source_hash": "ed6039a1",
240 | "tags": []
241 | },
242 | "outputs": [],
243 | "source": [
244 | "df['date'].dtype"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {
251 | "cell_id": "00013-6182c67d-299b-4087-bdf1-cd3b826d2841",
252 | "deepnote_cell_type": "code",
253 | "deepnote_to_be_reexecuted": false,
254 | "execution_millis": 6,
255 | "execution_start": 1634678666293,
256 | "source_hash": "f473fc50",
257 | "tags": []
258 | },
259 | "outputs": [],
260 | "source": [
261 | "inflation_df['date'].dtype"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {
268 | "cell_id": "00014-d4d9e9bc-e347-4f98-8a68-e0313dc428da",
269 | "deepnote_cell_type": "code",
270 | "deepnote_to_be_reexecuted": false,
271 | "execution_millis": 253,
272 | "execution_start": 1634678659184,
273 | "source_hash": "b1530698",
274 | "tags": []
275 | },
276 | "outputs": [],
277 | "source": [
278 | "df['date'] = df['year_start'].apply(lambda x: datetime.datetime.strptime(f\"{x}-12-31\", \"%Y-%m-%d\"))\n",
279 | "inflation_df['date'] = inflation_df['date'].apply(lambda x: datetime.datetime.strptime(f\"{x}\", \"%Y-%m-%d\"))\n"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "cell_id": "00015-1d51e6da-d2f4-469e-812b-46d2df48b9c3",
287 | "deepnote_cell_type": "code",
288 | "deepnote_to_be_reexecuted": false,
289 | "execution_millis": 1,
290 | "execution_start": 1634678723983,
291 | "source_hash": "24afa461",
292 | "tags": []
293 | },
294 | "outputs": [],
295 | "source": [
296 | "merged_df = df.merge(inflation_df, left_on='date', right_on='date')"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "metadata": {
303 | "cell_id": "00016-265717d2-93cb-49a1-92b4-c7a8ea0957e8",
304 | "deepnote_cell_type": "code",
305 | "deepnote_to_be_reexecuted": false,
306 | "execution_millis": 17,
307 | "execution_start": 1634678732032,
308 | "source_hash": "89beb45",
309 | "tags": []
310 | },
311 | "outputs": [],
312 | "source": [
313 | "merged_df.head()"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {
320 | "cell_id": "00017-46d049c9-c2fe-46f6-9080-3307ab2c0c93",
321 | "deepnote_cell_type": "code",
322 | "deepnote_to_be_reexecuted": false,
323 | "execution_millis": 370,
324 | "execution_start": 1634678798480,
325 | "source_hash": "2889b9fe",
326 | "tags": []
327 | },
328 | "outputs": [],
329 | "source": [
330 | "merged_df['adj_salary_audit'] = merged_df['salary'] * merged_df['multiplier']"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {
337 | "cell_id": "00018-6de37f92-4158-4bbe-a84e-2ff627983cc7",
338 | "deepnote_cell_type": "code",
339 | "deepnote_to_be_reexecuted": false,
340 | "execution_millis": 50,
341 | "execution_start": 1634678804259,
342 | "source_hash": "89beb45",
343 | "tags": []
344 | },
345 | "outputs": [],
346 | "source": [
347 | "merged_df.head()"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": null,
353 | "metadata": {
354 | "cell_id": "00019-e18192c1-603c-45ee-99f2-e66dfb71c753",
355 | "deepnote_cell_type": "code",
356 | "deepnote_to_be_reexecuted": false,
357 | "execution_millis": 1,
358 | "execution_start": 1634678934679,
359 | "source_hash": "2c2e27e2",
360 | "tags": []
361 | },
362 | "outputs": [],
363 | "source": [
364 | "merged_df['delta'] = merged_df['adj_salary'] - merged_df['adj_salary_audit']"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "metadata": {
371 | "cell_id": "00019-6cecd6cc-e8c1-45be-8855-347572f58515",
372 | "deepnote_cell_type": "code",
373 | "deepnote_to_be_reexecuted": false,
374 | "execution_millis": 2,
375 | "execution_start": 1634678946003,
376 | "source_hash": "2019aaf6",
377 | "tags": []
378 | },
379 | "outputs": [],
380 | "source": [
381 | "merged_df['delta'].sum()"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {
388 | "cell_id": "00021-acc29beb-a865-42ae-beaf-bbdc07225e6e",
389 | "deepnote_cell_type": "code",
390 | "deepnote_to_be_reexecuted": false,
391 | "execution_millis": 8,
392 | "execution_start": 1634678984535,
393 | "source_hash": "d377b739",
394 | "tags": []
395 | },
396 | "outputs": [],
397 | "source": [
398 | "f\"{merged_df['adj_salary'].sum():,.2f}\""
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "metadata": {
405 | "cell_id": "00022-b8840192-32e2-43ab-8809-60796273a018",
406 | "deepnote_cell_type": "code",
407 | "deepnote_to_be_reexecuted": false,
408 | "execution_millis": 9,
409 | "execution_start": 1634679013094,
410 | "source_hash": "455f7aa7",
411 | "tags": []
412 | },
413 | "outputs": [],
414 | "source": [
415 | "merged_df['adj_salary'].sum() - merged_df['delta'].sum()"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {
422 | "cell_id": "00023-1ee187d6-0d07-4bb0-a760-9edfc993963c",
423 | "deepnote_cell_type": "code",
424 | "tags": []
425 | },
426 | "outputs": [],
427 | "source": []
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {
432 | "created_in_deepnote_cell": true,
433 | "deepnote_cell_type": "markdown",
434 | "tags": []
435 | },
436 | "source": [
437 | "\n",
438 | "
\n",
439 | "Created in Deepnote"
440 | ]
441 | }
442 | ],
443 | "metadata": {
444 | "deepnote": {
445 | "is_reactive": false
446 | },
447 | "deepnote_execution_queue": [],
448 | "deepnote_notebook_id": "7599b9fd-85af-4ebd-a7ee-55058917791d"
449 | },
450 | "nbformat": 4,
451 | "nbformat_minor": 2
452 | }
453 |
--------------------------------------------------------------------------------
/nbs_ref/9 - Using an NBA Stats API.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "cell_id": "00000-cb991b88-6ed2-45ac-9f9e-f0a764026d74",
7 | "deepnote_cell_type": "markdown",
8 | "tags": []
9 | },
10 | "source": [
11 | "Let's use the free [balldontlie.io](https://www.balldontlie.io/) API for extracting a new dataset!"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {
18 | "cell_id": "00000-3b0814d8-1a58-447d-9b49-aa092239ee41",
19 | "deepnote_cell_type": "code",
20 | "deepnote_to_be_reexecuted": false,
21 | "execution_millis": 88,
22 | "execution_start": 1634749327463,
23 | "source_hash": "9af06d13",
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import requests\n",
29 | "import datetime\n",
30 | "import pathlib\n",
31 | "import pandas as pd\n",
32 | "import time\n",
33 | "import utils\n",
34 | "\n",
35 | "BASE_DIR = pathlib.Path().resolve().parent\n",
36 | "COURSE_DIR = BASE_DIR / \"course\"\n",
37 | "DATASET_DIR = COURSE_DIR / \"datasets\"\n",
38 | "SAMPLES_DIR = COURSE_DIR / \"samples\"\n",
39 | "INPUT_PATH = SAMPLES_DIR / '4-player-salaries-cleaned.csv'\n",
40 | "salary_df = pd.read_csv(INPUT_PATH)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "cell_id": "00002-342871ca-eb44-4928-845c-6dd854374ca5",
48 | "deepnote_cell_type": "code",
49 | "deepnote_to_be_reexecuted": false,
50 | "execution_millis": 0,
51 | "execution_start": 1634749327600,
52 | "source_hash": "9811f36f",
53 | "tags": []
54 | },
55 | "outputs": [],
56 | "source": [
57 | "players_endpoint = \"https://www.balldontlie.io/api/v1/players?per_page=100&page=0\"\n",
58 | "stats_endpoint = f'https://www.balldontlie.io/api/v1/stats'"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "cell_id": "00003-f338b11d-5033-419d-91dc-b49a25aec495",
66 | "deepnote_cell_type": "code",
67 | "deepnote_to_be_reexecuted": false,
68 | "execution_millis": 3760665,
69 | "execution_start": 1634749327600,
70 | "source_hash": "4b520d41",
71 | "tags": []
72 | },
73 | "outputs": [],
74 | "source": [
75 | "# !curl \"https://www.balldontlie.io/api/v1/players?per_page=100\""
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "cell_id": "00009-740edd60-cad4-4a68-9d27-27303ce274ad",
83 | "deepnote_cell_type": "code",
84 | "deepnote_to_be_reexecuted": false,
85 | "execution_millis": 0,
86 | "execution_start": 1634749327601,
87 | "source_hash": "59b00187",
88 | "tags": []
89 | },
90 | "outputs": [],
91 | "source": [
92 | "def get_players_dataset(per_page=100):\n",
93 | " dataset = []\n",
94 | " base_url = \"https://www.balldontlie.io/api/v1/players\"\n",
95 | " init_url = f\"{base_url}?per_page={per_page}\"\n",
96 | " r = requests.get(init_url)\n",
97 | " if not r.status_code in range(200, 299):\n",
98 | " return []\n",
99 | " json_data = r.json()\n",
100 | " meta_data = json_data['meta']\n",
101 | " total_pages = int(meta_data.get('total_pages'))\n",
102 | " for x in range(0, total_pages + 1):\n",
103 | " time.sleep(0.25)\n",
104 | " url = f\"{base_url}?per_page={per_page}&page={x}\"\n",
105 | " r = requests.get(url)\n",
106 | " if not r.status_code in range(200, 299):\n",
107 | " print('skipping')\n",
108 | " continue\n",
109 | " json_data = r.json()\n",
110 | " data = json_data['data']\n",
111 | " # dataset.append(data)\n",
112 | " dataset += data\n",
113 | " return dataset"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "cell_id": "00010-1710a7bc-14d4-420b-bf3e-cc9779f71535",
121 | "deepnote_cell_type": "code",
122 | "deepnote_to_be_reexecuted": false,
123 | "execution_millis": 12194,
124 | "execution_start": 1634749327643,
125 | "source_hash": "59be25f8",
126 | "tags": []
127 | },
128 | "outputs": [],
129 | "source": [
130 | "players_dataset = get_players_dataset()"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {
137 | "cell_id": "00011-f7353f2f-db7c-4ec1-962c-644e35454a5c",
138 | "deepnote_cell_type": "code",
139 | "deepnote_to_be_reexecuted": false,
140 | "execution_millis": 87,
141 | "execution_start": 1634749577130,
142 | "source_hash": "27076b9b",
143 | "tags": []
144 | },
145 | "outputs": [],
146 | "source": [
147 | "player_df = pd.DataFrame(players_dataset)[['id', 'first_name', 'last_name']]\n",
148 | "player_df['full_name'] = player_df['first_name'] + \" \" + player_df['last_name']\n",
149 | "player_df.drop_duplicates(subset=['id'], inplace=True)\n",
150 | "player_df.head()"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "cell_id": "00015-46e44e13-d060-4e26-8eec-5f724f6f6832",
158 | "deepnote_cell_type": "code",
159 | "deepnote_to_be_reexecuted": false,
160 | "execution_millis": 7,
161 | "execution_start": 1634749596218,
162 | "source_hash": "6e624904",
163 | "tags": []
164 | },
165 | "outputs": [],
166 | "source": [
167 | "player_df.shape"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "cell_id": "00016-29a87eff-a101-4ff3-bdec-85fae9f9e4ba",
175 | "deepnote_cell_type": "code",
176 | "deepnote_to_be_reexecuted": false,
177 | "execution_millis": 0,
178 | "execution_start": 1634749339938,
179 | "source_hash": "b623e53d",
180 | "tags": []
181 | },
182 | "outputs": [],
183 | "source": []
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {
189 | "cell_id": "00017-f79da473-1056-44bc-b9d1-eae843bbc5cf",
190 | "deepnote_cell_type": "code",
191 | "deepnote_to_be_reexecuted": false,
192 | "execution_millis": 2530419,
193 | "execution_start": 1634749339939,
194 | "source_hash": "b623e53d",
195 | "tags": []
196 | },
197 | "outputs": [],
198 | "source": []
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {
204 | "cell_id": "00018-90c65eb6-08a0-48a7-883b-12d8ff089708",
205 | "deepnote_cell_type": "code",
206 | "deepnote_to_be_reexecuted": false,
207 | "execution_millis": 2420384,
208 | "execution_start": 1634749339984,
209 | "source_hash": "9a5e2c99",
210 | "tags": []
211 | },
212 | "outputs": [],
213 | "source": [
214 | "def get_stats(player_id=1, postseason=False, per_page=100):\n",
215 | " dataset = []\n",
216 | " postseason_param = \"true\" if postseason else \"false\"\n",
217 | " base_url = f\"https://www.balldontlie.io/api/v1/stats?player_ids[]={player_id}&postseason={postseason_param}\"\n",
218 | " init_url = f\"{base_url}&per_page={per_page}\"\n",
219 | " r = requests.get(init_url)\n",
220 | " if not r.status_code in range(200, 299):\n",
221 | " return []\n",
222 | " json_data = r.json()\n",
223 | " meta_data = json_data['meta']\n",
224 | " total_pages = int(meta_data.get('total_pages'))\n",
225 | " for x in range(0, total_pages + 1):\n",
226 | " time.sleep(0.25)\n",
227 | " url = f\"{base_url}&per_page={per_page}&page={x}\"\n",
228 | " r = requests.get(url)\n",
229 | " if not r.status_code in range(200, 299):\n",
230 | " print('skipping')\n",
231 | " continue\n",
232 | " json_data = r.json()\n",
233 | " data = json_data['data']\n",
234 | " # dataset.append(data)\n",
235 | " dataset += data\n",
236 | " return dataset"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "cell_id": "00012-30978274-c083-430f-9461-b91981687a58",
244 | "deepnote_cell_type": "code",
245 | "deepnote_to_be_reexecuted": false,
246 | "execution_millis": 1,
247 | "execution_start": 1634749501803,
248 | "source_hash": "fe3ddbb4",
249 | "tags": []
250 | },
251 | "outputs": [],
252 | "source": [
253 | "NESTED_STATS_COLS = ['game', 'team', 'player']\n",
254 | "\n",
255 | "def unpack_nested_dict(row):\n",
256 | " for col in NESTED_STATS_COLS:\n",
257 | " col_val = row[col] # row['game']\n",
258 | " if isinstance(col_val, dict):\n",
259 | " for key, val in col_val.items():\n",
260 | " new_col_key = f\"{col}_{key}\"\n",
261 | " # game_id\n",
262 | " # game_period\n",
263 | " # game_status\n",
264 | " row[new_col_key] = val\n",
265 | " return row\n",
266 | "\n",
267 | "def get_second_played(val):\n",
268 | " h, m, s = 0,0,0\n",
269 | " if val:\n",
270 | " time_string = val.split(\":\") # always create a list\n",
271 | " if len(time_string) == 2:\n",
272 | " m, s = time_string\n",
273 | " if len(time_string) == 3:\n",
274 | " h, m, s = time_string\n",
275 | " if len(time_string) == 1:\n",
276 | " m = time_string[0]\n",
277 | " if f\"{h}\".isdigit():\n",
278 | " h = int(h)\n",
279 | " if f\"{m}\".isdigit():\n",
280 | " m = int(m)\n",
281 | " if f\"{s}\".isdigit():\n",
282 | " s = int(s)\n",
283 | " return datetime.timedelta(hours=h, minutes=m, seconds=s).total_seconds()\n",
284 | "\n",
285 | "def get_stats_df(stats_dataset):\n",
286 | " if len(stats_dataset) == 0:\n",
287 | " return pd.DataFrame()\n",
288 | " df = pd.DataFrame(stats_dataset)\n",
289 | " df = df.apply(unpack_nested_dict, axis=1)\n",
290 | " df.drop(columns=NESTED_STATS_COLS, inplace=True)\n",
291 | " if \"game_date\" in df.columns:\n",
292 | " df['date'] = pd.to_datetime(df['game_date'])\n",
293 | " df['year'] = df['date'].apply(lambda x: x.year)\n",
294 | " if \"min\" in df.columns:\n",
295 | " df['seconds'] = df['min'].apply(get_second_played)\n",
296 | " df['did_play'] = df['seconds'].apply(lambda x: x > 0)\n",
297 | " df.drop_duplicates(subset=['id'], inplace=True)\n",
298 | " return df"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "cell_id": "00012-8825c3ec-c1d8-4867-9b96-e8ecd95dbbe9",
306 | "deepnote_cell_type": "code",
307 | "deepnote_to_be_reexecuted": false,
308 | "execution_millis": 7,
309 | "execution_start": 1634749372390,
310 | "source_hash": "5256c07f",
311 | "tags": []
312 | },
313 | "outputs": [],
314 | "source": [
315 | "# player_id = player_df.sample(n=1)['id'].item()\n",
316 | "# player_id"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {
323 | "cell_id": "00013-f184159c-3491-447d-b854-5ddbba5940a7",
324 | "deepnote_cell_type": "code",
325 | "deepnote_to_be_reexecuted": false,
326 | "execution_millis": 3,
327 | "execution_start": 1634749714102,
328 | "source_hash": "b6aefb90",
329 | "tags": []
330 | },
331 | "outputs": [],
332 | "source": [
333 | "name = 'Michael Jordan'\n",
334 | "player = player_df[player_df[\"full_name\"] == name]\n",
335 | "player_id = 0\n",
336 | "\n",
337 | "if not player.empty:\n",
338 | " player_id = player['id'].item()\n",
339 | "\n",
340 | "player_id"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "cell_id": "00011-662f1c28-2461-48bb-b976-6b9852b5cb7f",
348 | "deepnote_cell_type": "code",
349 | "deepnote_to_be_reexecuted": false,
350 | "execution_millis": 8181,
351 | "execution_start": 1634749722803,
352 | "source_hash": "a573dd94",
353 | "tags": []
354 | },
355 | "outputs": [],
356 | "source": [
357 | "reg_season_stats = get_stats(player_id=player_id, postseason=False)\n",
358 | "post_season_stats = get_stats(player_id=player_id, postseason=True)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "cell_id": "00013-37085edb-190d-4349-90e7-c2f9f2b16c18",
366 | "deepnote_cell_type": "code",
367 | "deepnote_to_be_reexecuted": false,
368 | "execution_millis": 36408,
369 | "execution_start": 1634749731021,
370 | "source_hash": "1b7daed3",
371 | "tags": []
372 | },
373 | "outputs": [],
374 | "source": [
375 | "reg_season_df = get_stats_df(reg_season_stats)\n",
376 | "post_season_df = get_stats_df(post_season_stats)"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {
383 | "cell_id": "00014-cb51df0a-dc70-4e78-a818-ec97e7404a54",
384 | "deepnote_cell_type": "code",
385 | "deepnote_to_be_reexecuted": false,
386 | "execution_millis": 204,
387 | "execution_start": 1634749767443,
388 | "source_hash": "4c028ab8",
389 | "tags": []
390 | },
391 | "outputs": [],
392 | "source": [
393 | "reg_season_df.head()"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {
400 | "cell_id": "00016-718161f6-0116-4be8-8494-4eb4dea76e80",
401 | "deepnote_cell_type": "code",
402 | "deepnote_to_be_reexecuted": false,
403 | "execution_millis": 10,
404 | "execution_start": 1634749810520,
405 | "source_hash": "c5f00695",
406 | "tags": []
407 | },
408 | "outputs": [],
409 | "source": [
410 | "post_season_df.shape"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {
417 | "cell_id": "00018-681c6d19-666d-4e0c-b4ad-e525d27f260a",
418 | "deepnote_cell_type": "code",
419 | "deepnote_to_be_reexecuted": false,
420 | "execution_millis": 9,
421 | "execution_start": 1634750145075,
422 | "source_hash": "cb04ba6c",
423 | "tags": []
424 | },
425 | "outputs": [],
426 | "source": [
427 | "avg_pts_per_year = reg_season_df.groupby('year')['pts'].mean()\n",
428 | "# avg_pts_per_year"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {
435 | "cell_id": "00019-290df566-fec7-4b24-a063-4d35d0e2240f",
436 | "deepnote_cell_type": "code",
437 | "deepnote_to_be_reexecuted": false,
438 | "execution_millis": 1,
439 | "execution_start": 1634750084879,
440 | "source_hash": "c574989f",
441 | "tags": []
442 | },
443 | "outputs": [],
444 | "source": [
445 | "avg_pts_per_year_per_postseason = post_season_df.groupby('year')['pts'].mean()\n",
446 | "# avg_pts_per_year_per_postseason"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "cell_id": "00020-fd615d8f-285f-46d2-8537-e175c3e50bf5",
454 | "deepnote_cell_type": "code",
455 | "deepnote_to_be_reexecuted": false,
456 | "execution_millis": 3,
457 | "execution_start": 1634750134937,
458 | "source_hash": "b623e53d",
459 | "tags": []
460 | },
461 | "outputs": [],
462 | "source": [
463 | "player_salary_df = salary_df.copy()[salary_df['player'] == name][['adj_salary', 'year_start']]\n",
464 | "player_salary_df.head(n=20)"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {
471 | "cell_id": "00021-0139f72a-d42f-4cdc-b9aa-3b5b3b10fba9",
472 | "deepnote_cell_type": "code",
473 | "deepnote_to_be_reexecuted": false,
474 | "execution_millis": 3,
475 | "execution_start": 1634750199585,
476 | "source_hash": "b26c02e0",
477 | "tags": []
478 | },
479 | "outputs": [],
480 | "source": [
481 | "mean_df = pd.DataFrame(avg_pts_per_year)\n",
482 | "mean_df.reset_index(drop=False, inplace=True)\n",
483 | "mean_df.head()"
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": null,
489 | "metadata": {
490 | "cell_id": "00022-b1d8c3af-6667-440d-a216-86ad40d5265f",
491 | "deepnote_cell_type": "code",
492 | "deepnote_to_be_reexecuted": false,
493 | "execution_millis": 48,
494 | "execution_start": 1634750305065,
495 | "source_hash": "b1c02bae",
496 | "tags": []
497 | },
498 | "outputs": [],
499 | "source": [
500 | "merged_df = mean_df.merge(player_salary_df, left_on='year', right_on='year_start')\n",
501 | "merged_df.drop(columns=['year_start'], inplace=True)\n",
502 | "merged_df['adj_salary_$'] = merged_df['adj_salary'].apply(utils.float_to_dollars)\n",
503 | "merged_df.head(n=100)"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": null,
509 | "metadata": {
510 | "cell_id": "00023-5ddbc582-c278-4080-a907-52110cadf5ea",
511 | "deepnote_cell_type": "code",
512 | "tags": []
513 | },
514 | "outputs": [],
515 | "source": []
516 | },
517 | {
518 | "cell_type": "markdown",
519 | "metadata": {
520 | "created_in_deepnote_cell": true,
521 | "deepnote_cell_type": "markdown",
522 | "tags": []
523 | },
524 | "source": [
525 | "\n",
526 | "
\n",
527 | "Created in Deepnote"
528 | ]
529 | }
530 | ],
531 | "metadata": {
532 | "deepnote": {
533 | "is_reactive": false
534 | },
535 | "deepnote_execution_queue": [],
536 | "deepnote_notebook_id": "16cecac8-66ad-4d39-8a6c-73c2d96fd581"
537 | },
538 | "nbformat": 4,
539 | "nbformat_minor": 2
540 | }
541 |
--------------------------------------------------------------------------------
/nbs_ref/temp.csv:
--------------------------------------------------------------------------------
1 | ,number,time,added_by
2 | 0,0,2021-10-19 16:21:04.142161,Justin
3 | 1,1,2021-10-19 16:21:04.142165,Justin
4 | 2,2,2021-10-19 16:21:04.142167,Justin
5 | 3,3,2021-10-19 16:21:04.142168,Justin
6 | 4,4,2021-10-19 16:21:04.142169,Justin
7 | 5,5,2021-10-19 16:21:04.142170,Justin
8 | 6,6,2021-10-19 16:21:04.142171,Justin
9 | 7,7,2021-10-19 16:21:04.142173,Justin
10 | 8,8,2021-10-19 16:21:04.142174,Justin
11 | 9,9,2021-10-19 16:21:04.142175,Justin
12 |
--------------------------------------------------------------------------------
/nbs_ref/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Below are various simple utility methods we use in many of the notebooks.
3 | To use them, just:
4 |
5 | import utils
6 |
7 | utils.float_to_dollars(32.00)
8 |
9 | """
10 | from slugify import slugify
11 |
12 |
13 | def float_to_dollars(value:float) -> str:
14 | """
15 |     Take in a float (e.g. 32.00) and return a dollar-formatted string ("$32.00").
16 | """
17 | return f"${value:,.2f}"
18 |
19 |
20 | def dollar_str_to_float(value:str) -> float:
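21 |     # e.g. "$1,250,000.00" -> 1250000.0 (float() accepts "_" between digits)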
21 | return float(value.replace("$", "").replace(",", "_"))
22 |
23 |
24 | def group_salary(value:float) -> str:
25 |     # bucket a 0-1 ranking (e.g. a salary percentile) into rough tiers
26 |     if value > .95:
27 |         return 'top'
28 |     elif value > .50:
29 |         return 'mid'
30 |     return 'low'
30 |
31 |
32 | def to_snake_case(val):
33 | # in the future, this will be stored in
34 | # utils.py in the courses/ directory
35 | kebab_case = slugify(val)
36 | return kebab_case.replace('-', '_')
37 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | pre-commit
3 | nbstripout
4 | matplotlib
5 | numpy
6 | requests
7 | requests-html
8 | openpyxl
9 | python-slugify
10 |
--------------------------------------------------------------------------------
/start-here.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "cell_id": "00001-691e21a4-843f-476d-8cbb-9aa691d1d0d8",
7 | "deepnote_cell_type": "markdown",
8 | "tags": []
9 | },
10 | "source": [
11 | "# Welcome\n",
12 | "\n",
13 | "This is an interactive notebook. It allows us to write & run code, include Markdown-based instructions, and create visualizations.\n",
14 | "\n",
15 | "Assuming you have [launched this code on Deepnote](https://deepnote.com/launch?url=https://github.com/codingforentrepreneurs/Try-Pandas), click `Run Notebook` above.\n",
16 | "\n",
17 | "What you'll find in this project is a lot of notebooks. Each notebook should work in isolation so you can learn Pandas.\n",
18 | "\n",
19 | "Below is a very basic example of how you can both write code and allow anyone to collaborate with you. "
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "cell_id": "00000-4fa6d049-fec9-4534-a889-f8f1d2535af4",
27 | "deepnote_cell_type": "code",
28 | "deepnote_to_be_reexecuted": false,
29 | "execution_millis": 6838,
30 | "execution_start": 1633809511037,
31 | "output_cleared": false,
32 | "source_hash": "bcce9116",
33 | "tags": []
34 | },
35 | "outputs": [],
36 | "source": [
37 | "name = input(\"Type your name then press enter:\")\n",
38 | "print(f\"Nice to meet you {name}. Are you ready to get started?\")\n",
39 | "\n",
40 | "while True:\n",
41 | " number = input(\"Type a number then press enter:\")\n",
42 | " if not number.isdigit():\n",
43 | " print(\"Please enter a number\")\n",
44 | " continue\n",
45 | " number = int(number)\n",
46 | " break\n",
47 | "\n",
48 | "print(f\"I enjoy {number} as well.\")\n",
49 | "\n",
50 | "import pandas as pd\n",
51 | "import random\n",
52 | "import datetime\n",
53 | "\n",
54 | "random_number = random.randint(0, number)\n",
55 | "random_data = [{\"number\": x * random_number, \"time\": datetime.datetime.now(), \"added_by\": name} for x in range(0, number)]\n",
56 | "\n",
57 | "df = pd.DataFrame(random_data)"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {
63 | "cell_id": "00002-ce4b4c81-ddce-49be-9e43-742db288db06",
64 | "deepnote_cell_type": "markdown",
65 | "tags": []
66 | },
67 | "source": [
68 | "From your number, we have generated a random Pandas spreadsheet although it's not called a spreasheet, it's called a DataFrame. Let's take a look at the top 10 rows (if your number was > 10) in it:"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {
75 | "cell_id": "00004-82f5d27a-b832-4133-b493-2fb4857137da",
76 | "deepnote_cell_type": "code",
77 | "deepnote_to_be_reexecuted": false,
78 | "execution_millis": 23,
79 | "execution_start": 1633809520831,
80 | "output_cleared": false,
81 | "source_hash": "990bc731",
82 | "tags": []
83 | },
84 | "outputs": [],
85 | "source": [
86 | "df.head(n=10)"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {
92 | "cell_id": "00004-49f828e9-9888-4d85-b1e0-d85d40201501",
93 | "deepnote_cell_type": "markdown",
94 | "tags": []
95 | },
96 | "source": [
97 | "Above is an example of how this course will work. There are videos associated to each lesson but the goal is to allow you to run the code anytime to dive deep into what is going on.\n",
98 | "\n",
99 | "Are you ready to begin? Open `Try-Pandas/Courses/1 - Pandas & Datasets.ipynb` to start the course!"
100 | ]
101 | }
102 | ],
103 | "metadata": {
104 | "deepnote": {
105 | "is_reactive": false
106 | },
107 | "deepnote_execution_queue": [],
108 | "deepnote_notebook_id": "c9ce9944-34a5-4f65-b253-4c8d164ece75",
109 | "kernelspec": {
110 | "display_name": "Python 3 (ipykernel)",
111 | "language": "python",
112 | "name": "python3"
113 | },
114 | "language_info": {
115 | "codemirror_mode": {
116 | "name": "ipython",
117 | "version": 3
118 | },
119 | "file_extension": ".py",
120 | "mimetype": "text/x-python",
121 | "name": "python",
122 | "nbconvert_exporter": "python",
123 | "pygments_lexer": "ipython3",
124 | "version": "3.9.7"
125 | }
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 2
129 | }
130 |
--------------------------------------------------------------------------------