├── .gitignore
├── Data_Exploration
    ├── Lineup_Data_Exploration.ipynb
    ├── Lineup_Matchup_Exploration.ipynb
    ├── PlayByPlay_Data_Exploration.ipynb
    └── Player_Data_Exploration.ipynb
├── Data_Modeling
    ├── CompThePlayer.ipynb
    ├── RAPM_Model_NoPriors.ipynb
    └── Scratch
    │   ├── RegressionTest_Lineups.ipynb
    │   ├── RegressionTest_OffensiveRatings.ipynb
    │   └── RegressionTest_Players.ipynb
├── Data_Scraping
    ├── Cleanup_NBA_Player_Data.ipynb
    ├── Generate_Matchup_Data.ipynb
    ├── Scrape_BBallRefAdvanced_Stats.ipynb
    ├── Scrape_BBallRef_Stats.ipynb
    ├── Scrape_ESPN_RealPM_Stats.ipynb
    ├── Scrape_Hollinger_Stats.ipynb
    ├── Scrape_NBA_AdvancedStats_Data.ipynb
    ├── Scrape_NBA_DraftCombine_Data.ipynb
    ├── Scrape_NBA_Lineup_Stats.ipynb
    ├── Scrape_NBA_PlayType_Data.ipynb
    ├── Scrape_NBA_PlayerBios_Data.ipynb
    └── Scrape_PlayByPlay_Data.ipynb
├── README.md
├── bokeh_app
    ├── main.py
    ├── tabs
    │   ├── lineups.py
    │   ├── playbyplay.py
    │   └── players.py
    └── templates
    │   ├── index.html
    │   └── styles.css
└── img
    ├── nba_stats_explorer_curry_lineup_ex.png
    ├── nba_stats_explorer_curry_pbp_ex.png
    └── nba_stats_explorer_curry_player_ex.png


/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore data files
2 | *.json
3 | *.csv
4 | 
5 | # Ignore backup and checkpoint files
6 | .ipynb_checkpoints
7 | *.swp
8 | 


--------------------------------------------------------------------------------
/Data_Modeling/Scratch/RegressionTest_OffensiveRatings.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "toc": true
  7 |    },
  8 |    "source": [
  9 |     "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
 10 |     "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "# Import modules and packages\n",
 20 |     "import numpy as np\n",
 21 |     "import matplotlib.pyplot as plt\n",
 22 |     "import matplotlib.cm as cm\n",
 23 |     "import json\n",
 24 |     "import pandas as pd\n",
 25 |     "import seaborn as sns\n",
 26 |     "import re\n",
 27 |     "from scipy.stats import gaussian_kde\n",
 28 |     "from sklearn import metrics\n",
 29 |     "from sklearn.metrics import confusion_matrix\n",
 30 |     "from sklearn.preprocessing import StandardScaler\n",
 31 |     "from sklearn.preprocessing import LabelEncoder\n",
 32 |     "from sklearn.model_selection import train_test_split\n",
 33 |     "from sklearn.model_selection import cross_val_score\n",
 34 |     "from sklearn.model_selection import GridSearchCV\n",
 35 |     "from sklearn import svm\n",
 36 |     "from sklearn.neural_network import MLPRegressor\n",
 37 |     "from sklearn.linear_model import LinearRegression\n",
 38 |     "from sklearn.linear_model import Ridge\n",
 39 |     "from sklearn.ensemble import RandomForestClassifier\n",
 40 |     "from xgboost import XGBClassifier\n",
 41 |     "import unidecode\n",
 42 |     "import unicodedata\n",
 43 |     "%matplotlib inline"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "sns.set_style(\"whitegrid\")"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# Read in compiled NBA player data\n",
 62 |     "df_orig = pd.read_csv('../CompleteNBAPlayerStats.csv')\n",
 63 |     "print(\"Table of BBall Player Stats:\\n\\n\", df_orig)"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "# Dataframes after cutting on GP/MPG/other parameters\n",
 73 |     "df = df_orig.copy()\n",
 74 |     "df = df[df.GP > min_num_games]\n",
 75 |     "df = df[df.MPG > min_MPG]\n",
 76 |     "\n",
 77 |     "# Add some additional variables\n",
 78 |     "## To convert to \"per 36-min\" stats\n",
 79 |     "df['2PA_PG'] = df['2PA_PT']*df.MPG/36.\n",
 80 |     "df['3PA_PG'] = df['3PA_PT']*df.MPG/36.\n",
 81 |     "df['FGA_PG'] = df.FGA_PT*df.MPG/36.\n",
 82 |     "\n",
 83 |     "df[\"2PR\"] = df[\"2PA_PH\"]/df[\"FGA_PH\"]\n",
 84 |     "df[\"3PR\"] = df[\"3PA_PH\"]/df[\"FGA_PH\"]\n",
 85 |     "\n",
 86 |     "df['FG_FREQ_RIM'] = (df.FGA_RA)/df.FGA_PG # restricted area\n",
 87 |     "df['FG_FREQ_MR_AND_PT'] = (df.FGA_MR + df.FGA_NONRA)/df.FGA_PG # combined paint and midrange\n",
 88 |     "df['FG_FREQ_MR'] = (df.FGA_MR)/df.FGA_PG\n",
 89 |     "df['FG_FREQ_CORNERS'] = (df.FGA_LC + df.FGA_RC)/df.FGA_PG\n",
 90 |     "df['FG_FREQ_AB'] = df.FGA_AB/df.FGA_PG\n",
 91 |     "df['FG_FREQ_01DRIB'] = (df['FGA_0DRIB'] + df['FGA_1DRIB'])/df.FGA_PG\n",
 92 |     "df['FG_FREQ_GT1DRIB'] = (df['FGA_2DRIB'] + df['FGA_36DRIB'] + df['FGA_GT7DRIB'])/df.FGA_PG\n",
 93 |     "df['FG_FREQ_CANDS'] = df['FGA_CANDS']/df.FGA_PG\n",
 94 |     "\n",
 95 |     "df[\"FG_FREQ_RIM\"].fillna(0, inplace=True)\n",
 96 |     "df[\"FG_FREQ_MR_AND_PT\"].fillna(0, inplace=True)\n",
 97 |     "df[\"FG_FREQ_MR\"].fillna(0, inplace=True)\n",
 98 |     "df[\"FG_FREQ_CORNERS\"].fillna(0, inplace=True)\n",
 99 |     "df[\"FG_FREQ_AB\"].fillna(0, inplace=True)\n",
100 |     "df[\"FG_FREQ_01DRIB\"].fillna(0, inplace=True)\n",
101 |     "df[\"FG_FREQ_GT1DRIB\"].fillna(0, inplace=True)\n",
102 |     "df[\"FG_FREQ_CANDS\"].fillna(0, inplace=True)\n"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "# Split by number of rows per player name: veterans (>1 row) vs rookies (exactly 1 row),\n",
112 |     "# plus per-veteran medians to damp outlier seasons (numeric columns only; strings like pos are skipped)\n",
113 |     "df_vets = df[df[\"name\"].isin(df[\"name\"].value_counts()[df[\"name\"].value_counts()>1].index)]\n",
114 |     "df_rooks = df[df[\"name\"].isin(df[\"name\"].value_counts()[df[\"name\"].value_counts()==1].index)]\n",
115 |     "df_med = df_vets.groupby(\"name\").median(numeric_only=True).reset_index()\n",
116 |     "\n",
117 |     "# Dataframes by player position\n",
118 |     "# Centers\n",
119 |     "dfc = df[df['pos'].str.contains('C')]\n",
120 |     "# Forwards\n",
121 |     "dff = df[df['pos'].str.contains('F')]\n",
122 |     "# Guards\n",
123 |     "dfg = df[df['pos'].str.contains('G')]\n"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": null,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "x_train = df[df.year != 2020][['2PM_PH', '3PM_PH', 'FTM_PH', 'AST_PH', 'ORB_PH', 'DRB_PH']]\n",
133 |     "y_train = df[df.year != 2020][['OFFRTG']]\n",
134 |     "\n",
135 |     "reg = LinearRegression()\n",
136 |     "reg.fit(x_train, y_train)\n",
137 |     "\n",
138 |     "x_test = df[df.year == 2020][['2PM_PH', '3PM_PH', 'FTM_PH', 'AST_PH', 'ORB_PH', 'DRB_PH']]\n",
139 |     "y_test_tot = df[df.year == 2020].OFFRTG\n",
140 |     "y_pred_tot = reg.predict(x_test)\n",
141 |     "\n",
142 |     "#print(df[df.year == 2020].name.values[i])\n",
143 |     "#for i,name in enumerate(df[df.year == 2020].name):\n",
144 |     "#    print(name, ':', y_pred[:,0][i], df[df.year == 2020].iloc[i,:].OFFRTG)\n",
145 |     "    \n",
146 |     "print('Made', len(y_pred_tot), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test_tot, y_pred_tot)))\n"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "x_train = df[df.year != 2020][['EFGP', '3PR', 'FTM_PH', 'AST_PH', 'TOR', 'REBR']]\n",
156 |     "y_train = df[df.year != 2020][['OFFRTG']]\n",
157 |     "\n",
158 |     "reg = LinearRegression()\n",
159 |     "reg.fit(x_train, y_train)\n",
160 |     "\n",
161 |     "x_test = df[df.year == 2020][['EFGP', '3PR', 'FTM_PH', 'AST_PH', 'TOR', 'REBR']]\n",
162 |     "y_test_eff = df[df.year == 2020].OFFRTG\n",
163 |     "y_pred_eff = reg.predict(x_test)\n",
164 |     "\n",
165 |     "#print(df[df.year == 2020].name.values[i])\n",
166 |     "#for i,name in enumerate(df[df.year == 2020].name):\n",
167 |     "#    print(name, ':', y_pred[:,0][i], df[df.year == 2020].iloc[i,:].OFFRTG)\n",
168 |     "    \n",
169 |     "print('Made', len(y_pred_eff), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test_eff, y_pred_eff)))\n"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "fig, ax = plt.subplots(figsize=(10, 8))\n",
179 |     "plt.xlabel(\"Offensive Rating\")\n",
180 |     "plt.ylabel(\"Predicted Offensive Rating\")\n",
181 |     "plt.scatter(y_test_tot, y_pred_tot)\n",
182 |     "plt.scatter(y_test_eff, y_pred_eff)\n",
183 |     "#plt.scatter(y_test_eff, df[df.year == 2020].PTS_PH)\n",
184 |     "xmin, xmax = ax.get_xlim()\n",
185 |     "#ymin, ymax = ax.get_ylim()\n",
186 |     "ymin = 90\n",
187 |     "ymax = 125\n",
188 |     "plt.plot([90,130], [90,130], 'r--')\n",
189 |     "ax.set_xlim(xmin, xmax)\n",
190 |     "ax.set_ylim(ymin, ymax)\n"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "scrolled": false
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "# MLP on the current x_train/y_train/x_test (the EFGP/3PR feature set if the cells above ran in order)\n",
202 |     "mlp_reg = MLPRegressor()\n",
203 |     "mlp_reg.fit(x_train, y_train.values.ravel())\n",
204 |     "y_mlp_pred = mlp_reg.predict(x_test)\n",
205 |     "for name, pred, actual in zip(df[df.year == 2020].name, y_mlp_pred, y_test_eff):\n",
206 |     "    print(str(name) + ' has a predicted offensive rating of ' + str(pred) + ' compared to real offensive rating of ' + str(actual))\n",
207 |     "print('Made', len(y_mlp_pred), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test_eff, y_mlp_pred)))\n"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": []
216 |   }
217 |  ],
218 |  "metadata": {
219 |   "kernelspec": {
220 |    "display_name": "Python 3",
221 |    "language": "python",
222 |    "name": "python3"
223 |   },
224 |   "language_info": {
225 |    "codemirror_mode": {
226 |     "name": "ipython",
227 |     "version": 3
228 |    },
229 |    "file_extension": ".py",
230 |    "mimetype": "text/x-python",
231 |    "name": "python",
232 |    "nbconvert_exporter": "python",
233 |    "pygments_lexer": "ipython3",
234 |    "version": "3.7.3"
235 |   },
236 |   "toc": {
237 |    "base_numbering": 1,
238 |    "nav_menu": {},
239 |    "number_sections": true,
240 |    "sideBar": false,
241 |    "skip_h1_title": false,
242 |    "title_cell": "Table of Contents",
243 |    "title_sidebar": "Contents",
244 |    "toc_cell": true,
245 |    "toc_position": {},
246 |    "toc_section_display": true,
247 |    "toc_window_display": false
248 |   }
249 |  },
250 |  "nbformat": 4,
251 |  "nbformat_minor": 2
252 | }
253 | 


--------------------------------------------------------------------------------
/Data_Scraping/Cleanup_NBA_Player_Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "toc": true
  7 |    },
  8 |    "source": [
  9 |     "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
 10 |     "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Load-and-Convert-Types-for-Each-Player-Dataset\" data-toc-modified-id=\"Load-and-Convert-Types-for-Each-Player-Dataset-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Load and Convert Types for Each Player Dataset</a></span></li><li><span><a href=\"#Merge-All-Dataframes-and-Rearrange-Columns\" data-toc-modified-id=\"Merge-All-Dataframes-and-Rearrange-Columns-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Merge All Dataframes and Rearrange Columns</a></span></li><li><span><a href=\"#Create-New-Variables-and-Resolve-Infs/NaNs\" data-toc-modified-id=\"Create-New-Variables-and-Resolve-Infs/NaNs-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Create New Variables and Resolve Infs/NaNs</a></span></li></ul></div>"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "# Import modules and packages\n",
 20 |     "import numpy as np\n",
 21 |     "import matplotlib.pyplot as plt\n",
 22 |     "import matplotlib.cm as cm\n",
 23 |     "import json\n",
 24 |     "import pandas as pd\n",
 25 |     "import seaborn as sns\n",
 26 |     "from sklearn.metrics import confusion_matrix\n",
 27 |     "from sklearn.preprocessing import StandardScaler\n",
 28 |     "from sklearn.preprocessing import LabelEncoder\n",
 29 |     "from sklearn.model_selection import train_test_split\n",
 30 |     "from sklearn.model_selection import cross_val_score\n",
 31 |     "from sklearn.model_selection import GridSearchCV\n",
 32 |     "from sklearn import svm\n",
 33 |     "from sklearn.linear_model import LinearRegression\n",
 34 |     "from sklearn.ensemble import RandomForestClassifier\n",
 35 |     "from xgboost import XGBClassifier\n",
 36 |     "import unidecode\n",
 37 |     "import unicodedata\n",
 38 |     "import difflib\n",
 39 |     "from fuzzywuzzy import fuzz \n",
 40 |     "from fuzzywuzzy import process\n",
 41 |     "%matplotlib inline"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# Convert columns in place: str_list -> str, int_list -> int, every other column -> numeric (unparseable -> NaN)\n",
 51 |     "def ConvertDataFrame(df, str_list, int_list):\n",
 52 |     "    cols = df.columns.drop(str_list)\n",
 53 |     "    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')  # column-wise; row-wise (axis=1) was slow\n",
 54 |     "    for string in str_list:\n",
 55 |     "        df[string] = df[string].astype('str')\n",
 56 |     "    for integer in int_list:\n",
 57 |     "        df[integer] = df[integer].astype('int')\n",
 58 |     "    return df\n"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "# Load and Convert Types for Each Player Dataset"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 3,
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "name": "stdout",
 75 |      "output_type": "stream",
 76 |      "text": [
 77 |       "Table of NBA Player Bios Over the Last 20 Years:\n",
 78 |       "\n",
 79 |       "                  name  height  weight       college   country  \\\n",
 80 |       "0            AC Green    81.0   225.0  Oregon State       USA   \n",
 81 |       "1           AJ Guyton    73.0   180.0       Indiana       USA   \n",
 82 |       "2         Aaron McKie    77.0   209.0        Temple       USA   \n",
 83 |       "3      Aaron Williams    81.0   225.0        Xavier       USA   \n",
 84 |       "4          Adam Keefe    81.0   230.0      Stanford       USA   \n",
 85 |       "...               ...     ...     ...           ...       ...   \n",
 86 |       "9339  Vincent Poirier    84.0   235.0          None    France   \n",
 87 |       "9340    Vlatko Cancar    80.0   236.0          None  Slovenia   \n",
 88 |       "9343   Wenyen Gabriel    81.0   205.0          None     Sudan   \n",
 89 |       "9354  Zach Norvell Jr    77.0   205.0          None       USA   \n",
 90 |       "9355   Zylan Cheatham    77.0   220.0          None       USA   \n",
 91 |       "\n",
 92 |       "      actual_draft_year  draft_round  draft_number      draft nationality  \n",
 93 |       "0                1985.0          1.0          23.0    drafted    domestic  \n",
 94 |       "1                2000.0          2.0          32.0    drafted    domestic  \n",
 95 |       "2                1994.0          1.0          17.0    drafted    domestic  \n",
 96 |       "3                   NaN          NaN           NaN  undrafted    domestic  \n",
 97 |       "4                1992.0          1.0          10.0    drafted    domestic  \n",
 98 |       "...                 ...          ...           ...        ...         ...  \n",
 99 |       "9339                NaN          NaN           NaN  undrafted     foreign  \n",
100 |       "9340             2017.0          2.0          49.0    drafted     foreign  \n",
101 |       "9343                NaN          NaN           NaN  undrafted     foreign  \n",
102 |       "9354                NaN          NaN           NaN  undrafted    domestic  \n",
103 |       "9355                NaN          NaN           NaN  undrafted    domestic  \n",
104 |       "\n",
105 |       "[1934 rows x 10 columns]\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "# Grab NBA player bios (including height/weight)\n",
111 |     "df_bios = pd.read_csv('../NBAPlayerBios.csv', index_col=0)\n",
112 |     "df_bios = ConvertDataFrame(df_bios, ['name', 'college', 'country', 'draft', 'nationality'], [])\n",
113 |     "#print(df_bios[df_bios.duplicated(subset=['name'], keep='first')])\n",
114 |     "#print(df_bios[df_bios.duplicated(subset=['name'], keep='last')])\n",
115 |     "df_bios = df_bios.drop_duplicates(subset=['name'], keep=False)\n",
116 |     "df_bios['name'] = df_bios['name'].str.replace('.', '', regex=False)  # literal dot, not a regex wildcard\n",
117 |     "print(\"Table of NBA Player Bios Over the Last 20 Years:\\n\\n\", df_bios)\n"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 4,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "Table of NBA Draft Player Measurements Over the Last 20 Years:\n",
130 |       "\n",
131 |       "                         name  draft_year  wingspan\n",
132 |       "0                Malik Allen        2001     86.50\n",
133 |       "1           Harold Arceneaux        2001     80.50\n",
134 |       "2              Lamont Barnes        2001     87.50\n",
135 |       "3                Mario Bland        2001     84.00\n",
136 |       "4              Primoz Brezec        2001     86.00\n",
137 |       "...                      ...         ...       ...\n",
138 |       "1312  Quinndary Weatherspoon        2020     81.00\n",
139 |       "1313              Coby White        2020     77.00\n",
140 |       "1314             Kris Wilkes        2020     82.75\n",
141 |       "1315          Grant Williams        2020     81.75\n",
142 |       "1316           Dylan Windler        2020     82.00\n",
143 |       "\n",
144 |       "[1294 rows x 3 columns]\n"
145 |      ]
146 |     }
147 |    ],
148 |    "source": [
149 |     "# Grab physical player measurements from Draft Combine data\n",
150 |     "#df_comb = pd.read_csv('NBACombineStats.csv', usecols=['name', 'draft_year', 'height', 'weight', 'wingspan'])\n",
151 |     "df_comb = pd.read_csv('../NBACombineStats.csv', usecols=['name', 'draft_year', 'wingspan'])\n",
152 |     "df_comb = ConvertDataFrame(df_comb, ['name'], ['draft_year'])\n",
153 |     "#print(df_comb[df_comb.duplicated(subset=['name'], keep='first')])\n",
154 |     "#print(df_comb[df_comb.duplicated(subset=['name'], keep='last')])\n",
155 |     "df_comb = df_comb.drop_duplicates(subset=['name'], keep='last')\n",
156 |     "df_comb['name'] = df_comb['name'].str.replace('.', '', regex=False)  # literal dot, not a regex wildcard\n",
157 |     "print(\"Table of NBA Draft Player Measurements Over the Last 20 Years:\\n\\n\", df_comb)\n"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 5,
163 |    "metadata": {
164 |     "scrolled": false
165 |    },
166 |    "outputs": [
167 |     {
168 |      "name": "stdout",
169 |      "output_type": "stream",
170 |      "text": [
171 |       "Table of basic and advanced player stats:\n",
172 |       "\n",
173 |       "                     name  year   age     W     L       WLR  PTS_PT  FGM_PT  \\\n",
174 |       "0             AJ Hammons  2017  24.0   4.0  18.0  0.222222    10.6     3.7   \n",
175 |       "1               AJ Price  2015  28.0  11.0  15.0  0.733333    14.8     5.7   \n",
176 |       "2           Aaron Brooks  2015  30.0  50.0  32.0  1.562500    18.2     6.6   \n",
177 |       "3           Aaron Brooks  2016  31.0  36.0  33.0  1.090909    16.0     6.1   \n",
178 |       "4           Aaron Brooks  2017  32.0  36.0  29.0  1.241379    13.0     4.9   \n",
179 |       "...                  ...   ...   ...   ...   ...       ...     ...     ...   \n",
180 |       "3003             Zhou Qi  2019  23.0   0.0   1.0  0.000000    75.4    37.7   \n",
181 |       "3004        Zoran Dragic  2015  26.0   6.0  10.0  0.600000    13.4     5.3   \n",
182 |       "3005      Zylan Cheatham  2020  24.0   1.0   1.0  1.000000     3.6     1.8   \n",
183 |       "3006  Antonius Cleveland  2020   NaN   NaN   NaN       NaN     NaN     NaN   \n",
184 |       "3007            Kyle Guy  2020   NaN   NaN   NaN       NaN     NaN     NaN   \n",
185 |       "\n",
186 |       "      FGA_PT  FGP_PT  ...  CHARGE_DRAWN_PT  CONTESTS_2PT_PT  CONTESTS_3PT_PT  \\\n",
187 |       "0        9.3   0.405  ...              0.0              9.7              1.8   \n",
188 |       "1       15.2   0.372  ...              NaN              NaN              NaN   \n",
189 |       "2       15.6   0.421  ...              NaN              NaN              NaN   \n",
190 |       "3       15.2   0.401  ...              NaN              NaN              NaN   \n",
191 |       "4       12.1   0.403  ...              0.3              4.1              4.1   \n",
192 |       "...      ...     ...  ...              ...              ...              ...   \n",
193 |       "3003    37.7   1.000  ...              0.0              0.0              0.0   \n",
194 |       "3004    14.4   0.367  ...              NaN              NaN              NaN   \n",
195 |       "3005     7.3   0.250  ...              0.0             10.8              3.6   \n",
196 |       "3006     NaN     NaN  ...              NaN              NaN              NaN   \n",
197 |       "3007     NaN     NaN  ...              NaN              NaN              NaN   \n",
198 |       "\n",
199 |       "      CONTESTS_PT  2PM_PH  2PA_PH    2PP_PH  2PM_PT  2PA_PT    2PP_PT  \n",
200 |       "0            11.5     3.7     9.7  0.381443     2.6     7.1  0.366197  \n",
201 |       "1             NaN     5.7    12.8  0.445312     4.0     8.9  0.449438  \n",
202 |       "2             NaN     6.0    13.6  0.441176     4.3     9.6  0.447917  \n",
203 |       "3             NaN     5.5    12.7  0.433071     4.0     9.2  0.434783  \n",
204 |       "4             8.1     4.0     9.5  0.421053     3.0     6.9  0.434783  \n",
205 |       "...           ...     ...     ...       ...     ...     ...       ...  \n",
206 |       "3003          0.0    33.3    33.3  1.000000    37.7    37.7  1.000000  \n",
207 |       "3004          NaN     5.1    10.2  0.500000     3.9     7.7  0.506494  \n",
208 |       "3005         14.4     2.6     7.9  0.329114     1.8     5.5  0.327273  \n",
209 |       "3006          NaN     NaN     NaN       NaN     NaN     NaN       NaN  \n",
210 |       "3007          NaN     NaN     NaN       NaN     NaN     NaN       NaN  \n",
211 |       "\n",
212 |       "[3006 rows x 416 columns]\n"
213 |      ]
214 |     }
215 |    ],
216 |    "source": [
217 |     "# Grab player basic and advanced stats from NBA.com\n",
218 |     "df1 = pd.read_csv('../NBAAdvancedStats.csv', index_col=0)\n",
219 |     "df1 = ConvertDataFrame(df1, ['name', 'red_pos'], ['year'])\n",
220 |     "#print(df1[df1.duplicated(subset=['name', 'year'], keep='first')])\n",
221 |     "#print(df1[df1.duplicated(subset=['name', 'year'], keep='last')])\n",
222 |     "df1 = df1.drop_duplicates(subset=['name', 'year'], keep='last')\n",
223 |     "#df1 = df1.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n",
224 |     "\n",
225 |     "# Add useful additional variables, particularly\n",
226 |     "# 2-point shots and win-to-loss ratio\n",
227 |     "df1.insert(df1.columns.get_loc('L')+1, 'WLR', df1['W']/df1['L'])\n",
228 |     "df1[\"2PM_PH\"] = df1[\"FGM_PH\"] - df1[\"3PM_PH\"]\n",
229 |     "df1[\"2PA_PH\"] = df1[\"FGA_PH\"] - df1[\"3PA_PH\"]\n",
230 |     "df1[\"2PP_PH\"] = df1[\"2PM_PH\"]/df1[\"2PA_PH\"]\n",
231 |     "df1[\"2PM_PT\"] = df1[\"FGM_PT\"] - df1[\"3PM_PT\"]\n",
232 |     "df1[\"2PA_PT\"] = df1[\"FGA_PT\"] - df1[\"3PA_PT\"]\n",
233 |     "df1[\"2PP_PT\"] = df1[\"2PM_PT\"]/df1[\"2PA_PT\"]\n",
234 |     "\n",
235 |     "df1['name'] = df1['name'].str.replace('.', '', regex=False)  # literal dot, not a regex wildcard\n",
236 |     "\n",
237 |     "print(\"Table of basic and advanced player stats:\\n\\n\", df1)\n",
238 |     "#print(df1.red_pos)\n",
239 |     "#print(df1[df1.red_pos == 'nan'])\n"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 6,
245 |    "metadata": {
246 |     "scrolled": false
247 |    },
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "Table of basic BBall Reference stats:\n",
254 |       "\n",
255 |       "                     name  year pos team    GP    GS\n",
256 |       "0             AJ Hammons  2017   C  DAL  22.0   0.0\n",
257 |       "1               AJ Price  2010  PG  IND  56.0   2.0\n",
258 |       "2               AJ Price  2011  PG  IND  50.0   0.0\n",
259 |       "3               AJ Price  2012  PG  IND  44.0   1.0\n",
260 |       "4               AJ Price  2013  PG  WAS  57.0  22.0\n",
261 |       "...                  ...   ...  ..  ...   ...   ...\n",
262 |       "4550        Zhaire Smith  2019  SG  PHI   6.0   2.0\n",
263 |       "4551             Zhou Qi  2018   C  HOU  18.0   0.0\n",
264 |       "4552        Zoran Dragic  2015  SG  TOT  16.0   1.0\n",
265 |       "4553  Zydrunas Ilgauskas  2010   C  CLE  64.0   6.0\n",
266 |       "4554  Zydrunas Ilgauskas  2011   C  MIA  72.0  51.0\n",
267 |       "\n",
268 |       "[4555 rows x 6 columns]\n"
269 |      ]
270 |     }
271 |    ],
272 |    "source": [
273 |     "# Grab basic player stats from BBall Reference\n",
274 |     "with open('../BBallRefStats.json') as f:\n",
275 |     "    json_data = json.load(f)\n",
276 |     "\n",
277 |     "df2 = pd.DataFrame(data=json_data, dtype=float)\n",
278 |     "df2['name'] = df2['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n",
279 |     "df2.name = df2.name.astype(str)\n",
280 |     "df2.pos = df2.pos.astype(str)\n",
281 |     "df2.team = df2.team.astype(str)\n",
282 |     "df2.year = df2.year.astype(int)\n",
283 |     "#df2 = df2.drop_duplicates(subset=['name', 'year'], keep=False)\n",
284 |     "df2 = df2.groupby(['name', 'year', 'pos', 'team']).mean(numeric_only=True).reset_index()\n",
285 |     "df2['name'] = df2['name'].str.replace('.', '', regex=False)  # literal dot, not a regex wildcard\n",
286 |     "print(\"Table of basic BBall Reference stats:\\n\\n\", df2)\n"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": 7,
292 |    "metadata": {
293 |     "scrolled": false
294 |    },
295 |    "outputs": [
296 |     {
297 |      "name": "stdout",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "Table of advanced BBall Reference stats:\n",
301 |       "\n",
302 |       "                     name  year    FTR  OWS  DWS   WS   WS48  OBPM  DBPM  BPM  \\\n",
303 |       "0             AJ Hammons  2017  0.476 -0.2  0.2  0.0 -0.001  -7.5   2.0 -5.6   \n",
304 |       "1               AJ Price  2010  0.212  0.4  0.8  1.2  0.065   0.2  -2.0 -1.8   \n",
305 |       "2               AJ Price  2011  0.253 -0.4  0.7  0.3  0.020  -1.1  -2.3 -3.4   \n",
306 |       "3               AJ Price  2012  0.201  0.2  0.5  0.7  0.063  -0.2  -1.7 -1.9   \n",
307 |       "4               AJ Price  2013  0.150  1.0  1.2  2.2  0.084  -0.1  -1.7 -1.8   \n",
308 |       "...                  ...   ...    ...  ...  ...  ...    ...   ...   ...  ...   \n",
309 |       "5271             Zhou Qi  2019  0.000  0.0  0.0  0.0  1.261  22.1 -12.6  9.5   \n",
310 |       "5272        Zoran Dragic  2015  0.167 -0.1  0.0 -0.1 -0.042  -2.5  -4.0 -6.5   \n",
311 |       "5273  Zydrunas Ilgauskas  2010  0.231  0.5  2.0  2.5  0.088  -3.3   0.2 -3.2   \n",
312 |       "5274  Zydrunas Ilgauskas  2011  0.144  1.0  1.9  2.9  0.122  -2.6   1.0 -1.5   \n",
313 |       "5275      Zylan Cheatham  2020  0.000 -0.1  0.0 -0.1 -0.091  -8.8   0.2 -8.6   \n",
314 |       "\n",
315 |       "      VORP  \n",
316 |       "0     -0.1  \n",
317 |       "1      0.0  \n",
318 |       "2     -0.3  \n",
319 |       "3      0.0  \n",
320 |       "4      0.1  \n",
321 |       "...    ...  \n",
322 |       "5271   0.0  \n",
323 |       "5272  -0.1  \n",
324 |       "5273  -0.4  \n",
325 |       "5274   0.1  \n",
326 |       "5275  -0.1  \n",
327 |       "\n",
328 |       "[5276 rows x 11 columns]\n"
329 |      ]
330 |     }
331 |    ],
332 |    "source": [
333 |     "# Grab advanced player stats from BBall Reference\n",
334 |     "with open('../BBallRefAdvancedStats.json') as f:\n",
335 |     "    json_data = json.load(f)\n",
336 |     "\n",
337 |     "df3 = pd.DataFrame(data=json_data, dtype=float)\n",
338 |     "df3['name'] = df3['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n",
339 |     "df3.name = df3.name.astype(str)\n",
340 |     "df3.year = df3.year.astype(int)\n",
341 |     "#df3 = df3.drop_duplicates(subset=['name', 'year'], keep=False)\n",
342 |     "df3 = df3.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n",
343 |     "df3['name'] = df3['name'].str.replace('.', '', regex=False)  # literal dot, not a regex wildcard\n",
344 |     "print(\"Table of advanced BBall Reference stats:\\n\\n\", df3)\n"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": 8,
350 |    "metadata": {},
351 |    "outputs": [
352 |     {
353 |      "name": "stdout",
354 |      "output_type": "stream",
355 |      "text": [
356 |       "Table of advanced Hollinger stats:\n",
357 |       "\n",
358 |       "                     name  year   MPG     TS  ASTR   TOR   USG   ORR   DRR  \\\n",
359 |       "0               AJ Price  2010  15.4  0.530  19.2  10.7  21.3   1.5   9.7   \n",
360 |       "1               AJ Price  2011  15.9  0.454  21.4  10.2  21.8   2.3   7.8   \n",
361 |       "2               AJ Price  2012  12.9  0.454  28.0  10.4  17.7   2.6   9.4   \n",
362 |       "3               AJ Price  2013  22.4  0.501  28.9   9.0  17.9   1.7   8.2   \n",
363 |       "4               AJ Price  2014   3.5  0.469  19.4  10.5  22.7   1.1  10.2   \n",
364 |       "...                  ...   ...   ...    ...   ...   ...   ...   ...   ...   \n",
365 |       "5320             Zhou Qi  2019   1.0  1.000   0.0   0.0  40.8   0.0   0.0   \n",
366 |       "5321        Zoran Dragic  2015   4.7  0.435  11.8  11.8  19.5   8.0   4.7   \n",
367 |       "5322  Zydrunas Ilgauskas  2010  20.9  0.491   8.1  10.6  17.1  10.7  19.6   \n",
368 |       "5323  Zydrunas Ilgauskas  2011  15.9  0.531   6.2  12.5  14.2  11.9  17.5   \n",
369 |       "5324      Zylan Cheatham  2020  10.3  0.400  20.0  30.0  10.9   3.3  17.5   \n",
370 |       "\n",
371 |       "      REBR    PER    VA  EWA       ATR      ODRR  \n",
372 |       "0      5.6  14.06  42.1  1.4  1.794393  0.154639  \n",
373 |       "1      5.0  10.74  -0.7  0.0  2.098039  0.294872  \n",
374 |       "2      6.0  11.54   6.2  0.2  2.692308  0.276596  \n",
375 |       "3      4.9  12.45  27.7  0.9  3.211111  0.207317  \n",
376 |       "4      5.5   9.72   0.0  0.0  1.847619  0.107843  \n",
377 |       "...    ...    ...   ...  ...       ...       ...  \n",
378 |       "5320   0.0  80.61   0.0  0.0       NaN       NaN  \n",
379 |       "5321   6.3   8.21  -2.6 -0.1  1.000000  1.702128  \n",
380 |       "5322  15.3  11.99  27.8  0.9  0.764151  0.545918  \n",
381 |       "5323  14.9  12.84  38.3  1.3  0.496000  0.680000  \n",
382 |       "5324  10.2   0.23  -4.8 -0.2  0.666667  0.188571  \n",
383 |       "\n",
384 |       "[5325 rows x 15 columns]\n"
385 |      ]
386 |     }
387 |    ],
388 |    "source": [
389 |     "# Grab advanced Hollinger player stats\n",
390 |     "with open('../Hollinger.json') as f:\n",
391 |     "    json_data = json.load(f)\n",
392 |     "\n",
393 |     "df4 = pd.DataFrame(data=json_data, dtype=float)\n",
394 |     "df4['name'] = df4['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n",
395 |     "df4.name = df4.name.astype(str)\n",
396 |     "df4.year = df4.year.astype(int)\n",
397 |     "df4 = df4.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n",
398 |     "\n",
399 |     "# Create variables for assist-to-turnover ratio\n",
400 |     "# and offensive-to-defensive rebounding ratio\n",
401 |     "df4[\"ATR\"] = df4[\"ASTR\"]/df4[\"TOR\"]\n",
402 |     "df4[\"ODRR\"] = df4[\"ORR\"]/df4[\"DRR\"]\n",
403 |     "\n",
404 |     "df4['name'] = df4['name'].str.replace('.', '')\n",
405 |     "\n",
406 |     "print(\"Table of advanced Hollinger stats:\\n\\n\", df4)\n"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 9,
412 |    "metadata": {
413 |     "scrolled": false
414 |    },
415 |    "outputs": [
416 |     {
417 |      "name": "stdout",
418 |      "output_type": "stream",
419 |      "text": [
420 |       "Table of play-type stats:\n",
421 |       "\n",
422 |       "                   name  year  OFF_POSS_TR  OFF_FREQ_TR  OFF_PPP_TR  \\\n",
423 |       "0         Aaron Brooks  2016          0.9        0.104        0.95   \n",
424 |       "1         Aaron Brooks  2017          0.6        0.102        0.72   \n",
425 |       "2         Aaron Gordon  2016          1.6        0.174        1.08   \n",
426 |       "3         Aaron Gordon  2017          2.6        0.199        1.03   \n",
427 |       "4         Aaron Gordon  2018          3.7        0.201        0.96   \n",
428 |       "...                ...   ...          ...          ...         ...   \n",
429 |       "2028    Devin Robinson  2019          0.0        0.000        0.00   \n",
430 |       "2029  Jared Cunningham  2016          0.0        0.000        0.00   \n",
431 |       "2030     Jarrod Uthoff  2017          0.0        0.000        0.00   \n",
432 |       "2031    Alex Stepheson  2016          0.0        0.000        0.00   \n",
433 |       "2032       Jack Cooley  2018          0.0        0.000        0.00   \n",
434 |       "\n",
435 |       "      OFF_PTS_TR  OFF_FGM_TR  OFF_FGA_TR  OFF_FGP_TR  OFF_EFGP_TR  ...  \\\n",
436 |       "0            0.8         0.4         0.7       0.392        0.520  ...   \n",
437 |       "1            0.4         0.3         0.4       0.333        0.463  ...   \n",
438 |       "2            1.8         0.5         1.1       0.558        0.581  ...   \n",
439 |       "3            2.7         0.9         1.9       0.536        0.565  ...   \n",
440 |       "4            3.6         1.5         2.8       0.473        0.527  ...   \n",
441 |       "...          ...         ...         ...         ...          ...  ...   \n",
442 |       "2028         0.0         0.0         0.0       0.000        0.000  ...   \n",
443 |       "2029         0.0         0.0         0.0       0.000        0.000  ...   \n",
444 |       "2030         0.0         0.0         0.0       0.000        0.000  ...   \n",
445 |       "2031         0.0         0.0         0.0       0.000        0.000  ...   \n",
446 |       "2032         0.0         0.0         0.0       0.000        0.000  ...   \n",
447 |       "\n",
448 |       "      OFF_FT_FREQ_MISC  OFF_TO_FREQ_MISC  OFF_SF_FREQ_MISC  \\\n",
449 |       "0                0.207             0.379             0.034   \n",
450 |       "1                0.095             0.381             0.000   \n",
451 |       "2                0.283             0.358             0.057   \n",
452 |       "3                0.195             0.415             0.049   \n",
453 |       "4                0.195             0.488             0.098   \n",
454 |       "...                ...               ...               ...   \n",
455 |       "2028             0.000             0.000             0.000   \n",
456 |       "2029             0.000             0.000             0.000   \n",
457 |       "2030             0.000             0.000             0.000   \n",
458 |       "2031             0.000             0.000             0.000   \n",
459 |       "2032             0.000             0.000             0.000   \n",
460 |       "\n",
461 |       "      OFF_AND1_FREQ_MISC  OFF_SCORE_FREQ_MISC  OFF_PERC_MISC  AVG_OPP_FGP  \\\n",
462 |       "0                   0.00                0.276          0.311     0.472429   \n",
463 |       "1                   0.00                0.190          0.382     0.394286   \n",
464 |       "2                   0.02                0.377          0.729     0.365571   \n",
465 |       "3                   0.00                0.366          0.755     0.406857   \n",
466 |       "4                   0.07                0.268          0.547     0.366429   \n",
467 |       "...                  ...                  ...            ...          ...   \n",
468 |       "2028                0.00                0.000          0.000     0.033000   \n",
469 |       "2029                0.00                0.000          0.000     0.031429   \n",
470 |       "2030                0.00                0.000          0.000     0.081429   \n",
471 |       "2031                0.00                0.000          0.000     0.000000   \n",
472 |       "2032                0.00                0.000          0.000     0.000000   \n",
473 |       "\n",
474 |       "      WAVG_OPP_FGP  AVG_OPP_PPH  WAVG_OPP_PPH  \n",
475 |       "0         0.397859     1.082857     90.242180  \n",
476 |       "1         0.431259     0.894286     99.282497  \n",
477 |       "2         0.362971     0.842857     83.828897  \n",
478 |       "3         0.394403     0.902857     87.363439  \n",
479 |       "4         0.394326     0.781429     84.444848  \n",
480 |       "...            ...          ...           ...  \n",
481 |       "2028      0.231000     0.104286     73.000000  \n",
482 |       "2029      0.220000     0.071429     50.000000  \n",
483 |       "2030      0.570000     0.224286    157.000000  \n",
484 |       "2031      0.000000     0.000000      0.000000  \n",
485 |       "2032      0.000000     0.000000      0.000000  \n",
486 |       "\n",
487 |       "[2033 rows x 258 columns]\n"
488 |      ]
489 |     },
490 |     {
491 |      "name": "stderr",
492 |      "output_type": "stream",
493 |      "text": [
494 |       "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in true_divide\n",
495 |       "  \n",
496 |       "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:18: RuntimeWarning: invalid value encountered in true_divide\n"
497 |      ]
498 |     }
499 |    ],
500 |    "source": [
501 |     "# Grab player play-type stats from NBA.com\n",
502 |     "df5 = pd.read_csv('../NBAPlayTypeStats.csv', index_col=0)\n",
503 |     "df5 = ConvertDataFrame(df5, ['name'], ['year'])\n",
504 |     "#print(df5[df5.duplicated(subset=['name', 'year'], keep='first')])\n",
505 |     "#print(df5[df5.duplicated(subset=['name', 'year'], keep='last')])\n",
506 |     "\n",
507 |     "# Create new defensive variables from existing play-type data\n",
508 |     "def_fgp_cols = [col for col in df5.columns if 'DEF_FGP_' in col]\n",
509 |     "def_freq_cols = [col for col in df5.columns if 'DEF_FREQ_' in col]\n",
510 |     "def_pph_cols = [col for col in df5.columns if 'DEF_PPP_' in col]\n",
511 |     "# Mean field goal percentage using mean of all play-type columns\n",
512 |     "df5['AVG_OPP_FGP'] = df5[def_fgp_cols].mean(axis=1)\n",
513 |     "# Weighted average which takes into account relative frequency of defensive play type\n",
514 |     "df5['WAVG_OPP_FGP'] = (df5[def_fgp_cols].values*df5[def_freq_cols].values).sum(axis=1)/df5[def_freq_cols].values.sum(axis=1)\n",
515 |     "# Mean points per possession (not scaled to 100) using mean of all DEF_PPP_ play-type columns\n",
516 |     "df5['AVG_OPP_PPH'] = df5[def_pph_cols].mean(axis=1)\n",
517 |     "# Weighted average PPH which takes into account relative frequency of defensive play type\n",
518 |     "df5['WAVG_OPP_PPH'] = (df5[def_pph_cols].values*100.*df5[def_freq_cols].values).sum(axis=1)/df5[def_freq_cols].values.sum(axis=1)\n",
519 |     "\n",
520 |     "# Replace NaNs with zeros\n",
521 |     "df5 = df5.fillna(0)\n",
522 |     "\n",
523 |     "df5['name'] = df5['name'].str.replace('.', '')\n",
524 |     "\n",
525 |     "print(\"Table of play-type stats:\\n\\n\", df5)\n"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "code",
530 |    "execution_count": 10,
531 |    "metadata": {},
532 |    "outputs": [
533 |     {
534 |      "name": "stdout",
535 |      "output_type": "stream",
536 |      "text": [
537 |       "Table of ESPN real plus-minus stats:\n",
538 |       "\n",
539 |       "                 name  year  ORPM  DRPM   RPM  RPM_WINS\n",
540 |       "0           AJ Price  2014  0.03 -0.12 -0.09      0.14\n",
541 |       "1           AJ Price  2015 -0.75 -2.42 -3.17     -0.13\n",
542 |       "2         AJ Hammons  2017 -2.77  1.27 -1.50      0.16\n",
543 |       "3       Aaron Brooks  2014  0.70 -3.84 -3.14     -0.79\n",
544 |       "4       Aaron Brooks  2015  1.25 -2.33 -1.08      1.46\n",
545 |       "...              ...   ...   ...   ...   ...       ...\n",
546 |       "3218   Zaza Pachulia  2019 -2.87  3.06  0.19      1.76\n",
547 |       "3219    Zhaire Smith  2019 -1.51 -0.58 -2.09      0.07\n",
548 |       "3220         Zhou Qi  2019 -1.87  0.90 -0.97      0.00\n",
549 |       "3221    Zoran Dragic  2015 -1.52 -1.92 -3.44     -0.05\n",
550 |       "3222  Zylan Cheatham  2020 -1.04 -0.74 -1.77     -0.02\n",
551 |       "\n",
552 |       "[3223 rows x 6 columns]\n"
553 |      ]
554 |     }
555 |    ],
556 |    "source": [
557 |     "# Grab advanced real plus-minus stats from ESPN\n",
558 |     "with open('../ESPN_RealPM.json') as f:\n",
559 |     "    json_data = json.load(f)\n",
560 |     "\n",
561 |     "df6 = pd.DataFrame(data=json_data, dtype=float)\n",
562 |     "df6['name'] = df6['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n",
563 |     "df6.name = df6.name.astype(str)\n",
564 |     "df6.year = df6.year.astype(int)\n",
565 |     "df6 = df6.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n",
566 |     "\n",
567 |     "df6['name'] = df6['name'].str.replace('.', '')\n",
568 |     "\n",
569 |     "print(\"Table of ESPN real plus-minus stats:\\n\\n\", df6)\n"
570 |    ]
571 |   },
572 |   {
573 |    "cell_type": "markdown",
574 |    "metadata": {},
575 |    "source": [
576 |     "# Merge All Dataframes and Rearrange Columns"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "code",
581 |    "execution_count": 11,
582 |    "metadata": {
583 |     "scrolled": false
584 |    },
585 |    "outputs": [],
586 |    "source": [
587 |     "# Merge all dataframes\n",
588 |     "df = pd.merge(df_bios, df_comb, on=['name'], how='outer')\n",
589 |     "df = pd.merge(df, df1, on=['name'])\n",
590 |     "df = pd.merge(df, df2, on=['name', 'year'])\n",
591 |     "cols_to_move = ['pos', 'team', 'year', 'GP', 'GS']\n",
592 |     "#col_to_move_after = 'wingspan' # only relevant for players from draft combine\n",
593 |     "col_to_move_after = 'name'\n",
594 |     "befbef_cols = [c for c in df if df.columns.get_loc(c)<=df.columns.get_loc(col_to_move_after) and c not in cols_to_move]\n",
595 |     "before_cols = [c for c in df if df.columns.get_loc(c)>df.columns.get_loc(col_to_move_after) and df.columns.get_loc(c)<df.columns.get_loc(cols_to_move[0]) and c not in cols_to_move]\n",
596 |     "after_cols = [c for c in df if df.columns.get_loc(c)>df.columns.get_loc(cols_to_move[-1]) and c not in cols_to_move]\n",
597 |     "df = df[befbef_cols+cols_to_move+before_cols+after_cols]\n",
598 |     "df = pd.merge(df, df3, on=['name', 'year'])\n",
599 |     "df = pd.merge(df, df4, on=['name', 'year'])\n",
600 |     "df = pd.merge(df, df5, on=['name', 'year'])\n",
601 |     "df = pd.merge(df, df6, on=['name', 'year'])\n"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "markdown",
606 |    "metadata": {},
607 |    "source": [
608 |     "# Create New Variables and Resolve Infs/NaNs"
609 |    ]
610 |   },
611 |   {
612 |    "cell_type": "code",
613 |    "execution_count": 12,
614 |    "metadata": {},
615 |    "outputs": [],
616 |    "source": [
617 |     "# Column modifications\n",
618 |     "df = df.replace([np.inf, -np.inf], np.nan)\n",
619 |     "\n",
620 |     "# Convert some per-36 min stats into per-game stats\n",
621 |     "df['2PA_PG'] = df['2PA_PT']*df.MPG/36.\n",
622 |     "df['3PA_PG'] = df['3PA_PT']*df.MPG/36.\n",
623 |     "df['FGA_PG'] = df.FGA_PT*df.MPG/36.\n",
624 |     "###\n",
625 |     "df[\"2PR\"] = df[\"2PA_PH\"]/df[\"FGA_PH\"]\n",
626 |     "df[\"3PR\"] = df[\"3PA_PH\"]/df[\"FGA_PH\"]\n",
627 |     "\n",
628 |     "# Create new variables for frequency of various shot types\n",
629 |     "df['FG_FREQ_RIM'] = (df.FGA_RA)/df.FGA_PG # restricted area\n",
630 |     "df['FG_FREQ_MR_AND_PT'] = (df.FGA_MR + df.FGA_NONRA)/df.FGA_PG # combined paint and midrange\n",
631 |     "df['FG_FREQ_MR'] = df.FGA_MR/df.FGA_PG\n",
632 |     "df['FG_FREQ_CORNERS'] = (df.FGA_LC + df.FGA_RC)/df.FGA_PG\n",
633 |     "df['FG_FREQ_AB'] = df.FGA_AB/df.FGA_PG\n",
634 |     "df['FG_FREQ_01DRIB'] = (df['FGA_0DRIB'] + df['FGA_1DRIB'])/df.FGA_PG\n",
635 |     "df['FG_FREQ_GT1DRIB'] = (df['FGA_2DRIB'] + df['FGA_36DRIB'] + df['FGA_GT7DRIB'])/df.FGA_PG\n",
636 |     "df['FG_FREQ_CANDS'] = df['FGA_CANDS']/df.FGA_PG\n",
637 |     "\n",
638 |     "df['FG_FREQ_05FT'] = (df.FGA_05FT)/df.FGA_PG\n",
639 |     "df['FG_FREQ_59FT'] = (df.FGA_59FT)/df.FGA_PG\n",
640 |     "df['FG_FREQ_1014FT'] = (df.FGA_1014FT)/df.FGA_PG\n",
641 |     "df['FG_FREQ_1519FT'] = (df.FGA_1519FT)/df.FGA_PG\n",
642 |     "df['FG_FREQ_2024FT'] = (df.FGA_2024FT)/df.FGA_PG\n",
643 |     "df['FG_FREQ_GT24FT'] = (df.FGA_GT24FT)/df.FGA_PG\n",
644 |     "\n",
645 |     "# Replace NaN values with zeros\n",
646 |     "df[\"ATR\"].fillna(0, inplace=True)\n",
647 |     "df[\"ODRR\"].fillna(0, inplace=True)\n",
648 |     "df[\"2PP_PH\"].fillna(0, inplace=True)\n",
649 |     "df[\"2PP_PT\"].fillna(0, inplace=True)\n",
650 |     "df[\"2PR\"].fillna(0, inplace=True)\n",
651 |     "df[\"3PR\"].fillna(0, inplace=True)\n",
652 |     "df[\"WAVG_OPP_FGP\"].fillna(0, inplace=True)\n",
653 |     "df[\"FG_FREQ_RIM\"].fillna(0, inplace=True)\n",
654 |     "df[\"FG_FREQ_MR_AND_PT\"].fillna(0, inplace=True)\n",
655 |     "df[\"FG_FREQ_MR\"].fillna(0, inplace=True)\n",
656 |     "df[\"FG_FREQ_CORNERS\"].fillna(0, inplace=True)\n",
657 |     "df[\"FG_FREQ_AB\"].fillna(0, inplace=True)\n",
658 |     "df[\"FG_FREQ_01DRIB\"].fillna(0, inplace=True)\n",
659 |     "df[\"FG_FREQ_GT1DRIB\"].fillna(0, inplace=True)\n",
660 |     "df[\"FG_FREQ_CANDS\"].fillna(0, inplace=True)\n",
661 |     "df['FG_FREQ_05FT'].fillna(0, inplace=True)\n",
662 |     "df['FG_FREQ_59FT'].fillna(0, inplace=True)\n",
663 |     "df['FG_FREQ_1014FT'].fillna(0, inplace=True)\n",
664 |     "df['FG_FREQ_1519FT'].fillna(0, inplace=True)\n",
665 |     "df['FG_FREQ_2024FT'].fillna(0, inplace=True)\n",
666 |     "df['FG_FREQ_GT24FT'].fillna(0, inplace=True)\n",
667 |     "\n",
668 |     "# Make team name acronyms match between BBall Reference and NBA.com\n",
669 |     "df['team'] = df['team'].replace('BRK', 'BKN')\n",
670 |     "df['team'] = df['team'].replace('CHO', 'CHA')\n",
671 |     "df['team'] = df['team'].replace('PHO', 'PHX')\n"
672 |    ]
673 |   },
674 |   {
675 |    "cell_type": "code",
676 |    "execution_count": 13,
677 |    "metadata": {},
678 |    "outputs": [
679 |     {
680 |      "name": "stdout",
681 |      "output_type": "stream",
682 |      "text": [
683 |       "Combined Table of BBall Player Stats:\n",
684 |       "\n",
685 |       "                  name pos team  year    GP    GS  height  weight   college  \\\n",
686 |       "0        Andre Miller  PG  TOT  2016  39.0   4.0    74.0   200.0      Utah   \n",
687 |       "1       Dirk Nowitzki  PF  DAL  2016  75.0  75.0    83.0   237.0      None   \n",
688 |       "2       Dirk Nowitzki  PF  DAL  2017  54.0  54.0    83.0   237.0      None   \n",
689 |       "3       Dirk Nowitzki   C  DAL  2018  77.0  77.0    83.0   237.0      None   \n",
690 |       "4       Dirk Nowitzki  PF  DAL  2019  51.0  20.0    83.0   237.0      None   \n",
691 |       "...               ...  ..  ...   ...   ...   ...     ...     ...       ...   \n",
692 |       "1742    Rui Hachimura  PF  WAS  2020  25.0  25.0    80.0   230.0   Gonzaga   \n",
693 |       "1743  Sekou Doumbouya  SF  DET  2020  14.0   7.0    80.0   230.0      None   \n",
694 |       "1744    Terence Davis  SG  TOR  2020  39.0   1.0    76.0   201.0      None   \n",
695 |       "1745        Ty Jerome  PG  PHX  2020  10.0   0.0    77.0   195.0  Virginia   \n",
696 |       "1746      Tyler Herro  SG  MIA  2020  38.0   3.0    77.0   195.0  Kentucky   \n",
697 |       "\n",
698 |       "      country  ...  FG_FREQ_AB  FG_FREQ_01DRIB  FG_FREQ_GT1DRIB FG_FREQ_CANDS  \\\n",
699 |       "0         USA  ...    0.000000        0.000000         0.000000      0.073514   \n",
700 |       "1     Germany  ...    0.033812        0.000000         0.000000      0.540997   \n",
701 |       "2     Germany  ...    0.023923        0.853270         0.000000      0.606061   \n",
702 |       "3     Germany  ...    0.030364        0.890688         0.000000      0.668016   \n",
703 |       "4     Germany  ...    0.054299        0.923077         0.000000      0.733032   \n",
704 |       "...       ...  ...         ...             ...              ...           ...   \n",
705 |       "1742    Japan  ...    0.057534        0.682192         0.304110      0.230137   \n",
706 |       "1743   Guinea  ...    0.196253        0.642284         0.000000      0.303301   \n",
707 |       "1744      USA  ...    0.145653        0.710059         0.291306      0.473373   \n",
708 |       "1745      USA  ...    0.020353        0.508820         0.488467      0.386703   \n",
709 |       "1746      USA  ...    0.076567        0.467908         0.535967      0.289252   \n",
710 |       "\n",
711 |       "     FG_FREQ_05FT  FG_FREQ_59FT  FG_FREQ_1014FT  FG_FREQ_1519FT  \\\n",
712 |       "0        0.514601      0.110272        0.183786        0.110272   \n",
713 |       "1        0.087912      0.054100        0.202874        0.290786   \n",
714 |       "2        0.071770      0.047847        0.151515        0.334928   \n",
715 |       "3        0.050607      0.020243        0.172065        0.222672   \n",
716 |       "4        0.013575      0.054299        0.122172        0.190045   \n",
717 |       "...           ...           ...             ...             ...   \n",
718 |       "1742     0.468493      0.073973        0.123288        0.147945   \n",
719 |       "1743     0.410348      0.089206        0.035682        0.017841   \n",
720 |       "1744     0.364133      0.036413        0.018207        0.018207   \n",
721 |       "1745     0.183175      0.081411        0.244233        0.040706   \n",
722 |       "1746     0.161641      0.059552        0.110596        0.144626   \n",
723 |       "\n",
724 |       "      FG_FREQ_2024FT  FG_FREQ_GT24FT  \n",
725 |       "0           0.036757        0.036757  \n",
726 |       "1           0.250211        0.311074  \n",
727 |       "2           0.271132        0.311005  \n",
728 |       "3           0.222672        0.445344  \n",
729 |       "4           0.217195        0.542986  \n",
730 |       "...              ...             ...  \n",
731 |       "1742        0.098630        0.147945  \n",
732 |       "1743        0.124888        0.338983  \n",
733 |       "1744        0.236686        0.564406  \n",
734 |       "1745        0.101764        0.386703  \n",
735 |       "1746        0.195671        0.459401  \n",
736 |       "\n",
737 |       "[1747 rows x 732 columns]\n"
738 |      ]
739 |     }
740 |    ],
741 |    "source": [
742 |     "# Write complete set of combined stats to .csv file and print\n",
743 |     "df.to_csv(\"../CompleteNBAPlayerStats.csv\", index=False)\n",
744 |     "print(\"Combined Table of BBall Player Stats:\\n\\n\", df)\n"
745 |    ]
746 |   },
747 |   {
748 |    "cell_type": "code",
749 |    "execution_count": null,
750 |    "metadata": {},
751 |    "outputs": [],
752 |    "source": []
753 |   }
754 |  ],
755 |  "metadata": {
756 |   "kernelspec": {
757 |    "display_name": "Python 3",
758 |    "language": "python",
759 |    "name": "python3"
760 |   },
761 |   "language_info": {
762 |    "codemirror_mode": {
763 |     "name": "ipython",
764 |     "version": 3
765 |    },
766 |    "file_extension": ".py",
767 |    "mimetype": "text/x-python",
768 |    "name": "python",
769 |    "nbconvert_exporter": "python",
770 |    "pygments_lexer": "ipython3",
771 |    "version": "3.7.3"
772 |   },
773 |   "toc": {
774 |    "base_numbering": 1,
775 |    "nav_menu": {},
776 |    "number_sections": true,
777 |    "sideBar": true,
778 |    "skip_h1_title": false,
779 |    "title_cell": "Table of Contents",
780 |    "title_sidebar": "Contents",
781 |    "toc_cell": true,
782 |    "toc_position": {
783 |     "height": "calc(100% - 180px)",
784 |     "left": "10px",
785 |     "top": "150px",
786 |     "width": "165px"
787 |    },
788 |    "toc_section_display": true,
789 |    "toc_window_display": true
790 |   }
791 |  },
792 |  "nbformat": 4,
793 |  "nbformat_minor": 2
794 | }
795 | 


--------------------------------------------------------------------------------
/Data_Scraping/Generate_Matchup_Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "toc": true
  7 |    },
  8 |    "source": [
  9 |     "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
 10 |     "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Load-Play-By-Play-Data\" data-toc-modified-id=\"Load-Play-By-Play-Data-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Load Play-By-Play Data</a></span></li><li><span><a href=\"#Extract-+/--Information-for-Lineup-Matchups-and-Apply-Cuts\" data-toc-modified-id=\"Extract-+/--Information-for-Lineup-Matchups-and-Apply-Cuts-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Extract +/- Information for Lineup Matchups and Apply Cuts</a></span></li><li><span><a href=\"#Dump-Matchup-Data-to-Files\" data-toc-modified-id=\"Dump-Matchup-Data-to-Files-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Dump Matchup Data to Files</a></span></li></ul></div>"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import numpy as np\n",
 20 |     "import time\n",
 21 |     "import sys\n",
 22 |     "import os\n",
 23 |     "import pandas as pd\n",
 24 |     "from functools import reduce\n",
 25 |     "from operator import itemgetter\n",
 26 |     "import itertools\n",
 27 |     "import re\n",
 28 |     "import matplotlib.pyplot as plt\n",
 29 |     "import matplotlib\n",
 30 |     "from matplotlib.pyplot import cm\n",
 31 |     "from fuzzywuzzy import fuzz \n",
 32 |     "from fuzzywuzzy import process"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 10,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "# Use the player lineups recorded for each possession in \n",
 42 |     "# play-by-play data to calculate the cumulative plus-minus\n",
  43 |     "# for each lineup matchup \"stint\". A stint is a run of consecutive\n",
  44 |     "# play-by-play rows during which neither the home nor the visiting lineup changes.\n",
 45 |     "def ExtractPlusMinus(df):\n",
 46 |     "    ht_pm = []\n",
 47 |     "    vt_pm = []\n",
 48 |     "    ht_poss = []\n",
 49 |     "    vt_poss = []\n",
 50 |     "    pts_scored = []\n",
 51 |     "    for num,game in df.groupby(['game', 'year'], sort=False, as_index=False):\n",
 52 |     "        ht_plus_minus = 0\n",
 53 |     "        vt_plus_minus = 0\n",
 54 |     "        ht_stint_poss = 0\n",
 55 |     "        vt_stint_poss = 0\n",
 56 |     "        ht_last_margin = 0\n",
 57 |     "        prev_hl = []\n",
 58 |     "        prev_vl = []\n",
 59 |     "        for ht,vt,hmarg in zip(game.ht_lineup,game.vt_lineup,game.ht_margin):\n",
 60 |     "            hta = sorted(ht.split(','))\n",
 61 |     "            vta = sorted(vt.split(','))\n",
 62 |     "            # If either home or away lineup has changed\n",
 63 |     "            if hta != prev_hl or vta != prev_vl:\n",
 64 |     "                #print('New home lineup:', hta)\n",
 65 |     "                #print('New visiting lineup:', vta)\n",
 66 |     "                ht_plus_minus = 0\n",
 67 |     "                vt_plus_minus = 0\n",
 68 |     "                ht_stint_poss = 0\n",
 69 |     "                vt_stint_poss = 0\n",
 70 |     "            \n",
 71 |     "            pts_scored.append(abs(hmarg-ht_last_margin))\n",
 72 |     "            ht_plus_minus += (hmarg-ht_last_margin)\n",
 73 |     "            vt_plus_minus += -(hmarg-ht_last_margin)\n",
 74 |     "            #print(ht_last_margin, hmarg, ht_plus_minus, pts_scored)\n",
 75 |     "            ht_pm.append(ht_plus_minus)\n",
 76 |     "            vt_pm.append(vt_plus_minus)\n",
 77 |     "\n",
 78 |     "            ht_stint_poss += 1\n",
 79 |     "            vt_stint_poss += 1\n",
 80 |     "            ht_poss.append(ht_stint_poss)\n",
 81 |     "            vt_poss.append(vt_stint_poss)\n",
 82 |     "            \n",
 83 |     "            prev_hl = hta\n",
 84 |     "            prev_vl = vta\n",
 85 |     "            ht_last_margin = hmarg\n",
 86 |     "        \n",
 87 |     "    df['ht_stint_pm'] = ht_pm\n",
 88 |     "    df['vt_stint_pm'] = vt_pm\n",
 89 |     "    df['ht_stint_poss'] = ht_poss\n",
 90 |     "    df['vt_stint_poss'] = vt_poss\n",
 91 |     "    df['points_scored'] = pts_scored\n",
 92 |     "    return df\n",
 93 |     "\n",
 94 |     "# Grab the data corresponding to the first possession of each lineup matchup stint\n",
 95 |     "def GetMatchupStarts(df):\n",
 96 |     "    dfhead = df.groupby((df[['game', 'year', 'ht_lineup', 'vt_lineup']] != df[['game','year','ht_lineup','vt_lineup']].shift(1)).any(axis=1).cumsum()).head(1).reset_index(drop=True)\n",
 97 |     "    return dfhead\n",
 98 |     "\n",
 99 |     "# Grab the data corresponding to the last possession of each lineup matchup stint\n",
100 |     "# (includes the final plus-minus of the lineup matchup)\n",
101 |     "def GetMatchupEnds(df):\n",
102 |     "    dftail = df.groupby((df[['game', 'year', 'ht_lineup', 'vt_lineup']] != df[['game','year','ht_lineup','vt_lineup']].shift(1)).any(axis=1).cumsum()).tail(1).reset_index(drop=True)\n",
103 |     "    return dftail\n",
104 |     "\n",
105 |     "# Grab the data corresponding to the final row of EACH off/def possession\n",
106 |     "def GetPossessionEnds(df):\n",
107 |     "    dftail = df.groupby((df[['game', 'year', 'ht_poss']] != df[['game','year','ht_poss']].shift(1)).any(axis=1).cumsum()).tail(1).reset_index(drop=True)\n",
108 |     "    return dftail\n",
109 |     "\n",
110 |     "# DEPRECATED -- extremely slow method\n",
111 |     "#def PlayerPlusMinus(df, player):\n",
112 |     "#    df_ht = df[df.ht_lineup.str.contains(player)].time_sec.values\n",
113 |     "#    df_hpm = df[df.ht_lineup.str.contains(player)].ht_stint_pm.values\n",
114 |     "#    df_vt = df[df.vt_lineup.str.contains(player)].time_sec.values\n",
115 |     "#    df_vpm = df[df.vt_lineup.str.contains(player)].vt_stint_pm.values\n",
116 |     "#    return df_ht, df_hpm, df_vt, df_vpm\n"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "# Load Play-By-Play Data"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 3,
129 |    "metadata": {},
130 |    "outputs": [
131 |     {
132 |      "name": "stdout",
133 |      "output_type": "stream",
134 |      "text": [
135 |       "         year  game home_team vis_team  Q  time_sec  ht_score  vt_score  \\\n",
136 |       "0        2017     1       CLE      NYK  1       0.0         0         0   \n",
137 |       "1        2017     1       CLE      NYK  1      20.0         0         2   \n",
138 |       "2        2017     1       CLE      NYK  1      34.0         0         2   \n",
139 |       "3        2017     1       CLE      NYK  1      37.0         0         2   \n",
140 |       "4        2017     1       CLE      NYK  1      44.0         0         2   \n",
141 |       "...       ...   ...       ...      ... ..       ...       ...       ...   \n",
142 |       "1675613  2019  1230       POR      SAC  4    2859.0       136       131   \n",
143 |       "1675614  2019  1230       POR      SAC  4    2859.0       136       131   \n",
144 |       "1675615  2019  1230       POR      SAC  4    2866.0       136       131   \n",
145 |       "1675616  2019  1230       POR      SAC  4    2869.0       136       131   \n",
146 |       "1675617  2019  1230       POR      SAC  4    2880.0       136       131   \n",
147 |       "\n",
148 |       "         ht_margin  vt_margin  ... ht_flagrants vt_flagrants  ht_2PTA  \\\n",
149 |       "0                0          0  ...            0            0        0   \n",
150 |       "1               -2          2  ...            0            0        0   \n",
151 |       "2               -2          2  ...            0            0        1   \n",
152 |       "3               -2          2  ...            0            0        1   \n",
153 |       "4               -2          2  ...            0            0        1   \n",
154 |       "...            ...        ...  ...          ...          ...      ...   \n",
155 |       "1675613          5         -5  ...            0            0       64   \n",
156 |       "1675614          5         -5  ...            0            0       64   \n",
157 |       "1675615          5         -5  ...            0            0       64   \n",
158 |       "1675616          5         -5  ...            0            0       64   \n",
159 |       "1675617          5         -5  ...            0            0       64   \n",
160 |       "\n",
161 |       "         vt_2PTA  ht_3PTA  vt_3PTA ht_2PTM vt_2PTM ht_3PTM vt_3PTM  \n",
162 |       "0              0        0        0       0       0       0       0  \n",
163 |       "1              1        0        0       0       1       0       0  \n",
164 |       "2              1        0        0       0       1       0       0  \n",
165 |       "3              1        0        0       0       1       0       0  \n",
166 |       "4              2        0        0       0       1       0       0  \n",
167 |       "...          ...      ...      ...     ...     ...     ...     ...  \n",
168 |       "1675613       51       24       42      38      31      14      18  \n",
169 |       "1675614       51       24       42      38      31      14      18  \n",
170 |       "1675615       51       24       43      38      31      14      18  \n",
171 |       "1675616       51       24       43      38      31      14      18  \n",
172 |       "1675617       51       24       43      38      31      14      18  \n",
173 |       "\n",
174 |       "[1675618 rows x 36 columns]\n"
175 |      ]
176 |     }
177 |    ],
178 |    "source": [
179 |     "# Load and concatenate all play-by-play data for the 2016-17 through 2018-19 seasons\n",
180 |     "df1 = pd.read_csv('../NBA_PBP_Data_2016_2017.csv', index_col=0)\n",
181 |     "df2 = pd.read_csv('../NBA_PBP_Data_2017_2018.csv', index_col=0)\n",
182 |     "df3 = pd.read_csv('../NBA_PBP_Data_2018_2019.csv', index_col=0)\n",
183 |     "\n",
184 |     "df = pd.concat([df1, df2, df3], ignore_index=True)\n",
185 |     "\n",
186 |     "df.fillna('', inplace=True)\n",
187 |     "\n",
188 |     "print(df)\n"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "markdown",
193 |    "metadata": {},
194 |    "source": [
195 |     "# Extract +/- Information for Lineup Matchups and Apply Cuts"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 6,
201 |    "metadata": {},
202 |    "outputs": [
203 |     {
204 |      "name": "stdout",
205 |      "output_type": "stream",
206 |      "text": [
207 |       "         year  game home_team vis_team  Q  time_sec  ht_score  vt_score  \\\n",
208 |       "0        2017     1       CLE      NYK  1       0.0         0         0   \n",
209 |       "1        2017     1       CLE      NYK  1      20.0         0         2   \n",
210 |       "2        2017     1       CLE      NYK  1      34.0         0         2   \n",
211 |       "3        2017     1       CLE      NYK  1      37.0         0         2   \n",
212 |       "4        2017     1       CLE      NYK  1      44.0         0         2   \n",
213 |       "...       ...   ...       ...      ... ..       ...       ...       ...   \n",
214 |       "1675613  2019  1230       POR      SAC  4    2859.0       136       131   \n",
215 |       "1675614  2019  1230       POR      SAC  4    2859.0       136       131   \n",
216 |       "1675615  2019  1230       POR      SAC  4    2866.0       136       131   \n",
217 |       "1675616  2019  1230       POR      SAC  4    2869.0       136       131   \n",
218 |       "1675617  2019  1230       POR      SAC  4    2880.0       136       131   \n",
219 |       "\n",
220 |       "         ht_margin  vt_margin  ... vt_3PTA ht_2PTM  vt_2PTM  ht_3PTM  vt_3PTM  \\\n",
221 |       "0                0          0  ...       0       0        0        0        0   \n",
222 |       "1               -2          2  ...       0       0        1        0        0   \n",
223 |       "2               -2          2  ...       0       0        1        0        0   \n",
224 |       "3               -2          2  ...       0       0        1        0        0   \n",
225 |       "4               -2          2  ...       0       0        1        0        0   \n",
226 |       "...            ...        ...  ...     ...     ...      ...      ...      ...   \n",
227 |       "1675613          5         -5  ...      42      38       31       14       18   \n",
228 |       "1675614          5         -5  ...      42      38       31       14       18   \n",
229 |       "1675615          5         -5  ...      43      38       31       14       18   \n",
230 |       "1675616          5         -5  ...      43      38       31       14       18   \n",
231 |       "1675617          5         -5  ...      43      38       31       14       18   \n",
232 |       "\n",
233 |       "         ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss points_scored  \n",
234 |       "0                  0           0             1             1             0  \n",
235 |       "1                 -2           2             2             2             2  \n",
236 |       "2                 -2           2             3             3             0  \n",
237 |       "3                 -2           2             4             4             0  \n",
238 |       "4                 -2           2             5             5             0  \n",
239 |       "...              ...         ...           ...           ...           ...  \n",
240 |       "1675613            0           0            41            41             0  \n",
241 |       "1675614            0           0            42            42             0  \n",
242 |       "1675615            0           0            43            43             0  \n",
243 |       "1675616            0           0            44            44             0  \n",
244 |       "1675617            0           0            45            45             0  \n",
245 |       "\n",
246 |       "[1675618 rows x 41 columns]\n"
247 |      ]
248 |     }
249 |    ],
250 |    "source": [
251 |     "# Comb over the PBP dataframe and calculate +/- data \n",
252 |     "# for each new lineup stint, for each game\n",
253 |     "df_new = ExtractPlusMinus(df)\n",
254 |     "print(df_new)\n"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 7,
260 |    "metadata": {},
261 |    "outputs": [
262 |     {
263 |      "name": "stdout",
264 |      "output_type": "stream",
265 |      "text": [
266 |       "         year  game home_team vis_team  Q  time_sec  ht_score  vt_score  \\\n",
267 |       "0        2017     1       CLE      NYK  1       0.0         0         0   \n",
268 |       "1        2017     1       CLE      NYK  1      20.0         0         2   \n",
269 |       "2        2017     1       CLE      NYK  1      34.0         0         2   \n",
270 |       "3        2017     1       CLE      NYK  1      37.0         0         2   \n",
271 |       "4        2017     1       CLE      NYK  1      44.0         0         2   \n",
272 |       "...       ...   ...       ...      ... ..       ...       ...       ...   \n",
273 |       "1675613  2019  1230       POR      SAC  4    2859.0       136       131   \n",
274 |       "1675614  2019  1230       POR      SAC  4    2859.0       136       131   \n",
275 |       "1675615  2019  1230       POR      SAC  4    2866.0       136       131   \n",
276 |       "1675616  2019  1230       POR      SAC  4    2869.0       136       131   \n",
277 |       "1675617  2019  1230       POR      SAC  4    2880.0       136       131   \n",
278 |       "\n",
279 |       "         ht_margin  vt_margin  ... vt_3PTA ht_2PTM  vt_2PTM  ht_3PTM  vt_3PTM  \\\n",
280 |       "0                0          0  ...       0       0        0        0        0   \n",
281 |       "1               -2          2  ...       0       0        1        0        0   \n",
282 |       "2               -2          2  ...       0       0        1        0        0   \n",
283 |       "3               -2          2  ...       0       0        1        0        0   \n",
284 |       "4               -2          2  ...       0       0        1        0        0   \n",
285 |       "...            ...        ...  ...     ...     ...      ...      ...      ...   \n",
286 |       "1675613          5         -5  ...      42      38       31       14       18   \n",
287 |       "1675614          5         -5  ...      42      38       31       14       18   \n",
288 |       "1675615          5         -5  ...      43      38       31       14       18   \n",
289 |       "1675616          5         -5  ...      43      38       31       14       18   \n",
290 |       "1675617          5         -5  ...      43      38       31       14       18   \n",
291 |       "\n",
292 |       "         ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss points_scored  \n",
293 |       "0                  0           0             1             1             0  \n",
294 |       "1                 -2           2             2             2             2  \n",
295 |       "2                 -2           2             3             3             0  \n",
296 |       "3                 -2           2             4             4             0  \n",
297 |       "4                 -2           2             5             5             0  \n",
298 |       "...              ...         ...           ...           ...           ...  \n",
299 |       "1675613            0           0            41            41             0  \n",
300 |       "1675614            0           0            42            42             0  \n",
301 |       "1675615            0           0            43            43             0  \n",
302 |       "1675616            0           0            44            44             0  \n",
303 |       "1675617            0           0            45            45             0  \n",
304 |       "\n",
305 |       "[1591826 rows x 41 columns]\n"
306 |      ]
307 |     }
308 |    ],
309 |    "source": [
310 |     "# Keep only PBP rows where both the home and away lineups list exactly 5 comma-separated players\n",
311 |     "df_new = df_new[df_new.ht_lineup.str.split(',').str.len() == 5]\n",
312 |     "df_new = df_new[df_new.vt_lineup.str.split(',').str.len() == 5]\n",
313 |     "\n",
314 |     "df_new.to_csv('../NBA_PBP_Data_PlusMinus.csv')\n",
315 |     "print(df_new)\n"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 15,
321 |    "metadata": {},
322 |    "outputs": [
323 |     {
324 |      "data": {
325 |       "text/html": [
326 |        "<div>\n",
327 |        "<style scoped>\n",
328 |        "    .dataframe tbody tr th:only-of-type {\n",
329 |        "        vertical-align: middle;\n",
330 |        "    }\n",
331 |        "\n",
332 |        "    .dataframe tbody tr th {\n",
333 |        "        vertical-align: top;\n",
334 |        "    }\n",
335 |        "\n",
336 |        "    .dataframe thead th {\n",
337 |        "        text-align: right;\n",
338 |        "    }\n",
339 |        "</style>\n",
340 |        "<table border=\"1\" class=\"dataframe\">\n",
341 |        "  <thead>\n",
342 |        "    <tr style=\"text-align: right;\">\n",
343 |        "      <th></th>\n",
344 |        "      <th>year</th>\n",
345 |        "      <th>game</th>\n",
346 |        "      <th>home_team</th>\n",
347 |        "      <th>vis_team</th>\n",
348 |        "      <th>time_sec</th>\n",
349 |        "      <th>ht_score</th>\n",
350 |        "      <th>vt_score</th>\n",
351 |        "      <th>points_scored</th>\n",
352 |        "      <th>ht_play</th>\n",
353 |        "      <th>vt_play</th>\n",
354 |        "      <th>ht_poss</th>\n",
355 |        "    </tr>\n",
356 |        "  </thead>\n",
357 |        "  <tbody>\n",
358 |        "    <tr>\n",
359 |        "      <th>0</th>\n",
360 |        "      <td>2017</td>\n",
361 |        "      <td>1</td>\n",
362 |        "      <td>CLE</td>\n",
363 |        "      <td>NYK</td>\n",
364 |        "      <td>20.0</td>\n",
365 |        "      <td>0</td>\n",
366 |        "      <td>2</td>\n",
367 |        "      <td>2</td>\n",
368 |        "      <td></td>\n",
369 |        "      <td>Rose 1' Driving Layup (2 PTS) (Noah 1 AST)</td>\n",
370 |        "      <td>DEF</td>\n",
371 |        "    </tr>\n",
372 |        "    <tr>\n",
373 |        "      <th>1</th>\n",
374 |        "      <td>2017</td>\n",
375 |        "      <td>1</td>\n",
376 |        "      <td>CLE</td>\n",
377 |        "      <td>NYK</td>\n",
378 |        "      <td>37.0</td>\n",
379 |        "      <td>0</td>\n",
380 |        "      <td>2</td>\n",
381 |        "      <td>0</td>\n",
382 |        "      <td></td>\n",
383 |        "      <td>Noah REBOUND (Off:0 Def:1)</td>\n",
384 |        "      <td>OFF</td>\n",
385 |        "    </tr>\n",
386 |        "    <tr>\n",
387 |        "      <th>2</th>\n",
388 |        "      <td>2017</td>\n",
389 |        "      <td>1</td>\n",
390 |        "      <td>CLE</td>\n",
391 |        "      <td>NYK</td>\n",
392 |        "      <td>45.0</td>\n",
393 |        "      <td>0</td>\n",
394 |        "      <td>4</td>\n",
395 |        "      <td>2</td>\n",
396 |        "      <td></td>\n",
397 |        "      <td>Porzingis 2' Tip Layup Shot (2 PTS)</td>\n",
398 |        "      <td>DEF</td>\n",
399 |        "    </tr>\n",
400 |        "    <tr>\n",
401 |        "      <th>3</th>\n",
402 |        "      <td>2017</td>\n",
403 |        "      <td>1</td>\n",
404 |        "      <td>CLE</td>\n",
405 |        "      <td>NYK</td>\n",
406 |        "      <td>61.0</td>\n",
407 |        "      <td>2</td>\n",
408 |        "      <td>4</td>\n",
409 |        "      <td>2</td>\n",
410 |        "      <td>James 11' Jump Shot (2 PTS) (Irving 1 AST)</td>\n",
411 |        "      <td></td>\n",
412 |        "      <td>OFF</td>\n",
413 |        "    </tr>\n",
414 |        "    <tr>\n",
415 |        "      <th>4</th>\n",
416 |        "      <td>2017</td>\n",
417 |        "      <td>1</td>\n",
418 |        "      <td>CLE</td>\n",
419 |        "      <td>NYK</td>\n",
420 |        "      <td>62.0</td>\n",
421 |        "      <td>2</td>\n",
422 |        "      <td>4</td>\n",
423 |        "      <td>0</td>\n",
424 |        "      <td></td>\n",
425 |        "      <td>Rose Out of Bounds Lost Ball Turnover (P1.T1)</td>\n",
426 |        "      <td>DEF</td>\n",
427 |        "    </tr>\n",
428 |        "    <tr>\n",
429 |        "      <th>...</th>\n",
430 |        "      <td>...</td>\n",
431 |        "      <td>...</td>\n",
432 |        "      <td>...</td>\n",
433 |        "      <td>...</td>\n",
434 |        "      <td>...</td>\n",
435 |        "      <td>...</td>\n",
436 |        "      <td>...</td>\n",
437 |        "      <td>...</td>\n",
438 |        "      <td>...</td>\n",
439 |        "      <td>...</td>\n",
440 |        "      <td>...</td>\n",
441 |        "    </tr>\n",
442 |        "    <tr>\n",
443 |        "      <th>720766</th>\n",
444 |        "      <td>2019</td>\n",
445 |        "      <td>1230</td>\n",
446 |        "      <td>POR</td>\n",
447 |        "      <td>SAC</td>\n",
448 |        "      <td>2832.0</td>\n",
449 |        "      <td>135</td>\n",
450 |        "      <td>131</td>\n",
451 |        "      <td>2</td>\n",
452 |        "      <td>Simons 1' Tip Layup Shot (36 PTS)</td>\n",
453 |        "      <td></td>\n",
454 |        "      <td>OFF</td>\n",
455 |        "    </tr>\n",
456 |        "    <tr>\n",
457 |        "      <th>720767</th>\n",
458 |        "      <td>2019</td>\n",
459 |        "      <td>1230</td>\n",
460 |        "      <td>POR</td>\n",
461 |        "      <td>SAC</td>\n",
462 |        "      <td>2846.0</td>\n",
463 |        "      <td>135</td>\n",
464 |        "      <td>131</td>\n",
465 |        "      <td>0</td>\n",
466 |        "      <td>Layman REBOUND (Off:0 Def:4)</td>\n",
467 |        "      <td></td>\n",
468 |        "      <td>DEF</td>\n",
469 |        "    </tr>\n",
470 |        "    <tr>\n",
471 |        "      <th>720768</th>\n",
472 |        "      <td>2019</td>\n",
473 |        "      <td>1230</td>\n",
474 |        "      <td>POR</td>\n",
475 |        "      <td>SAC</td>\n",
476 |        "      <td>2859.0</td>\n",
477 |        "      <td>136</td>\n",
478 |        "      <td>131</td>\n",
479 |        "      <td>0</td>\n",
480 |        "      <td></td>\n",
481 |        "      <td>Swanigan REBOUND (Off:3 Def:4)</td>\n",
482 |        "      <td>OFF</td>\n",
483 |        "    </tr>\n",
484 |        "    <tr>\n",
485 |        "      <th>720769</th>\n",
486 |        "      <td>2019</td>\n",
487 |        "      <td>1230</td>\n",
488 |        "      <td>POR</td>\n",
489 |        "      <td>SAC</td>\n",
490 |        "      <td>2869.0</td>\n",
491 |        "      <td>136</td>\n",
492 |        "      <td>131</td>\n",
493 |        "      <td>0</td>\n",
494 |        "      <td>Labissiere REBOUND (Off:4 Def:11)</td>\n",
495 |        "      <td></td>\n",
496 |        "      <td>DEF</td>\n",
497 |        "    </tr>\n",
498 |        "    <tr>\n",
499 |        "      <th>720770</th>\n",
500 |        "      <td>2019</td>\n",
501 |        "      <td>1230</td>\n",
502 |        "      <td>POR</td>\n",
503 |        "      <td>SAC</td>\n",
504 |        "      <td>2880.0</td>\n",
505 |        "      <td>136</td>\n",
506 |        "      <td>131</td>\n",
507 |        "      <td>0</td>\n",
508 |        "      <td>EOQ</td>\n",
509 |        "      <td>EOQ</td>\n",
510 |        "      <td>OFF</td>\n",
511 |        "    </tr>\n",
512 |        "  </tbody>\n",
513 |        "</table>\n",
514 |        "<p>720771 rows × 11 columns</p>\n",
515 |        "</div>"
516 |       ],
517 |       "text/plain": [
518 |        "        year  game home_team vis_team  time_sec  ht_score  vt_score  \\\n",
519 |        "0       2017     1       CLE      NYK      20.0         0         2   \n",
520 |        "1       2017     1       CLE      NYK      37.0         0         2   \n",
521 |        "2       2017     1       CLE      NYK      45.0         0         4   \n",
522 |        "3       2017     1       CLE      NYK      61.0         2         4   \n",
523 |        "4       2017     1       CLE      NYK      62.0         2         4   \n",
524 |        "...      ...   ...       ...      ...       ...       ...       ...   \n",
525 |        "720766  2019  1230       POR      SAC    2832.0       135       131   \n",
526 |        "720767  2019  1230       POR      SAC    2846.0       135       131   \n",
527 |        "720768  2019  1230       POR      SAC    2859.0       136       131   \n",
528 |        "720769  2019  1230       POR      SAC    2869.0       136       131   \n",
529 |        "720770  2019  1230       POR      SAC    2880.0       136       131   \n",
530 |        "\n",
531 |        "        points_scored                                     ht_play  \\\n",
532 |        "0                   2                                               \n",
533 |        "1                   0                                               \n",
534 |        "2                   2                                               \n",
535 |        "3                   2  James 11' Jump Shot (2 PTS) (Irving 1 AST)   \n",
536 |        "4                   0                                               \n",
537 |        "...               ...                                         ...   \n",
538 |        "720766              2           Simons 1' Tip Layup Shot (36 PTS)   \n",
539 |        "720767              0                Layman REBOUND (Off:0 Def:4)   \n",
540 |        "720768              0                                               \n",
541 |        "720769              0           Labissiere REBOUND (Off:4 Def:11)   \n",
542 |        "720770              0                                         EOQ   \n",
543 |        "\n",
544 |        "                                              vt_play ht_poss  \n",
545 |        "0          Rose 1' Driving Layup (2 PTS) (Noah 1 AST)     DEF  \n",
546 |        "1                          Noah REBOUND (Off:0 Def:1)     OFF  \n",
547 |        "2                 Porzingis 2' Tip Layup Shot (2 PTS)     DEF  \n",
548 |        "3                                                         OFF  \n",
549 |        "4       Rose Out of Bounds Lost Ball Turnover (P1.T1)     DEF  \n",
550 |        "...                                               ...     ...  \n",
551 |        "720766                                                    OFF  \n",
552 |        "720767                                                    DEF  \n",
553 |        "720768                 Swanigan REBOUND (Off:3 Def:4)     OFF  \n",
554 |        "720769                                                    DEF  \n",
555 |        "720770                                            EOQ     OFF  \n",
556 |        "\n",
557 |        "[720771 rows x 11 columns]"
558 |       ]
559 |      },
560 |      "execution_count": 15,
561 |      "metadata": {},
562 |      "output_type": "execute_result"
563 |     }
564 |    ],
565 |    "source": [
566 |     "# Break play-by-play data into units of possessions, then\n",
567 |     "# store in a file to be loaded for modeling\n",
568 |     "df_poss = GetPossessionEnds(df_new)\n",
569 |     "df_poss.to_csv('../NBA_PBP_Data_Possessions.csv')\n",
570 |     "df_poss[['year', 'game', 'home_team', 'vis_team', 'time_sec', 'ht_score', 'vt_score', 'points_scored', 'ht_play', 'vt_play', 'ht_poss']]\n"
571 |    ]
572 |   },
573 |   {
574 |    "cell_type": "code",
575 |    "execution_count": 6,
576 |    "metadata": {},
577 |    "outputs": [
578 |     {
579 |      "name": "stdout",
580 |      "output_type": "stream",
581 |      "text": [
582 |       "       year  game home_team vis_team  Q  time_sec  ht_score  vt_score  \\\n",
583 |       "0      2017     1       CLE      NYK  1      37.0         0         2   \n",
584 |       "1      2017     1       CLE      NYK  1     415.0        14        12   \n",
585 |       "2      2017     1       CLE      NYK  1     491.0        17        14   \n",
586 |       "3      2017     1       CLE      NYK  1     557.0        19        16   \n",
587 |       "4      2017     1       CLE      NYK  1     632.0        23        16   \n",
588 |       "...     ...   ...       ...      ... ..       ...       ...       ...   \n",
589 |       "98072  2019  1230       POR      SAC  4    2256.0       104       115   \n",
590 |       "98073  2019  1230       POR      SAC  4    2400.0       113       117   \n",
591 |       "98074  2019  1230       POR      SAC  4    2429.0       116       117   \n",
592 |       "98075  2019  1230       POR      SAC  4    2565.0       122       117   \n",
593 |       "98076  2019  1230       POR      SAC  4    2624.0       124       119   \n",
594 |       "\n",
595 |       "       ht_margin  vt_margin  ... vt_3PTA ht_2PTM  vt_2PTM  ht_3PTM  vt_3PTM  \\\n",
596 |       "0             -2          2  ...       0       0        1        0        0   \n",
597 |       "1              2         -2  ...       2       5        4        1        1   \n",
598 |       "2              3         -3  ...       2       6        5        1        1   \n",
599 |       "3              3         -3  ...       2       7        6        1        1   \n",
600 |       "4              7         -7  ...       2       9        6        1        1   \n",
601 |       "...          ...        ...  ...     ...     ...      ...      ...      ...   \n",
602 |       "98072        -11         11  ...      32      28       26       11       17   \n",
603 |       "98073         -4          4  ...      33      31       27       12       17   \n",
604 |       "98074         -1          1  ...      34      31       27       13       17   \n",
605 |       "98075          5         -5  ...      38      34       27       13       17   \n",
606 |       "98076          5         -5  ...      39      35       28       13       17   \n",
607 |       "\n",
608 |       "       ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss total_poss  \n",
609 |       "0               -2           2             4             4         99  \n",
610 |       "1                0           0             4             4         12  \n",
611 |       "2                0           0             4             4         11  \n",
612 |       "3                0           0             4             4         14  \n",
613 |       "4                0           0             4             4          4  \n",
614 |       "...            ...         ...           ...           ...        ...  \n",
615 |       "98072            2          -2             4             4         17  \n",
616 |       "98073            2          -2             4             4          6  \n",
617 |       "98074            0           0             4             4         18  \n",
618 |       "98075            2          -2             4             4          7  \n",
619 |       "98076            0           0             4             4         45  \n",
620 |       "\n",
621 |       "[98077 rows x 41 columns]        year  game home_team vis_team  Q  time_sec  ht_score  vt_score  \\\n",
622 |       "0      2017     1       CLE      NYK  1     395.0        14        12   \n",
623 |       "1      2017     1       CLE      NYK  1     455.0        15        12   \n",
624 |       "2      2017     1       CLE      NYK  1     536.0        19        16   \n",
625 |       "3      2017     1       CLE      NYK  1     620.0        23        16   \n",
626 |       "4      2017     1       CLE      NYK  1     632.0        23        16   \n",
627 |       "...     ...   ...       ...      ... ..       ...       ...       ...   \n",
628 |       "98072  2019  1230       POR      SAC  4    2371.0       111       117   \n",
629 |       "98073  2019  1230       POR      SAC  4    2407.0       116       117   \n",
630 |       "98074  2019  1230       POR      SAC  4    2517.0       120       117   \n",
631 |       "98075  2019  1230       POR      SAC  4    2579.0       122       117   \n",
632 |       "98076  2019  1230       POR      SAC  4    2880.0       136       131   \n",
633 |       "\n",
634 |       "       ht_margin  vt_margin  ... vt_3PTA ht_2PTM  vt_2PTM  ht_3PTM  vt_3PTM  \\\n",
635 |       "0              2         -2  ...       2       5        4        1        1   \n",
636 |       "1              3         -3  ...       2       5        4        1        1   \n",
637 |       "2              3         -3  ...       2       7        6        1        1   \n",
638 |       "3              7         -7  ...       2       9        6        1        1   \n",
639 |       "4              7         -7  ...       2       9        6        1        1   \n",
640 |       "...          ...        ...  ...     ...     ...      ...      ...      ...   \n",
641 |       "98072         -6          6  ...      32      30       27       12       17   \n",
642 |       "98073         -1          1  ...      33      31       27       13       17   \n",
643 |       "98074          3         -3  ...      37      33       27       13       17   \n",
644 |       "98075          5         -5  ...      38      34       27       13       17   \n",
645 |       "98076          5         -5  ...      43      38       31       14       18   \n",
646 |       "\n",
647 |       "       ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss total_poss  \n",
648 |       "0                2          -2            63            63         99  \n",
649 |       "1                1          -1            12            12         12  \n",
650 |       "2                0           0            11            11         11  \n",
651 |       "3                4          -4            14            14         14  \n",
652 |       "4                0           0             4             4          4  \n",
653 |       "...            ...         ...           ...           ...        ...  \n",
654 |       "98072            7          -7            17            17         17  \n",
655 |       "98073            5          -5             6             6          6  \n",
656 |       "98074            4          -4            18            18         18  \n",
657 |       "98075            2          -2             7             7          7  \n",
658 |       "98076            0           0            45            45         45  \n",
659 |       "\n",
660 |       "[98077 rows x 41 columns]\n"
661 |      ]
662 |     }
663 |    ],
664 |    "source": [
665 |     "# (Disabled) For a single game, this gives the starts of offensive possessions for the home and visiting teams\n",
666 |     "#htm_off_starts = np.unique(df[df['ht_poss'] == 'OFF'].time_sec.values.astype(float) - df[df['ht_poss'] == 'OFF'].ht_time_off.values.astype(float))\n",
667 |     "#vtm_off_starts = np.unique(df[df['vt_poss'] == 'OFF'].time_sec.values.astype(float) - df[df['vt_poss'] == 'OFF'].vt_time_off.values.astype(float))\n",
668 |     "#print(htm_off_starts, vtm_off_starts)\n",
669 |     "\n",
670 |     "# Intended to remove, for each game, PBP rows of lineup matchups with too few possessions\n",
671 |     "# (substitution-only plays, etc.). NOTE(review): the cut below uses ht_stint_poss, not total_poss -- confirm\n",
672 |     "min_num_poss = 3\n",
673 |     "df_newnew = df_new\n",
674 |     "df_newnew['total_poss'] = df_new.groupby(['game', 'year', 'ht_lineup', 'vt_lineup'], sort=False)['ht_stint_poss'].transform('count')\n",
675 |     "df_newnew = df_newnew[df_newnew['ht_stint_poss'] > min_num_poss]\n",
676 |     "df = df_newnew\n",
677 |     "#print(df)\n",
678 |     "\n",
679 |     "# Get rows corresponding to start of new lineup matchup (+/- = 0)\n",
680 |     "dfs = GetMatchupStarts(df)\n",
681 |     "# Get rows corresponding to end of lineup matchup (final +/-)\n",
682 |     "dfe = GetMatchupEnds(df)\n",
683 |     "# Ensure each stint meets minimum possession requirement\n",
684 |     "dfs = dfs[dfe.ht_stint_poss > min_num_poss]\n",
685 |     "dfe = dfe[dfe.ht_stint_poss > min_num_poss]\n",
686 |     "print(dfs, dfe)\n",
687 |     "\n",
688 |     "# Time average of all play-by-play data (over all games and lineups)\n",
689 |     "dfm = df.groupby(['time_sec'], as_index=False).mean()\n",
690 |     "#print(dfm)\n",
691 |     "\n",
692 |     "# Time average of play-by-play data for each home,away lineup matchup\n",
693 |     "dfta = df.groupby(['time_sec', 'game', 'year', 'ht_lineup', 'vt_lineup'], as_index=False).mean()\n",
694 |     "#print(dfta)\n",
695 |     "\n",
696 |     "# Play-by-play data grouped and averaged by quarter\n",
697 |     "df_by_quarter = df.groupby(['Q'], as_index=False).mean()\n",
698 |     "#print(df_by_quarter)\n",
699 |     "\n",
700 |     "# PBP data grouped by home and away teams for team-specific time analysis\n",
701 |     "dfht = df.groupby(['home_team', 'time_sec'], as_index=False).mean()\n",
702 |     "dfvt = df.groupby(['vis_team', 'time_sec'], as_index=False).mean()\n",
703 |     "#print(dfht, dfht.ht_margin)\n",
704 |     "#print(dfvt, dfvt.vt_margin)\n",
705 |     "#print(dfht.ht_margin-dfvt.vt_margin)\n",
706 |     "#print(dfht, dfvt)\n",
707 |     "#for name in np.unique(dfht.home_team.values): \n",
708 |     "#    print(name)\n",
709 |     "\n",
710 |     "########## DEPRECATED\n",
711 |     "# Find the PBP rows corresponding to the last possession of each\n",
712 |     "# lineup matchup, for each game (yielding the cumulative +/- for that lineup)\n",
713 |     "##dfma = df_new.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).tail(1)\n",
714 |     "#idx = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], sort=False)['ht_stint_poss'].transform('max') == df_newnew['ht_stint_poss']\n",
715 |     "#dfma = df_newnew[idx]\n",
716 |     "#print(dfma)\n",
717 |     "#print(dfma.ht_lineup)\n",
718 |     "#print(dfma.vt_lineup)\n",
719 |     "#print(dfma.ht_stint_pm)\n",
720 |     "\n",
721 |     "# Mean, first row, and last row of lineups for each game\n",
722 |     "#dfl = df_newnew.groupby(['ht_lineup', 'vt_lineup'], as_index=False).mean()\n",
723 |     "#dfhl = df_newnew.groupby(['ht_lineup'], as_index=False).mean()\n",
724 |     "#dfvl = df_newnew.groupby(['vt_lineup'], as_index=False).mean()\n",
725 |     "#dfhs = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).head(1)\n",
726 |     "#dfhs = dfhs.groupby(['ht_lineup'], as_index=False).mean()\n",
727 |     "#dfvs = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).head(1)\n",
728 |     "#dfvs = dfvs.groupby(['vt_lineup'], as_index=False).mean()\n",
729 |     "#dfhe = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).tail(1)\n",
730 |     "#dfhe = dfhe.groupby(['ht_lineup'], as_index=False).mean()\n",
731 |     "#dfve = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).tail(1)\n",
732 |     "#dfve = dfve.groupby(['vt_lineup'], as_index=False).mean()\n",
733 |     "##########\n"
734 |    ]
735 |   },
736 |   {
737 |    "cell_type": "markdown",
738 |    "metadata": {},
739 |    "source": [
740 |     "# Dump Matchup Data to Files"
741 |    ]
742 |   },
743 |   {
744 |    "cell_type": "code",
745 |    "execution_count": 7,
746 |    "metadata": {},
747 |    "outputs": [
748 |     {
749 |      "name": "stderr",
750 |      "output_type": "stream",
751 |      "text": [
752 |       "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
753 |       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
754 |       "Try using .loc[row_indexer,col_indexer] = value instead\n",
755 |       "\n",
756 |       "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
757 |       "  # Remove the CWD from sys.path while we load stuff.\n",
758 |       "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
759 |       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
760 |       "Try using .loc[row_indexer,col_indexer] = value instead\n",
761 |       "\n",
762 |       "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
763 |       "  # This is added back by InteractiveShellApp.init_path()\n"
764 |      ]
765 |     },
766 |     {
767 |      "name": "stdout",
768 |      "output_type": "stream",
769 |      "text": [
770 |       "       year  game                                          ht_lineup  \\\n",
771 |       "0      2017     1  JR Smith,Kevin Love,Kyrie Irving,LeBron James,...   \n",
772 |       "1      2017     1  JR Smith,Kyrie Irving,LeBron James,Richard Jef...   \n",
773 |       "2      2017     1  Iman Shumpert,Kyrie Irving,LeBron James,Richar...   \n",
774 |       "3      2017     1  Iman Shumpert,Kevin Love,Mike Dunleavy,Richard...   \n",
775 |       "4      2017     1  Iman Shumpert,Kevin Love,Mike Dunleavy,Richard...   \n",
776 |       "...     ...   ...                                                ...   \n",
777 |       "98072  2019  1230  Jake Layman,Meyers Leonard,Gary Trent Jr.,Anfe...   \n",
778 |       "98073  2019  1230  Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey...   \n",
779 |       "98074  2019  1230  Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey...   \n",
780 |       "98075  2019  1230  Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey...   \n",
781 |       "98076  2019  1230  Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey...   \n",
782 |       "\n",
783 |       "                                               vt_lineup  ht_stint_pm  \\\n",
784 |       "0      Carmelo Anthony,Courtney Lee,Derrick Rose,Joak...            2   \n",
785 |       "1      Brandon Jennings,Carmelo Anthony,Courtney Lee,...            1   \n",
786 |       "2      Brandon Jennings,Carmelo Anthony,Courtney Lee,...            0   \n",
787 |       "3      Brandon Jennings,Carmelo Anthony,Courtney Lee,...            4   \n",
788 |       "4      Brandon Jennings,Courtney Lee,Justin Holiday,K...            0   \n",
789 |       "...                                                  ...          ...   \n",
790 |       "98072  Bogdan Bogdanovic,Corey Brewer,Frank Mason,Mar...            7   \n",
791 |       "98073  Bogdan Bogdanovic,Corey Brewer,Frank Mason,Mar...            5   \n",
792 |       "98074  Bogdan Bogdanovic,Corey Brewer,Frank Mason,Mar...            4   \n",
793 |       "98075  BJ Johnson,Corey Brewer,Frank Mason,Marvin Bag...            2   \n",
794 |       "98076  BJ Johnson,Caleb Swanigan,Corey Brewer,Frank M...            0   \n",
795 |       "\n",
796 |       "       vt_stint_pm  ht_stint_poss   ht_pm_ph   vt_pm_ph  \n",
797 |       "0               -2             63   3.174603  -3.174603  \n",
798 |       "1               -1             12   8.333333  -8.333333  \n",
799 |       "2                0             11   0.000000   0.000000  \n",
800 |       "3               -4             14  28.571429 -28.571429  \n",
801 |       "4                0              4   0.000000   0.000000  \n",
802 |       "...            ...            ...        ...        ...  \n",
803 |       "98072           -7             17  41.176471 -41.176471  \n",
804 |       "98073           -5              6  83.333333 -83.333333  \n",
805 |       "98074           -4             18  22.222222 -22.222222  \n",
806 |       "98075           -2              7  28.571429 -28.571429  \n",
807 |       "98076            0             45   0.000000   0.000000  \n",
808 |       "\n",
809 |       "[98077 rows x 9 columns]\n"
810 |      ]
811 |     }
812 |    ],
813 |    "source": [
814 |     "# Write lineup matchup +/- data to files (full table: every dfe column plus +/- per 100 possessions)\n",
815 |     "dfpm = dfe.copy()\n",
816 |     "dfpm['ht_pm_ph'] = dfpm.ht_stint_pm.values*100./dfpm.ht_stint_poss.values\n",
817 |     "dfpm['vt_pm_ph'] = dfpm.vt_stint_pm.values*100./dfpm.ht_stint_poss.values  # both rates are normalized by ht_stint_poss\n",
818 |     "#dfpm = dfpm.groupby(['ht_lineup', 'vt_lineup'], as_index=False).mean()\n",
819 |     "dfpm.to_csv('../NBA_Full_Matchup_Data.csv')\n",
820 |     "\n",
821 |     "# More simplified dataframe (.copy() detaches the selection from dfe, so the column assignments below no longer trigger SettingWithCopyWarning)\n",
822 |     "dfpm = dfe[['year', 'game', 'ht_lineup', 'vt_lineup', 'ht_stint_pm', 'vt_stint_pm', 'ht_stint_poss']].copy()\n",
823 |     "dfpm['ht_pm_ph'] = dfpm.ht_stint_pm.values*100./dfpm.ht_stint_poss.values\n",
824 |     "dfpm['vt_pm_ph'] = dfpm.vt_stint_pm.values*100./dfpm.ht_stint_poss.values\n",
825 |     "#dfpm = dfpm.groupby(['ht_lineup', 'vt_lineup'], as_index=False).mean()\n",
826 |     "dfpm.to_csv('../NBA_Matchup_PlusMinus.csv')\n",
827 |     "print(dfpm)\n"
828 |    ]
829 |   },
830 |   {
831 |    "cell_type": "code",
832 |    "execution_count": null,
833 |    "metadata": {},
834 |    "outputs": [],
835 |    "source": []
836 |   }
837 |  ],
838 |  "metadata": {
839 |   "kernelspec": {
840 |    "display_name": "Python 3",
841 |    "language": "python",
842 |    "name": "python3"
843 |   },
844 |   "language_info": {
845 |    "codemirror_mode": {
846 |     "name": "ipython",
847 |     "version": 3
848 |    },
849 |    "file_extension": ".py",
850 |    "mimetype": "text/x-python",
851 |    "name": "python",
852 |    "nbconvert_exporter": "python",
853 |    "pygments_lexer": "ipython3",
854 |    "version": "3.7.7"
855 |   },
856 |   "toc": {
857 |    "base_numbering": 1,
858 |    "nav_menu": {},
859 |    "number_sections": true,
860 |    "sideBar": true,
861 |    "skip_h1_title": false,
862 |    "title_cell": "Table of Contents",
863 |    "title_sidebar": "Contents",
864 |    "toc_cell": true,
865 |    "toc_position": {
866 |     "height": "calc(100% - 180px)",
867 |     "left": "10px",
868 |     "top": "150px",
869 |     "width": "166px"
870 |    },
871 |    "toc_section_display": true,
872 |    "toc_window_display": true
873 |   }
874 |  },
875 |  "nbformat": 4,
876 |  "nbformat_minor": 2
877 | }
878 | 


--------------------------------------------------------------------------------
/Data_Scraping/Scrape_NBA_DraftCombine_Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import urllib.request\n",
 11 |     "from selenium import webdriver\n",
 12 |     "from selenium.webdriver.support.ui import Select\n",
 13 |     "from selenium.webdriver.support import expected_conditions as EC\n",
 14 |     "from selenium.webdriver.common.by import By\n",
 15 |     "from selenium.webdriver.support.ui import WebDriverWait\n",
 16 |     "from selenium.common.exceptions import TimeoutException\n",
 17 |     "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n",
 18 |     "from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
 19 |     "import lxml.html\n",
 20 |     "from lxml import etree\n",
 21 |     "import time\n",
 22 |     "import pandas as pd\n",
 23 |     "from functools import reduce"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 2,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "# Return a typed copy of the combine dataframe (the input is not modified); '-' placeholders become NaN\n",
 33 |     "def ConvertDataFrame(df):\n",
 34 |     "    df = df.copy()  # work on a copy so the caller's dataframe keeps its original dtypes\n",
 35 |     "    new_df = df.loc[:, df.columns != 'name']\n",
 36 |     "    df[new_df.columns] = new_df.mask(new_df == '-').astype(float)  # astype(float) would raise on a literal '-'\n",
 37 |     "    df['name'] = df['name'].astype('str')\n",
 38 |     "    df['draft_year'] = df['draft_year'].astype('int')\n",
 39 |     "    return df.drop_duplicates(subset=['name'], keep=False)  # keep=False drops every row of a repeated name"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 3,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# Scrape draft combine data from NBA.com; years[i] labels every row scraped from urls[i]. Returns np.array(arr), one row per kept player.\n",
 49 |     "def FetchCombineAnthroTables(urls, years):\n",
 50 |     "    # Create a headless Firefox browser instance\n",
 51 |     "    opt = FirefoxOptions()\n",
 52 |     "    opt.add_argument(\"--headless\")\n",
 53 |     "    driver = webdriver.Firefox(options=opt)\n",
 54 |     "    \n",
 55 |     "    arr = []\n",
 56 |     "    try:  # the finally clause below quits the browser even if a page load or wait raises\n",
 57 |     "        for i,url in enumerate(urls):\n",
 58 |     "            year = years[i]\n",
 59 |     "            print(\"Fetching NBA Combine measurements from Year\", year, \"...\")\n",
 60 |     "            driver.get(url)\n",
 61 |     "            wait = WebDriverWait(driver, 30)\n",
 62 |     "            wait.until(EC.presence_of_element_located((By.XPATH, \"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")))\n",
 63 |     "            driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n",
 64 |     "            # find_elements(By.XPATH, ...) also works on Selenium 4, which removed find_elements_by_xpath\n",
 65 |     "            results = driver.find_elements(By.XPATH, \"//*[@class='nba-stat-table__overflow']/table/tbody/tr\")\n",
 66 |     "        \n",
 67 |     "            counter = 0\n",
 68 |     "            for result in results:\n",
 69 |     "                item = result.text\n",
 70 |     "                data = item.split()\n",
 71 |     "                # Skip rows that have a '-' anywhere in their last 7 tokens\n",
 72 |     "                last_str = ''.join(data[-7:])\n",
 73 |     "                if '-' in last_str:\n",
 74 |     "                    continue\n",
 75 |     "                # Only rows of 13 or 15 whitespace-separated tokens are parsed\n",
 76 |     "                if len(data) != 13 and len(data) != 15:\n",
 77 |     "                    continue\n",
 78 |     "                if len(data) == 15:\n",
 79 |     "                    # Delete two tokens so the row has the same 13-token layout as the short form\n",
 80 |     "                    del data[-7]\n",
 81 |     "                    del data[-6]\n",
 82 |     "                # Join everything before the last 11 tokens into a single first field\n",
 83 |     "                data[0:-11] = [' '.join(data[0:-11])]\n",
 84 |     "                del data[1]\n",
 85 |     "                data = [s.strip('%') for s in data]\n",
 86 |     "                data = [s.strip('\\'') for s in data]\n",
 87 |     "                data[4:] = [float(f) for f in data[4:]]\n",
 88 |     "                # Merge pairs of adjacent values as first*12 + second (presumably feet and inches -> inches)\n",
 89 |     "                data[4:5] = [data[4]*12.+data[5]]\n",
 90 |     "                del data[5]\n",
 91 |     "                data[5:6] = [data[5]*12.+data[6]]\n",
 92 |     "                del data[6]\n",
 93 |     "                data[7:8] = [data[7]*12.+data[8]]\n",
 94 |     "                del data[8]\n",
 95 |     "                data.insert(1, int(year))\n",
 96 |     "                arr.append(data)\n",
 97 |     "                counter += 1\n",
 98 |     "            \n",
 99 |     "            print(\"Fetched stats for\", counter, \"NBA draft players.\")\n",
100 |     "            time.sleep(1)\n",
101 |     "        \n",
102 |     "        time.sleep(1)\n",
103 |     "    finally:\n",
104 |     "        driver.quit()\n",
105 |     "    return np.array(arr)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 4,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "# NBA Draft Combine seasons to fetch: ya/yb hold the zero-padded two-digit labels 00..19 / 01..20, years the 4-digit form of yb\n",
115 |     "ya = [\"{:02d}\".format(n) for n in range(20)]\n",
116 |     "yb = [\"{:02d}\".format(n + 1) for n in range(20)]\n",
117 |     "years = [2000 + int(y) for y in yb]"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 5,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "Fetching NBA Combine measurements from Year 2001 ...\n",
130 |       "Fetched stats for 64 NBA draft players.\n",
131 |       "Fetching NBA Combine measurements from Year 2002 ...\n",
132 |       "Fetched stats for 78 NBA draft players.\n",
133 |       "Fetching NBA Combine measurements from Year 2003 ...\n",
134 |       "Fetched stats for 81 NBA draft players.\n",
135 |       "Fetching NBA Combine measurements from Year 2004 ...\n",
136 |       "Fetched stats for 78 NBA draft players.\n",
137 |       "Fetching NBA Combine measurements from Year 2005 ...\n",
138 |       "Fetched stats for 79 NBA draft players.\n",
139 |       "Fetching NBA Combine measurements from Year 2006 ...\n",
140 |       "Fetched stats for 78 NBA draft players.\n",
141 |       "Fetching NBA Combine measurements from Year 2007 ...\n",
142 |       "Fetched stats for 76 NBA draft players.\n",
143 |       "Fetching NBA Combine measurements from Year 2008 ...\n",
144 |       "Fetched stats for 76 NBA draft players.\n",
145 |       "Fetching NBA Combine measurements from Year 2009 ...\n",
146 |       "Fetched stats for 74 NBA draft players.\n",
147 |       "Fetching NBA Combine measurements from Year 2010 ...\n",
148 |       "Fetched stats for 46 NBA draft players.\n",
149 |       "Fetching NBA Combine measurements from Year 2011 ...\n",
150 |       "Fetched stats for 52 NBA draft players.\n",
151 |       "Fetching NBA Combine measurements from Year 2012 ...\n",
152 |       "Fetched stats for 54 NBA draft players.\n",
153 |       "Fetching NBA Combine measurements from Year 2013 ...\n",
154 |       "Fetched stats for 61 NBA draft players.\n",
155 |       "Fetching NBA Combine measurements from Year 2014 ...\n",
156 |       "Fetched stats for 61 NBA draft players.\n",
157 |       "Fetching NBA Combine measurements from Year 2015 ...\n",
158 |       "Fetched stats for 54 NBA draft players.\n",
159 |       "Fetching NBA Combine measurements from Year 2016 ...\n",
160 |       "Fetched stats for 57 NBA draft players.\n",
161 |       "Fetching NBA Combine measurements from Year 2017 ...\n",
162 |       "Fetched stats for 54 NBA draft players.\n",
163 |       "Fetching NBA Combine measurements from Year 2018 ...\n",
164 |       "Fetched stats for 59 NBA draft players.\n",
165 |       "Fetching NBA Combine measurements from Year 2019 ...\n",
166 |       "Fetched stats for 69 NBA draft players.\n",
167 |       "Fetching NBA Combine measurements from Year 2020 ...\n",
168 |       "Fetched stats for 66 NBA draft players.\n",
169 |       "                        name draft_year   BFP hand_length hand_width height  \\\n",
170 |       "0                Malik Allen       2001     -           -          -  80.25   \n",
171 |       "1           Harold Arceneaux       2001     -           -          -   76.5   \n",
172 |       "2              Lamont Barnes       2001     -           -          -   80.5   \n",
173 |       "3                Mario Bland       2001     -           -          -   77.5   \n",
174 |       "4              Primoz Brezec       2001     -           -          -  84.75   \n",
175 |       "...                      ...        ...   ...         ...        ...    ...   \n",
176 |       "1312  Quinndary Weatherspoon       2020  6.10        8.50       9.00   75.0   \n",
177 |       "1313              Coby White       2020  4.30        7.75       9.00   75.5   \n",
178 |       "1314             Kris Wilkes       2020  4.90        8.50       9.50  78.25   \n",
179 |       "1315          Grant Williams       2020  5.40        9.00      10.50  77.75   \n",
180 |       "1316           Dylan Windler       2020  4.60        8.25       9.50  78.25   \n",
181 |       "\n",
182 |       "      reach weight wingspan  \n",
183 |       "0     109.0  271.0     86.5  \n",
184 |       "1     103.0  219.0     80.5  \n",
185 |       "2     108.0  235.5     87.5  \n",
186 |       "3     103.0  287.0     84.0  \n",
187 |       "4     110.0  243.0     86.0  \n",
188 |       "...     ...    ...      ...  \n",
189 |       "1312  100.0  206.6     81.0  \n",
190 |       "1313   97.5  191.4     77.0  \n",
191 |       "1314  103.0  208.8    82.75  \n",
192 |       "1315  104.5  240.2    81.75  \n",
193 |       "1316  104.5  195.8     82.0  \n",
194 |       "\n",
195 |       "[1317 rows x 9 columns]\n"
196 |      ]
197 |     }
198 |    ],
199 |    "source": [
200 |     "# Build one combine-anthro URL per season (SeasonYear=20<ya>-<yb>), fetch every page into a 2D array,\n",
201 |     "# and load that array into a Pandas dataframe with named columns\n",
202 |     "#urls = [ 'https://stats.nba.com/draft/combine-anthro/?SeasonYear=2006-07' ]\n",
203 |     "urls = [ \"https://stats.nba.com/draft/combine-anthro/?SeasonYear=20{0}-{1}\".format(ya[i], yb[i]) for i, _ in enumerate(ya) ]\n",
204 |     "#print(urls)\n",
205 |     "np_arr = FetchCombineAnthroTables(urls=urls, years=years)\n",
206 |     "#print(np_arr)\n",
207 |     "df = pd.DataFrame(data=np_arr, columns=['name', 'draft_year', 'BFP', 'hand_length', 'hand_width', 'height', 'reach', 'weight', 'wingspan'])\n",
208 |     "print(df)"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 6,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "df.to_csv(path_or_buf=\"NBACombineStats.csv\")  # persist the scraped combine table"
218 |    ]
219 |   }
220 |  ],
221 |  "metadata": {
222 |   "kernelspec": {
223 |    "display_name": "Python 3",
224 |    "language": "python",
225 |    "name": "python3"
226 |   },
227 |   "language_info": {
228 |    "codemirror_mode": {
229 |     "name": "ipython",
230 |     "version": 3
231 |    },
232 |    "file_extension": ".py",
233 |    "mimetype": "text/x-python",
234 |    "name": "python",
235 |    "nbconvert_exporter": "python",
236 |    "pygments_lexer": "ipython3",
237 |    "version": "3.7.3"
238 |   },
239 |   "toc": {
240 |    "base_numbering": 1,
241 |    "nav_menu": {},
242 |    "number_sections": true,
243 |    "sideBar": false,
244 |    "skip_h1_title": false,
245 |    "title_cell": "Table of Contents",
246 |    "title_sidebar": "Contents",
247 |    "toc_cell": true,
248 |    "toc_position": {},
249 |    "toc_section_display": true,
250 |    "toc_window_display": false
251 |   }
252 |  },
253 |  "nbformat": 4,
254 |  "nbformat_minor": 2
255 | }
256 | 


--------------------------------------------------------------------------------
/Data_Scraping/Scrape_NBA_PlayType_Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "toc": true
  7 |    },
  8 |    "source": [
  9 |     "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
 10 |     "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Scrape-Player-Data-for-Transition-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Transition-Plays-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Scrape Player Data for Transition Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Isolation-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Isolation-Plays-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Scrape Player Data for Isolation Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Pick-and-Roll-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Pick-and-Roll-Plays-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Scrape Player Data for Pick-and-Roll Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Post-Up-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Post-Up-Plays-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Scrape Player Data for Post-Up Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Spot-Up-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Spot-Up-Plays-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Scrape Player Data for Spot-Up Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Hand-Off-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Hand-Off-Plays-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Scrape Player Data for Hand-Off Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Cutting-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Cutting-Plays-7\"><span class=\"toc-item-num\">7&nbsp;&nbsp;</span>Scrape Player Data for Cutting Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Off-Screen-Plays\" data-toc-modified-id=\"Scrape-Player-Data-for-Off-Screen-Plays-8\"><span class=\"toc-item-num\">8&nbsp;&nbsp;</span>Scrape Player Data for Off-Screen Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Put-Back-Plays\" 
data-toc-modified-id=\"Scrape-Player-Data-for-Put-Back-Plays-9\"><span class=\"toc-item-num\">9&nbsp;&nbsp;</span>Scrape Player Data for Put-Back Plays</a></span></li><li><span><a href=\"#Scrape-Player-Data-for-Miscellaneous-Play-Types\" data-toc-modified-id=\"Scrape-Player-Data-for-Miscellaneous-Play-Types-10\"><span class=\"toc-item-num\">10&nbsp;&nbsp;</span>Scrape Player Data for Miscellaneous Play-Types</a></span></li><li><span><a href=\"#Merge-and-Store-Dataframes\" data-toc-modified-id=\"Merge-and-Store-Dataframes-11\"><span class=\"toc-item-num\">11&nbsp;&nbsp;</span>Merge and Store Dataframes</a></span></li></ul></div>"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import numpy as np\n",
 20 |     "import urllib.request\n",
 21 |     "from selenium import webdriver\n",
 22 |     "from selenium.webdriver.support.ui import Select\n",
 23 |     "from selenium.webdriver.support import expected_conditions as EC\n",
 24 |     "from selenium.webdriver.common.by import By\n",
 25 |     "from selenium.webdriver.support.ui import WebDriverWait\n",
 26 |     "from selenium.common.exceptions import TimeoutException\n",
 27 |     "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n",
 28 |     "from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
 29 |     "import lxml.html\n",
 30 |     "from lxml import etree\n",
 31 |     "import re\n",
 32 |     "import time\n",
 33 |     "import pandas as pd\n",
 34 |     "from functools import reduce"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 2,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "# Return a typed copy of df (the input is not modified): non-'name' columns as float, 'year' as int, rows sharing (name, year) averaged\n",
 44 |     "def ConvertDataFrame(df):\n",
 45 |     "    df = df.copy()  # work on a copy so the caller's dataframe keeps its original dtypes\n",
 46 |     "    new_df = df.loc[:, df.columns != 'name'].astype(float)\n",
 47 |     "    df[new_df.columns] = new_df\n",
 48 |     "    df['name'] = df['name'].astype('str')\n",
 49 |     "    df['year'] = df['year'].astype('int')\n",
 50 |     "    return df.groupby(['name', 'year']).mean().reset_index()\n"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 4,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "# Scrape play-type data tables from NBA.com webpages for the specified seasons. On every page the\n",
 60 |     "# TypeGrouping option named by `button` is selected and the pagination is set to \"All\" before parsing.\n",
 61 |     "def FetchPlayTypeTables(urls, years, button):\n",
 62 |     "    # Create a headless Firefox browser instance\n",
 63 |     "    opt = FirefoxOptions()\n",
 64 |     "    opt.add_argument(\"--headless\")\n",
 65 |     "    driver = webdriver.Firefox(options=opt)\n",
 66 |     "    \n",
 67 |     "    arr = []\n",
 68 |     "    try:  # the finally clause below quits the browser even when a wait or select raises\n",
 69 |     "        for i,url in enumerate(urls):\n",
 70 |     "            year = years[i]\n",
 71 |     "            print(\"Fetching play type data from Year\", year, \"...\")\n",
 72 |     "            \n",
 73 |     "            driver.get(url)\n",
 74 |     "            wait = WebDriverWait(driver, 30)\n",
 75 |     "            wait.until(EC.presence_of_element_located((By.XPATH, \"//select[@name='TypeGrouping']\")))\n",
 76 |     "            sel = Select(driver.find_element(By.NAME, 'TypeGrouping'))  # By-based lookup: find_element_by_* was removed in Selenium 4\n",
 77 |     "            sel.select_by_visible_text(button)\n",
 78 |     "            wait.until(EC.presence_of_element_located((By.XPATH, \"//select[contains(@class, 'stats-table-pagination__select')]\")))\n",
 79 |     "            sel = Select(driver.find_element(By.CLASS_NAME, 'stats-table-pagination__select'))\n",
 80 |     "            sel.select_by_visible_text(\"All\")\n",
 81 |     "            driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n",
 82 |     "            \n",
 83 |     "            root = lxml.html.fromstring(driver.page_source)\n",
 84 |     "            results = root.xpath(\"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")\n",
 85 |     "        \n",
 86 |     "            # Process the table text and break into columns, \n",
 87 |     "            # stripping extraneous newline characters and inserting the season year\n",
 88 |     "            counter = 0\n",
 89 |     "            for result in results:\n",
 90 |     "                item = result.xpath(\"./td//text()\")\n",
 91 |     "                item = [re.sub('\\n +', '', x) for x in item]\n",
 92 |     "                data = [x for x in item if x != '' and x != '\\n']\n",
 93 |     "                if len(data) != 17:\n",
 94 |     "                    continue\n",
 95 |     "                # Remove the cells at 0-based positions 2 and 1, leaving 15 of the 17 values\n",
 96 |     "                del data[-15]\n",
 97 |     "                del data[-15]\n",
 98 |     "                #% on indices 2, 7-14\n",
 99 |     "                data = [s.strip('%') for s in data]\n",
100 |     "                data.insert(1, int(year))\n",
101 |     "                arr.append(data)\n",
102 |     "                counter += 1\n",
103 |     "                \n",
104 |     "            print(\"Fetched stats for\", counter, \"NBA players.\")\n",
105 |     "    finally:\n",
106 |     "        driver.quit()\n",
107 |     "    return np.array(arr)"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 5,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "# Scrape play-type data tables from NBA.com webpages for the specified seasons, selecting the\n",
117 |     "# TypeGrouping option named by `button` and setting the pagination to \"All\" on every page.\n",
118 |     "# ADDITIONALLY: completely reset the web driver for each URL,\n",
119 |     "# which is needed for specific webpages.\n",
120 |     "def FetchPlayTypeTables_ResetPage(urls, years, button):\n",
121 |     "    # Create a headless Firefox browser instance\n",
122 |     "    opt = FirefoxOptions()\n",
123 |     "    opt.add_argument(\"--headless\")\n",
124 |     "    \n",
125 |     "    arr = []\n",
126 |     "    for i,url in enumerate(urls):\n",
127 |     "        driver = webdriver.Firefox(options=opt)\n",
128 |     "        try:  # the finally clause below quits this URL's browser even when a wait or select raises\n",
129 |     "            year = years[i]\n",
130 |     "            print(\"Fetching play type data from Year\", year, \"...\")\n",
131 |     "            \n",
132 |     "            driver.get(url)\n",
133 |     "            wait = WebDriverWait(driver, 30)\n",
134 |     "            wait.until(EC.presence_of_element_located((By.XPATH, \"//select[@name='TypeGrouping']\")))\n",
135 |     "            sel = Select(driver.find_element(By.NAME, 'TypeGrouping'))  # By-based lookup: find_element_by_* was removed in Selenium 4\n",
136 |     "            sel.select_by_visible_text(button)\n",
137 |     "            wait.until(EC.presence_of_element_located((By.XPATH, \"//select[contains(@class, 'stats-table-pagination__select')]\")))\n",
138 |     "            sel = Select(driver.find_element(By.CLASS_NAME, 'stats-table-pagination__select'))\n",
139 |     "            sel.select_by_visible_text(\"All\")\n",
140 |     "            driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n",
141 |     "            \n",
142 |     "            root = lxml.html.fromstring(driver.page_source)\n",
143 |     "            results = root.xpath(\"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")\n",
144 |     "        \n",
145 |     "            # Process the table text and break into columns, \n",
146 |     "            # stripping extraneous newline characters and inserting the season year\n",
147 |     "            counter = 0\n",
148 |     "            for result in results:\n",
149 |     "                item = result.xpath(\"./td//text()\")\n",
150 |     "                item = [re.sub('\\n +', '', x) for x in item]\n",
151 |     "                data = [x for x in item if x != '' and x != '\\n']\n",
152 |     "                if len(data) != 17:\n",
153 |     "                    continue\n",
154 |     "                # Remove the cells at 0-based positions 2 and 1, leaving 15 of the 17 values\n",
155 |     "                del data[-15]\n",
156 |     "                del data[-15]\n",
157 |     "                #% on indices 2, 7-14\n",
158 |     "                data = [s.strip('%') for s in data]\n",
159 |     "                data.insert(1, int(year))\n",
160 |     "                arr.append(data)\n",
161 |     "                counter += 1\n",
162 |     "                \n",
163 |     "            print(\"Fetched stats for\", counter, \"NBA players.\")\n",
164 |     "        finally:\n",
165 |     "            driver.quit()\n",
166 |     "    return np.array(arr)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "# Scrape Player Data for Transition Plays"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 6,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "#off_play_types = ['TR', 'CUT', 'PB', 'MISC', 'ISO', 'PRBH', 'PRRM', 'PU', 'SU', 'HO', 'OS']\n",
183 |     "#def_play_types = ['ISO', 'PRBH', 'PRRM', 'PU', 'SU', 'HO', 'OS']\n",
184 |     "cols = ['name', 'year'] + ['_POSS_', '_FREQ_', '_PPP_', '_PTS_', '_FGM_', '_FGA_', '_FGP_', '_EFGP_', '_FT_FREQ_', '_TO_FREQ_', '_SF_FREQ_', '_AND1_FREQ_', '_SCORE_FREQ_', '_PERC_']  # two id columns, then the stat columns\n",
185 |     "#for play_type in play_types:\n",
186 |     "#    off_cols = ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
187 |     "#    def_cols = ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
188 |     "#    print(off_cols, '\\n', def_cols)"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 7,
194 |    "metadata": {},
195 |    "outputs": [],
196 |    "source": [
197 |     "# Play-type seasons to fetch from NBA.com: ya/yb hold the two-digit labels 15..19 / 16..20, years the 4-digit form of yb\n",
198 |     "ya = [\"{:02d}\".format(n) for n in range(15, 20)]\n",
199 |     "yb = [\"{:02d}\".format(n + 1) for n in range(15, 20)]\n",
200 |     "years = [2000 + int(y) for y in yb]"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 8,
206 |    "metadata": {},
207 |    "outputs": [
208 |     {
209 |      "name": "stdout",
210 |      "output_type": "stream",
211 |      "text": [
212 |       "Fetching play type data from Year 2016 ...\n",
213 |       "Fetched stats for 396 NBA players.\n",
214 |       "Fetching play type data from Year 2017 ...\n",
215 |       "Fetched stats for 399 NBA players.\n",
216 |       "Fetching play type data from Year 2018 ...\n",
217 |       "Fetched stats for 423 NBA players.\n",
218 |       "Fetching play type data from Year 2019 ...\n",
219 |       "Fetched stats for 458 NBA players.\n",
220 |       "Fetching play type data from Year 2020 ...\n",
221 |       "Fetched stats for 347 NBA players.\n",
222 |       "               name  year  OFF_POSS_TR  OFF_FREQ_TR  OFF_PPP_TR  OFF_PTS_TR  \\\n",
223 |       "0      Aaron Brooks  2016          0.9         10.4        0.95         0.8   \n",
224 |       "1      Aaron Brooks  2017          0.6         10.2        0.72         0.4   \n",
225 |       "2      Aaron Gordon  2016          1.6         17.4        1.08         1.8   \n",
226 |       "3      Aaron Gordon  2017          2.6         19.9        1.03         2.7   \n",
227 |       "4      Aaron Gordon  2018          3.7         20.1        0.96         3.6   \n",
228 |       "...             ...   ...          ...          ...         ...         ...   \n",
229 |       "1892  Zach Randolph  2018          0.7          4.5        1.15         0.8   \n",
230 |       "1893  Zaza Pachulia  2016          0.4          4.1        1.17         0.4   \n",
231 |       "1894  Zaza Pachulia  2017          0.6          9.1        1.39         0.8   \n",
232 |       "1895  Zaza Pachulia  2018          0.7         12.6        1.17         0.8   \n",
233 |       "1896  Zaza Pachulia  2019          0.2          5.2        1.63         0.4   \n",
234 |       "\n",
235 |       "      OFF_FGM_TR  OFF_FGA_TR  OFF_FGP_TR  OFF_EFGP_TR  OFF_FT_FREQ_TR  \\\n",
236 |       "0            0.4         0.7        39.2         52.0             3.3   \n",
237 |       "1            0.3         0.4        33.3         46.3             5.1   \n",
238 |       "2            0.5         1.1        55.8         58.1            24.4   \n",
239 |       "3            0.9         1.9        53.6         56.5            17.2   \n",
240 |       "4            1.5         2.8        47.3         52.7            12.1   \n",
241 |       "...          ...         ...         ...          ...             ...   \n",
242 |       "1892         0.3         0.6        55.6         65.3             2.4   \n",
243 |       "1893         0.1         0.3        45.0         45.0            31.0   \n",
244 |       "1894         0.2         0.5        67.6         67.6            14.6   \n",
245 |       "1895         0.2         0.5        61.8         61.8            18.8   \n",
246 |       "1896         0.0         0.2        76.9         76.9            25.0   \n",
247 |       "\n",
248 |       "      OFF_TO_FREQ_TR  OFF_SF_FREQ_TR  OFF_AND1_FREQ_TR  OFF_SCORE_FREQ_TR  \\\n",
249 |       "0               11.7             3.3               0.0               36.7   \n",
250 |       "1               25.6             5.1               0.0               28.2   \n",
251 |       "2               12.6            21.3               4.7               54.3   \n",
252 |       "3               12.4            16.7               2.9               51.7   \n",
253 |       "4               13.6            10.7               3.3               44.4   \n",
254 |       "...              ...             ...               ...                ...   \n",
255 |       "1892            12.2             2.4               2.4               48.8   \n",
256 |       "1893             0.0            31.0               0.0               58.6   \n",
257 |       "1894             2.4            14.6               0.0               70.7   \n",
258 |       "1895            14.6            18.8               4.2               58.3   \n",
259 |       "1896             0.0            25.0               6.3               81.3   \n",
260 |       "\n",
261 |       "      OFF_PERC_TR  \n",
262 |       "0            20.3  \n",
263 |       "1             4.2  \n",
264 |       "2            43.9  \n",
265 |       "3            37.1  \n",
266 |       "4            23.9  \n",
267 |       "...           ...  \n",
268 |       "1892         58.1  \n",
269 |       "1893         64.4  \n",
270 |       "1894         92.5  \n",
271 |       "1895         62.4  \n",
272 |       "1896         99.0  \n",
273 |       "\n",
274 |       "[1897 rows x 16 columns]\n"
275 |      ]
276 |     }
277 |    ],
278 |    "source": [
279 |     "# Scrape transition play-type data, fetch the data in 2D array format,\n",
280 |     "# and convert to a dataframe format with the appropriate data types\n",
281 |     "play_type = \"TR\"\n",
282 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
283 |     "#urls = [ \"https://stats.nba.com/players/transition/?SeasonYear=2018-19&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\" ]\n",
284 |     "urls = [ \"https://stats.nba.com/players/transition/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
285 |     "#print(urls)\n",
286 |     "np_arr_trans = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
287 |     "df_trans = pd.DataFrame(np_arr_trans, columns=off_cols)\n",
288 |     "df_trans = ConvertDataFrame(df_trans)\n",
289 |     "#print(df_trans.dtypes)\n",
290 |     "print(df_trans)\n"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "metadata": {},
296 |    "source": [
297 |     "# Scrape Player Data for Isolation Plays"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 9,
303 |    "metadata": {},
304 |    "outputs": [
305 |     {
306 |      "name": "stdout",
307 |      "output_type": "stream",
308 |      "text": [
309 |       "Fetching play type data from Year 2016 ...\n",
310 |       "Fetched stats for 287 NBA players.\n",
311 |       "Fetching play type data from Year 2017 ...\n",
312 |       "Fetched stats for 288 NBA players.\n",
313 |       "Fetching play type data from Year 2018 ...\n",
314 |       "Fetched stats for 277 NBA players.\n",
315 |       "Fetching play type data from Year 2019 ...\n",
316 |       "Fetched stats for 287 NBA players.\n",
317 |       "Fetching play type data from Year 2020 ...\n",
318 |       "Fetched stats for 165 NBA players.\n",
319 |       "Fetching play type data from Year 2016 ...\n",
320 |       "Fetched stats for 380 NBA players.\n",
321 |       "Fetching play type data from Year 2017 ...\n",
322 |       "Fetched stats for 387 NBA players.\n",
323 |       "Fetching play type data from Year 2018 ...\n",
324 |       "Fetched stats for 383 NBA players.\n",
325 |       "Fetching play type data from Year 2019 ...\n",
326 |       "Fetched stats for 411 NBA players.\n",
327 |       "Fetching play type data from Year 2020 ...\n",
328 |       "Fetched stats for 304 NBA players.\n"
329 |      ]
330 |     }
331 |    ],
332 |    "source": [
333 |     "# Scrape isolation play-type data, fetch the data in 2D array format,\n",
334 |     "# and convert to a dataframe format with the appropriate data types\n",
335 |     "play_type = \"ISO\"\n",
336 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
337 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
338 |     "urls = [ \"https://stats.nba.com/players/isolation/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
339 |     "\n",
340 |     "np_arr_iso = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
341 |     "df_iso = pd.DataFrame(np_arr_iso, columns=off_cols)\n",
342 |     "df_iso = ConvertDataFrame(df_iso)\n",
343 |     "\n",
344 |     "np_arr_iso = FetchPlayTypeTables(urls, years, \"Defensive\")\n",
345 |     "df_iso2 = pd.DataFrame(np_arr_iso, columns=def_cols)\n",
346 |     "df_iso2 = ConvertDataFrame(df_iso2)\n"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "markdown",
351 |    "metadata": {},
352 |    "source": [
353 |     "# Scrape Player Data for Pick-and-Roll Plays"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 10,
359 |    "metadata": {
360 |     "scrolled": false
361 |    },
362 |    "outputs": [
363 |     {
364 |      "name": "stdout",
365 |      "output_type": "stream",
366 |      "text": [
367 |       "Fetching play type data from Year 2016 ...\n",
368 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2015-16&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
369 |       "Fetched stats for 272 NBA players.\n",
370 |       "Fetching play type data from Year 2017 ...\n",
371 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2016-17&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
372 |       "Fetched stats for 265 NBA players.\n",
373 |       "Fetching play type data from Year 2018 ...\n",
374 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2017-18&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
375 |       "Fetched stats for 302 NBA players.\n",
376 |       "Fetching play type data from Year 2019 ...\n",
377 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2018-19&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
378 |       "Fetched stats for 308 NBA players.\n",
379 |       "Fetching play type data from Year 2020 ...\n",
380 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
381 |       "Fetched stats for 241 NBA players.\n",
382 |       "Fetching play type data from Year 2016 ...\n",
383 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2015-16&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
384 |       "Fetched stats for 312 NBA players.\n",
385 |       "Fetching play type data from Year 2017 ...\n",
386 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2016-17&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
387 |       "Fetched stats for 327 NBA players.\n",
388 |       "Fetching play type data from Year 2018 ...\n",
389 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2017-18&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
390 |       "Fetched stats for 367 NBA players.\n",
391 |       "Fetching play type data from Year 2019 ...\n",
392 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2018-19&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
393 |       "Fetched stats for 395 NBA players.\n",
394 |       "Fetching play type data from Year 2020 ...\n",
395 |       "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n",
396 |       "Fetched stats for 302 NBA players.\n"
397 |      ]
398 |     }
399 |    ],
400 |    "source": [
401 |     "# Scrape pick-and-roll ball handler play-type data, fetch the data in 2D array format,\n",
402 |     "# and convert to a dataframe format with the appropriate data types\n",
403 |     "play_type = \"PRBH\"\n",
404 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
405 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
406 |     "urls = [ \"https://stats.nba.com/players/ball-handler/#!?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
407 |     "#print(urls)\n",
408 |     "\n",
409 |     "np_arr_prbh = FetchPlayTypeTables_ResetPage(urls, years, \"Offensive\")\n",
410 |     "df_prbh = pd.DataFrame(np_arr_prbh, columns=off_cols)\n",
411 |     "df_prbh = ConvertDataFrame(df_prbh)\n",
412 |     "\n",
413 |     "np_arr_prbh = FetchPlayTypeTables_ResetPage(urls, years, \"Defensive\")\n",
414 |     "df_prbh2 = pd.DataFrame(np_arr_prbh, columns=def_cols)\n",
415 |     "df_prbh2 = ConvertDataFrame(df_prbh2)\n"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 11,
421 |    "metadata": {},
422 |    "outputs": [
423 |     {
424 |      "name": "stdout",
425 |      "output_type": "stream",
426 |      "text": [
427 |       "Fetching play type data from Year 2016 ...\n",
428 |       "Fetched stats for 202 NBA players.\n",
429 |       "Fetching play type data from Year 2017 ...\n",
430 |       "Fetched stats for 196 NBA players.\n",
431 |       "Fetching play type data from Year 2018 ...\n",
432 |       "Fetched stats for 190 NBA players.\n",
433 |       "Fetching play type data from Year 2019 ...\n",
434 |       "Fetched stats for 201 NBA players.\n",
435 |       "Fetching play type data from Year 2020 ...\n",
436 |       "Fetched stats for 157 NBA players.\n",
437 |       "Fetching play type data from Year 2016 ...\n",
438 |       "Fetched stats for 272 NBA players.\n",
439 |       "Fetching play type data from Year 2017 ...\n",
440 |       "Fetched stats for 282 NBA players.\n",
441 |       "Fetching play type data from Year 2018 ...\n",
442 |       "Fetched stats for 293 NBA players.\n",
443 |       "Fetching play type data from Year 2019 ...\n",
444 |       "Fetched stats for 268 NBA players.\n",
445 |       "Fetching play type data from Year 2020 ...\n",
446 |       "Fetched stats for 161 NBA players.\n"
447 |      ]
448 |     }
449 |    ],
450 |    "source": [
451 |     "# Scrape pick-and-roll roll man play-type data, fetch the data in 2D array format,\n",
452 |     "# and convert to a dataframe format with the appropriate data types\n",
453 |     "play_type = \"PRRM\"\n",
454 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
455 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
456 |     "urls = [ \"https://stats.nba.com/players/roll-man/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
457 |     "\n",
458 |     "np_arr_prrm = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
459 |     "df_prrm = pd.DataFrame(np_arr_prrm, columns=off_cols)\n",
460 |     "df_prrm = ConvertDataFrame(df_prrm)\n",
461 |     "\n",
462 |     "np_arr_prrm = FetchPlayTypeTables(urls, years, \"Defensive\")\n",
463 |     "df_prrm2 = pd.DataFrame(np_arr_prrm, columns=def_cols)\n",
464 |     "df_prrm2 = ConvertDataFrame(df_prrm2)\n"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "markdown",
469 |    "metadata": {},
470 |    "source": [
471 |     "# Scrape Player Data for Post-Up Plays"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "code",
476 |    "execution_count": 12,
477 |    "metadata": {},
478 |    "outputs": [
479 |     {
480 |      "name": "stdout",
481 |      "output_type": "stream",
482 |      "text": [
483 |       "Fetching play type data from Year 2016 ...\n",
484 |       "Fetched stats for 216 NBA players.\n",
485 |       "Fetching play type data from Year 2017 ...\n",
486 |       "Fetched stats for 184 NBA players.\n",
487 |       "Fetching play type data from Year 2018 ...\n",
488 |       "Fetched stats for 192 NBA players.\n",
489 |       "Fetching play type data from Year 2019 ...\n",
490 |       "Fetched stats for 195 NBA players.\n",
491 |       "Fetching play type data from Year 2020 ...\n",
492 |       "Fetched stats for 113 NBA players.\n",
493 |       "Fetching play type data from Year 2016 ...\n",
494 |       "Fetched stats for 374 NBA players.\n",
495 |       "Fetching play type data from Year 2017 ...\n",
496 |       "Fetched stats for 358 NBA players.\n",
497 |       "Fetching play type data from Year 2018 ...\n",
498 |       "Fetched stats for 371 NBA players.\n",
499 |       "Fetching play type data from Year 2019 ...\n",
500 |       "Fetched stats for 387 NBA players.\n",
501 |       "Fetching play type data from Year 2020 ...\n",
502 |       "Fetched stats for 250 NBA players.\n"
503 |      ]
504 |     }
505 |    ],
506 |    "source": [
507 |     "# Scrape post up play-type data, fetch the data in 2D array format,\n",
508 |     "# and convert to a dataframe format with the appropriate data types\n",
509 |     "play_type = \"PU\"\n",
510 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
511 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
512 |     "urls = [ \"https://stats.nba.com/players/playtype-post-up/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
513 |     "\n",
514 |     "np_arr_pu = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
515 |     "df_pu = pd.DataFrame(np_arr_pu, columns=off_cols)\n",
516 |     "df_pu = ConvertDataFrame(df_pu)\n",
517 |     "\n",
518 |     "np_arr_pu = FetchPlayTypeTables(urls, years, \"Defensive\")\n",
519 |     "df_pu2 = pd.DataFrame(np_arr_pu, columns=def_cols)\n",
520 |     "df_pu2 = ConvertDataFrame(df_pu2)\n"
521 |    ]
522 |   },
523 |   {
524 |    "cell_type": "markdown",
525 |    "metadata": {},
526 |    "source": [
527 |     "# Scrape Player Data for Spot-Up Plays"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": 13,
533 |    "metadata": {},
534 |    "outputs": [
535 |     {
536 |      "name": "stdout",
537 |      "output_type": "stream",
538 |      "text": [
539 |       "Fetching play type data from Year 2016 ...\n",
540 |       "Fetched stats for 392 NBA players.\n",
541 |       "Fetching play type data from Year 2017 ...\n",
542 |       "Fetched stats for 396 NBA players.\n",
543 |       "Fetching play type data from Year 2018 ...\n",
544 |       "Fetched stats for 428 NBA players.\n",
545 |       "Fetching play type data from Year 2019 ...\n",
546 |       "Fetched stats for 458 NBA players.\n",
547 |       "Fetching play type data from Year 2020 ...\n",
548 |       "Fetched stats for 359 NBA players.\n",
549 |       "Fetching play type data from Year 2016 ...\n",
550 |       "Fetched stats for 417 NBA players.\n",
551 |       "Fetching play type data from Year 2017 ...\n",
552 |       "Fetched stats for 413 NBA players.\n",
553 |       "Fetching play type data from Year 2018 ...\n",
554 |       "Fetched stats for 438 NBA players.\n",
555 |       "Fetching play type data from Year 2019 ...\n",
556 |       "Fetched stats for 477 NBA players.\n",
557 |       "Fetching play type data from Year 2020 ...\n",
558 |       "Fetched stats for 382 NBA players.\n"
559 |      ]
560 |     }
561 |    ],
562 |    "source": [
563 |     "# Scrape spot up play-type data, fetch the data in 2D array format,\n",
564 |     "# and convert to a dataframe format with the appropriate data types\n",
565 |     "play_type = \"SU\"\n",
566 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
567 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
568 |     "urls = [ \"https://stats.nba.com/players/spot-up/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
569 |     "\n",
570 |     "np_arr_su = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
571 |     "df_su = pd.DataFrame(np_arr_su, columns=off_cols)\n",
572 |     "df_su = ConvertDataFrame(df_su)\n",
573 |     "\n",
574 |     "np_arr_su = FetchPlayTypeTables(urls, years, \"Defensive\")\n",
575 |     "df_su2 = pd.DataFrame(np_arr_su, columns=def_cols)\n",
576 |     "df_su2 = ConvertDataFrame(df_su2)\n"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "markdown",
581 |    "metadata": {},
582 |    "source": [
583 |     "# Scrape Player Data for Hand-Off Plays"
584 |    ]
585 |   },
586 |   {
587 |    "cell_type": "code",
588 |    "execution_count": 14,
589 |    "metadata": {},
590 |    "outputs": [
591 |     {
592 |      "name": "stdout",
593 |      "output_type": "stream",
594 |      "text": [
595 |       "Fetching play type data from Year 2016 ...\n",
596 |       "Fetched stats for 233 NBA players.\n",
597 |       "Fetching play type data from Year 2017 ...\n",
598 |       "Fetched stats for 235 NBA players.\n",
599 |       "Fetching play type data from Year 2018 ...\n",
600 |       "Fetched stats for 256 NBA players.\n",
601 |       "Fetching play type data from Year 2019 ...\n",
602 |       "Fetched stats for 280 NBA players.\n",
603 |       "Fetching play type data from Year 2020 ...\n",
604 |       "Fetched stats for 198 NBA players.\n",
605 |       "Fetching play type data from Year 2016 ...\n",
606 |       "Fetched stats for 269 NBA players.\n",
607 |       "Fetching play type data from Year 2017 ...\n",
608 |       "Fetched stats for 279 NBA players.\n",
609 |       "Fetching play type data from Year 2018 ...\n",
610 |       "Fetched stats for 302 NBA players.\n",
611 |       "Fetching play type data from Year 2019 ...\n",
612 |       "Fetched stats for 340 NBA players.\n",
613 |       "Fetching play type data from Year 2020 ...\n",
614 |       "Fetched stats for 239 NBA players.\n"
615 |      ]
616 |     }
617 |    ],
618 |    "source": [
619 |     "# Scrape hand off play-type data, fetch the data in 2D array format,\n",
620 |     "# and convert to a dataframe format with the appropriate data types\n",
621 |     "play_type = \"HO\"\n",
622 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
623 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
624 |     "urls = [ \"https://stats.nba.com/players/hand-off/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
625 |     "\n",
626 |     "np_arr_ho = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
627 |     "df_ho = pd.DataFrame(np_arr_ho, columns=off_cols)\n",
628 |     "df_ho = ConvertDataFrame(df_ho)\n",
629 |     "\n",
630 |     "np_arr_ho = FetchPlayTypeTables(urls, years, \"Defensive\")\n",
631 |     "df_ho2 = pd.DataFrame(np_arr_ho, columns=def_cols)\n",
632 |     "df_ho2 = ConvertDataFrame(df_ho2)\n"
633 |    ]
634 |   },
635 |   {
636 |    "cell_type": "markdown",
637 |    "metadata": {},
638 |    "source": [
639 |     "# Scrape Player Data for Cutting Plays"
640 |    ]
641 |   },
642 |   {
643 |    "cell_type": "code",
644 |    "execution_count": 15,
645 |    "metadata": {},
646 |    "outputs": [
647 |     {
648 |      "name": "stdout",
649 |      "output_type": "stream",
650 |      "text": [
651 |       "Fetching play type data from Year 2016 ...\n",
652 |       "Fetched stats for 334 NBA players.\n",
653 |       "Fetching play type data from Year 2017 ...\n",
654 |       "Fetched stats for 334 NBA players.\n",
655 |       "Fetching play type data from Year 2018 ...\n",
656 |       "Fetched stats for 340 NBA players.\n",
657 |       "Fetching play type data from Year 2019 ...\n",
658 |       "Fetched stats for 360 NBA players.\n",
659 |       "Fetching play type data from Year 2020 ...\n",
660 |       "Fetched stats for 245 NBA players.\n"
661 |      ]
662 |     }
663 |    ],
664 |    "source": [
665 |     "# Scrape cutting play-type data, fetch the data in 2D array format,\n",
666 |     "# and convert to a dataframe format with the appropriate data types\n",
667 |     "play_type = \"CUT\"\n",
668 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
669 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
670 |     "urls = [ \"https://stats.nba.com/players/cut/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
671 |     "\n",
672 |     "np_arr_cut = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
673 |     "df_cut = pd.DataFrame(np_arr_cut, columns=off_cols)\n",
674 |     "df_cut = ConvertDataFrame(df_cut)\n"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "markdown",
679 |    "metadata": {},
680 |    "source": [
681 |     "# Scrape Player Data for Off-Screen Plays"
682 |    ]
683 |   },
684 |   {
685 |    "cell_type": "code",
686 |    "execution_count": 18,
687 |    "metadata": {},
688 |    "outputs": [
689 |     {
690 |      "name": "stdout",
691 |      "output_type": "stream",
692 |      "text": [
693 |       "Fetching play type data from Year 2016 ...\n",
694 |       "Fetched stats for 238 NBA players.\n",
695 |       "Fetching play type data from Year 2017 ...\n",
696 |       "Fetched stats for 246 NBA players.\n",
697 |       "Fetching play type data from Year 2018 ...\n",
698 |       "Fetched stats for 247 NBA players.\n",
699 |       "Fetching play type data from Year 2019 ...\n",
700 |       "Fetched stats for 229 NBA players.\n",
701 |       "Fetching play type data from Year 2020 ...\n",
702 |       "Fetched stats for 172 NBA players.\n",
703 |       "Fetching play type data from Year 2016 ...\n",
704 |       "Fetched stats for 331 NBA players.\n",
705 |       "Fetching play type data from Year 2017 ...\n",
706 |       "Fetched stats for 336 NBA players.\n",
707 |       "Fetching play type data from Year 2018 ...\n",
708 |       "Fetched stats for 339 NBA players.\n",
709 |       "Fetching play type data from Year 2019 ...\n",
710 |       "Fetched stats for 351 NBA players.\n",
711 |       "Fetching play type data from Year 2020 ...\n",
712 |       "Fetched stats for 239 NBA players.\n"
713 |      ]
714 |     }
715 |    ],
716 |    "source": [
717 |     "# Scrape off screen play-type data, fetch the data in 2D array format,\n",
718 |     "# and convert to a dataframe format with the appropriate data types\n",
719 |     "play_type = \"OS\"\n",
720 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
721 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
722 |     "urls = [ \"https://stats.nba.com/players/off-screen/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
723 |     "\n",
724 |     "np_arr_os = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
725 |     "df_os = pd.DataFrame(np_arr_os, columns=off_cols)\n",
726 |     "df_os = ConvertDataFrame(df_os)\n",
727 |     "\n",
728 |     "np_arr_os = FetchPlayTypeTables(urls, years, \"Defensive\")\n",
729 |     "df_os2 = pd.DataFrame(np_arr_os, columns=def_cols)\n",
730 |     "df_os2 = ConvertDataFrame(df_os2)\n"
731 |    ]
732 |   },
733 |   {
734 |    "cell_type": "markdown",
735 |    "metadata": {},
736 |    "source": [
737 |     "# Scrape Player Data for Put-Back Plays"
738 |    ]
739 |   },
740 |   {
741 |    "cell_type": "code",
742 |    "execution_count": 20,
743 |    "metadata": {},
744 |    "outputs": [
745 |     {
746 |      "name": "stdout",
747 |      "output_type": "stream",
748 |      "text": [
749 |       "Fetching play type data from Year 2016 ...\n",
750 |       "Fetched stats for 287 NBA players.\n",
751 |       "Fetching play type data from Year 2017 ...\n",
752 |       "Fetched stats for 280 NBA players.\n",
753 |       "Fetching play type data from Year 2018 ...\n",
754 |       "Fetched stats for 282 NBA players.\n",
755 |       "Fetching play type data from Year 2019 ...\n",
756 |       "Fetched stats for 309 NBA players.\n",
757 |       "Fetching play type data from Year 2020 ...\n",
758 |       "Fetched stats for 202 NBA players.\n"
759 |      ]
760 |     }
761 |    ],
762 |    "source": [
763 |     "# Scrape put-back play-type data, fetch the data in 2D array format,\n",
764 |     "# and convert to a dataframe format with the appropriate data types\n",
765 |     "play_type = \"PB\"\n",
766 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
767 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
768 |     "urls = [ \"https://stats.nba.com/players/putbacks/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
769 |     "\n",
770 |     "np_arr_pb = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
771 |     "df_pb = pd.DataFrame(np_arr_pb, columns=off_cols)\n",
772 |     "df_pb = ConvertDataFrame(df_pb)\n"
773 |    ]
774 |   },
775 |   {
776 |    "cell_type": "markdown",
777 |    "metadata": {},
778 |    "source": [
779 |     "# Scrape Player Data for Miscellaneous Play-Types"
780 |    ]
781 |   },
782 |   {
783 |    "cell_type": "code",
784 |    "execution_count": 21,
785 |    "metadata": {},
786 |    "outputs": [
787 |     {
788 |      "name": "stdout",
789 |      "output_type": "stream",
790 |      "text": [
791 |       "Fetching play type data from Year 2016 ...\n",
792 |       "Fetched stats for 366 NBA players.\n",
793 |       "Fetching play type data from Year 2017 ...\n",
794 |       "Fetched stats for 362 NBA players.\n",
795 |       "Fetching play type data from Year 2018 ...\n",
796 |       "Fetched stats for 361 NBA players.\n",
797 |       "Fetching play type data from Year 2019 ...\n",
798 |       "Fetched stats for 377 NBA players.\n",
799 |       "Fetching play type data from Year 2020 ...\n",
800 |       "Fetched stats for 284 NBA players.\n"
801 |      ]
802 |     }
803 |    ],
804 |    "source": [
805 |     "# Scrape miscellaneous play-type data, fetch the data in 2D array format,\n",
806 |     "# and convert to a dataframe format with the appropriate data types\n",
807 |     "play_type = \"MISC\"\n",
808 |     "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
809 |     "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n",
810 |     "urls = [ \"https://stats.nba.com/players/playtype-misc/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
811 |     "\n",
812 |     "np_arr_misc = FetchPlayTypeTables(urls, years, \"Offensive\")\n",
813 |     "df_misc = pd.DataFrame(np_arr_misc, columns=off_cols)\n",
814 |     "df_misc = ConvertDataFrame(df_misc)\n"
815 |    ]
816 |   },
817 |   {
818 |    "cell_type": "markdown",
819 |    "metadata": {},
820 |    "source": [
821 |     "# Merge and Store Dataframes"
822 |    ]
823 |   },
824 |   {
825 |    "cell_type": "code",
826 |    "execution_count": 22,
827 |    "metadata": {},
828 |    "outputs": [],
829 |    "source": [
830 |     "# Build the complete dataframe with outer merges on (name, year), so that players\n",
831 |     "# who appear in only some of the play-type tables are not dropped\n",
832 |     "df = pd.merge(df_trans, df_iso, on=['name', 'year'], how='outer')\n",
833 |     "df = pd.merge(df, df_iso2, on=['name', 'year'], how='outer')\n",
834 |     "df = pd.merge(df, df_prbh, on=['name', 'year'], how='outer')\n",
835 |     "df = pd.merge(df, df_prbh2, on=['name', 'year'], how='outer')\n",
836 |     "df = pd.merge(df, df_prrm, on=['name', 'year'], how='outer')\n",
837 |     "df = pd.merge(df, df_prrm2, on=['name', 'year'], how='outer')\n",
838 |     "df = pd.merge(df, df_pu, on=['name', 'year'], how='outer')\n",
839 |     "df = pd.merge(df, df_pu2, on=['name', 'year'], how='outer')\n",
840 |     "df = pd.merge(df, df_su, on=['name', 'year'], how='outer')\n",
841 |     "df = pd.merge(df, df_su2, on=['name', 'year'], how='outer')\n",
842 |     "df = pd.merge(df, df_ho, on=['name', 'year'], how='outer')\n",
843 |     "df = pd.merge(df, df_ho2, on=['name', 'year'], how='outer')\n",
844 |     "df = pd.merge(df, df_cut, on=['name', 'year'], how='outer')\n",
845 |     "df = pd.merge(df, df_os, on=['name', 'year'], how='outer')\n",
846 |     "df = pd.merge(df, df_os2, on=['name', 'year'], how='outer')\n",
847 |     "df = pd.merge(df, df_pb, on=['name', 'year'], how='outer')\n",
848 |     "df = pd.merge(df, df_misc, on=['name', 'year'], how='outer')\n",
849 |     "\n",
850 |     "# Scale percentage-valued columns (FGP, 3PP, FTP, PERC, PCT, FREQ) from 0-100 to the range 0-1 (for convenience)\n",
851 |     "perc_cols = [col for col in df.columns if 'FGP' in col or '3PP' in col or 'FTP' in col or 'PERC' in col or 'PCT' in col or 'FREQ' in col]\n",
852 |     "df[perc_cols] = df[perc_cols].astype(float)/100.\n"
853 |    ]
854 |   },
855 |   {
856 |    "cell_type": "code",
857 |    "execution_count": 23,
858 |    "metadata": {},
859 |    "outputs": [
860 |     {
861 |      "name": "stdout",
862 |      "output_type": "stream",
863 |      "text": [
864 |       "                  name  year  OFF_POSS_TR  OFF_FREQ_TR  OFF_PPP_TR  \\\n",
865 |       "0         Aaron Brooks  2016          0.9        0.104        0.95   \n",
866 |       "1         Aaron Brooks  2017          0.6        0.102        0.72   \n",
867 |       "2         Aaron Gordon  2016          1.6        0.174        1.08   \n",
868 |       "3         Aaron Gordon  2017          2.6        0.199        1.03   \n",
869 |       "4         Aaron Gordon  2018          3.7        0.201        0.96   \n",
870 |       "...                ...   ...          ...          ...         ...   \n",
871 |       "2028    Devin Robinson  2019          0.0        0.000        0.00   \n",
872 |       "2029  Jared Cunningham  2016          0.0        0.000        0.00   \n",
873 |       "2030     Jarrod Uthoff  2017          0.0        0.000        0.00   \n",
874 |       "2031    Alex Stepheson  2016          0.0        0.000        0.00   \n",
875 |       "2032       Jack Cooley  2018          0.0        0.000        0.00   \n",
876 |       "\n",
877 |       "      OFF_PTS_TR  OFF_FGM_TR  OFF_FGA_TR  OFF_FGP_TR  OFF_EFGP_TR  ...  \\\n",
878 |       "0            0.8         0.4         0.7       0.392        0.520  ...   \n",
879 |       "1            0.4         0.3         0.4       0.333        0.463  ...   \n",
880 |       "2            1.8         0.5         1.1       0.558        0.581  ...   \n",
881 |       "3            2.7         0.9         1.9       0.536        0.565  ...   \n",
882 |       "4            3.6         1.5         2.8       0.473        0.527  ...   \n",
883 |       "...          ...         ...         ...         ...          ...  ...   \n",
884 |       "2028         0.0         0.0         0.0       0.000        0.000  ...   \n",
885 |       "2029         0.0         0.0         0.0       0.000        0.000  ...   \n",
886 |       "2030         0.0         0.0         0.0       0.000        0.000  ...   \n",
887 |       "2031         0.0         0.0         0.0       0.000        0.000  ...   \n",
888 |       "2032         0.0         0.0         0.0       0.000        0.000  ...   \n",
889 |       "\n",
890 |       "      OFF_FGM_MISC  OFF_FGA_MISC  OFF_FGP_MISC  OFF_EFGP_MISC  \\\n",
891 |       "0              0.1           0.2         0.167          0.167   \n",
892 |       "1              0.1           0.2         0.180          0.227   \n",
893 |       "2              0.2           0.3         0.400          0.425   \n",
894 |       "3              0.1           0.2         0.440          0.469   \n",
895 |       "4              0.2           0.3         0.375          0.438   \n",
896 |       "...            ...           ...           ...            ...   \n",
897 |       "2028           0.0           0.0         0.000          0.000   \n",
898 |       "2029           0.0           0.0         0.000          0.000   \n",
899 |       "2030           0.0           0.0         0.000          0.000   \n",
900 |       "2031           0.0           0.0         0.000          0.000   \n",
901 |       "2032           0.0           0.0         0.000          0.000   \n",
902 |       "\n",
903 |       "      OFF_FT_FREQ_MISC  OFF_TO_FREQ_MISC  OFF_SF_FREQ_MISC  \\\n",
904 |       "0                0.207             0.379             0.034   \n",
905 |       "1                0.095             0.381             0.000   \n",
906 |       "2                0.283             0.358             0.057   \n",
907 |       "3                0.195             0.415             0.049   \n",
908 |       "4                0.195             0.488             0.098   \n",
909 |       "...                ...               ...               ...   \n",
910 |       "2028             0.000             0.000             0.000   \n",
911 |       "2029             0.000             0.000             0.000   \n",
912 |       "2030             0.000             0.000             0.000   \n",
913 |       "2031             0.000             0.000             0.000   \n",
914 |       "2032             0.000             0.000             0.000   \n",
915 |       "\n",
916 |       "      OFF_AND1_FREQ_MISC  OFF_SCORE_FREQ_MISC  OFF_PERC_MISC  \n",
917 |       "0                   0.00                0.276          0.311  \n",
918 |       "1                   0.00                0.190          0.382  \n",
919 |       "2                   0.02                0.377          0.729  \n",
920 |       "3                   0.00                0.366          0.755  \n",
921 |       "4                   0.07                0.268          0.547  \n",
922 |       "...                  ...                  ...            ...  \n",
923 |       "2028                0.00                0.000          0.000  \n",
924 |       "2029                0.00                0.000          0.000  \n",
925 |       "2030                0.00                0.000          0.000  \n",
926 |       "2031                0.00                0.000          0.000  \n",
927 |       "2032                0.00                0.000          0.000  \n",
928 |       "\n",
929 |       "[2033 rows x 254 columns]\n"
930 |      ]
931 |     }
932 |    ],
933 |    "source": [
934 |     "# Fill all NaN values with 0, which is reasonable for play-type missing values\n",
935 |     "df = df.fillna(0)\n",
936 |     "print(df)"
937 |    ]
938 |   },
939 |   {
940 |    "cell_type": "code",
941 |    "execution_count": 24,
942 |    "metadata": {},
943 |    "outputs": [],
944 |    "source": [
945 |     "# Write overall play-type dataframe to a .csv file\n",
946 |     "df.to_csv(\"NBAPlayTypeStats.csv\")"
947 |    ]
948 |   },
949 |   {
950 |    "cell_type": "code",
951 |    "execution_count": null,
952 |    "metadata": {},
953 |    "outputs": [],
954 |    "source": []
955 |   }
956 |  ],
957 |  "metadata": {
958 |   "kernelspec": {
959 |    "display_name": "Python 3",
960 |    "language": "python",
961 |    "name": "python3"
962 |   },
963 |   "language_info": {
964 |    "codemirror_mode": {
965 |     "name": "ipython",
966 |     "version": 3
967 |    },
968 |    "file_extension": ".py",
969 |    "mimetype": "text/x-python",
970 |    "name": "python",
971 |    "nbconvert_exporter": "python",
972 |    "pygments_lexer": "ipython3",
973 |    "version": "3.8.2"
974 |   },
975 |   "toc": {
976 |    "base_numbering": 1,
977 |    "nav_menu": {},
978 |    "number_sections": true,
979 |    "sideBar": false,
980 |    "skip_h1_title": false,
981 |    "title_cell": "Table of Contents",
982 |    "title_sidebar": "Contents",
983 |    "toc_cell": true,
984 |    "toc_position": {},
985 |    "toc_section_display": true,
986 |    "toc_window_display": false
987 |   }
988 |  },
989 |  "nbformat": 4,
990 |  "nbformat_minor": 2
991 | }
992 | 


--------------------------------------------------------------------------------
/Data_Scraping/Scrape_NBA_PlayerBios_Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "toc": true
  7 |    },
  8 |    "source": [
  9 |     "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
 10 |     "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import numpy as np\n",
 20 |     "import urllib.request\n",
 21 |     "from selenium import webdriver\n",
 22 |     "from selenium.webdriver.support.ui import Select\n",
 23 |     "from selenium.webdriver.support import expected_conditions as EC\n",
 24 |     "from selenium.webdriver.common.by import By\n",
 25 |     "from selenium.webdriver.support.ui import WebDriverWait\n",
 26 |     "from selenium.common.exceptions import TimeoutException\n",
 27 |     "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n",
 28 |     "from selenium.webdriver.chrome.options import Options as ChromeOptions\n",
 29 |     "import lxml.html\n",
 30 |     "from lxml import etree\n",
 31 |     "import re\n",
 32 |     "import time\n",
 33 |     "import pandas as pd\n",
 34 |     "from functools import reduce\n",
 35 |     "from operator import itemgetter"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 56,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# Modify dataframe to have appropriate data types\n",
 45 |     "def ConvertDataFrame(df):\n",
 46 |     "    cols = df.columns.drop(['name', 'college', 'country', 'draft', 'nationality'])\n",
 47 |     "    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)\n",
 48 |     "    df['name'] = df['name'].astype('str')\n",
 49 |     "    df['college'] = df['college'].astype('str')\n",
 50 |     "    df['country'] = df['country'].astype('str')\n",
 51 |     "    df['draft'] = df['draft'].astype('str')\n",
 52 |     "    df['nationality'] = df['nationality'].astype('str')\n",
 53 |     "    df['actual_draft_year'] = df['actual_draft_year'].astype('float')\n",
 54 |     "    df['draft_round'] = df['draft_round'].astype('float')\n",
 55 |     "    df['draft_number'] = df['draft_number'].astype('float')\n",
 56 |     "    #df = df.groupby(['name', 'actual_draft_year']).mean().reset_index()\n",
 57 |     "    df = df.drop_duplicates(subset=['name', 'actual_draft_year'], keep='first')\n",
 58 |     "    return df\n"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 52,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "# Scrape player bio tables from NBA.com webpages, grabbing only\n",
 68 |     "# the specified columns (by index) and for the specified seasons\n",
 69 |     "def FetchStatsTables(urls, years, col_list):\n",
 70 |     "    # Create a headless Firefox browser instance\n",
 71 |     "    opt = FirefoxOptions()\n",
 72 |     "    opt.add_argument(\"--headless\")\n",
 73 |     "    driver = webdriver.Firefox(options=opt)\n",
 74 |     "    \n",
 75 |     "    arr = []\n",
 76 |     "    for i,url in enumerate(urls):\n",
 77 |     "        year = years[i]\n",
 78 |     "        print(\"Fetching player bios from the\", year, \"season...\")\n",
 79 |     "        \n",
 80 |     "        driver.get(url)\n",
 81 |     "        wait = WebDriverWait(driver, 30)\n",
 82 |     "        wait.until(EC.presence_of_element_located((By.XPATH, \"//select[contains(@class, 'stats-table-pagination__select')]\")))\n",
 83 |     "        sel = Select(driver.find_element_by_class_name('stats-table-pagination__select'))\n",
 84 |     "        sel.select_by_visible_text(\"All\")\n",
 85 |     "        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n",
 86 |     "        \n",
 87 |     "        retries = 1\n",
 88 |     "        while retries <= 3:\n",
 89 |     "            try:\n",
 90 |     "                wait.until(EC.presence_of_element_located((By.XPATH, \"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")))\n",
 91 |     "                break\n",
 92 |     "            except TimeoutException:\n",
 93 |     "                print('\\nRefreshing NBA bios page due to timeout (retry #', retries,')...')\n",
 94 |     "                driver.refresh()\n",
 95 |     "                time.sleep(1)\n",
 96 |     "                retries += 1\n",
 97 |     "\n",
 98 |     "        root = lxml.html.fromstring(driver.page_source)\n",
 99 |     "        results = root.xpath(\"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")\n",
100 |     "    \n",
101 |     "        # Process the table text and break into columns, \n",
102 |     "        # stripping extraneous newline characters and inserting the season year\n",
103 |     "        counter = 0\n",
104 |     "        for result in results:\n",
105 |     "            item = result.xpath(\"./td//text()\")\n",
106 |     "            item = [re.sub('\\n +', '', x) for x in item]\n",
107 |     "            data = [x for x in item if x != '' and x != '\\n']\n",
108 |     "            data = [s.strip('%') for s in data]\n",
109 |     "            if len(data) < col_list[-1]+1:\n",
110 |     "                continue\n",
111 |     "            elif len(data) > col_list[-1]+1:\n",
112 |     "                data[6:-13] = [' '.join(data[6:-13])]\n",
113 |     "            data = list(itemgetter(*col_list)(data))\n",
114 |     "            #print(data)\n",
115 |     "            \n",
116 |     "            if len(data[1].split('-')) == 2 and data[1].split('-')[0] != '':\n",
117 |     "                ft_in = data[1].split('-')\n",
118 |     "                data[1] = float(ft_in[0])*12. + float(ft_in[1])\n",
119 |     "                #print(ft_in, data[1])\n",
120 |     "            else:\n",
121 |     "                data[1] = ''\n",
122 |     "            \n",
123 |     "            if data[2] == ' ':\n",
124 |     "                data[2] = ''\n",
125 |     "\n",
126 |     "            if data[5].lower() == 'undrafted':\n",
127 |     "                data.append('undrafted')\n",
128 |     "                data[5] = ''\n",
129 |     "                data[6] = ''\n",
130 |     "                data[7] = ''\n",
131 |     "            else:\n",
132 |     "                data.append('drafted')\n",
133 |     "                \n",
134 |     "            if data[4] == 'USA' or data[4] == '':\n",
135 |     "                data.append('domestic')\n",
136 |     "            else:\n",
137 |     "                data.append('foreign')\n",
138 |     "            \n",
139 |     "            arr.append(data)\n",
140 |     "            counter += 1\n",
141 |     "            #print(data)\n",
142 |     "        \n",
143 |     "        print(\"Fetched bios for\", counter, \"NBA players.\")\n",
144 |     "        \n",
145 |     "    driver.quit()\n",
146 |     "    #print(arr)\n",
147 |     "    return np.array(arr)\n"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 53,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "# Establish the years for which we want to fetch player data\n",
157 |     "# (this range covers the 2000-01 through 2019-20 seasons)\n",
158 |     "ya = [str(n).zfill(2) for n in range(0, 20)]\n",
159 |     "yb = [str(n).zfill(2) for n in range(1, 21)]\n",
160 |     "years = [int(\"20\"+y) for y in yb]\n"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "# Create URLs for player bios on NBA.com, \n",
170 |     "# fetch the data in 2D array format, and put into a Pandas dataframe\n",
171 |     "#https://stats.nba.com/players/bio/?Season=2000-01&SeasonType=Regular%20Season\n",
172 |     "urls = [ \"https://stats.nba.com/players/bio/?Season=20{0}-{1}&SeasonType=Regular%20Season\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n",
173 |     "np_arr = FetchStatsTables(urls, years, [0, 3, 4, 5, 6, 7, 8, 9])\n",
174 |     "df = pd.DataFrame(np_arr, columns=['name', 'height', 'weight', 'college', 'country', 'actual_draft_year', 'draft_round', 'draft_number', 'draft', 'nationality'])\n"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 57,
180 |    "metadata": {
181 |     "scrolled": false
182 |    },
183 |    "outputs": [
184 |     {
185 |      "name": "stdout",
186 |      "output_type": "stream",
187 |      "text": [
188 |       "                  name  height  weight       college   country  \\\n",
189 |       "0           A.C. Green    81.0   225.0  Oregon State       USA   \n",
190 |       "1          A.J. Guyton    73.0   180.0       Indiana       USA   \n",
191 |       "2          Aaron McKie    77.0   209.0        Temple       USA   \n",
192 |       "3       Aaron Williams    81.0   225.0        Xavier       USA   \n",
193 |       "4           Adam Keefe    81.0   230.0      Stanford       USA   \n",
194 |       "...                ...     ...     ...           ...       ...   \n",
195 |       "9339   Vincent Poirier    84.0   235.0          None    France   \n",
196 |       "9340     Vlatko Cancar    80.0   236.0          None  Slovenia   \n",
197 |       "9343    Wenyen Gabriel    81.0   205.0          None     Sudan   \n",
198 |       "9354  Zach Norvell Jr.    77.0   205.0          None       USA   \n",
199 |       "9355    Zylan Cheatham    77.0   220.0          None       USA   \n",
200 |       "\n",
201 |       "      actual_draft_year  draft_round  draft_number      draft nationality  \n",
202 |       "0                1985.0          1.0          23.0    drafted    domestic  \n",
203 |       "1                2000.0          2.0          32.0    drafted    domestic  \n",
204 |       "2                1994.0          1.0          17.0    drafted    domestic  \n",
205 |       "3                   NaN          NaN           NaN  undrafted    domestic  \n",
206 |       "4                1992.0          1.0          10.0    drafted    domestic  \n",
207 |       "...                 ...          ...           ...        ...         ...  \n",
208 |       "9339                NaN          NaN           NaN  undrafted     foreign  \n",
209 |       "9340             2017.0          2.0          49.0    drafted     foreign  \n",
210 |       "9343                NaN          NaN           NaN  undrafted     foreign  \n",
211 |       "9354                NaN          NaN           NaN  undrafted    domestic  \n",
212 |       "9355                NaN          NaN           NaN  undrafted    domestic  \n",
213 |       "\n",
214 |       "[1946 rows x 10 columns]\n"
215 |      ]
216 |     }
217 |    ],
218 |    "source": [
219 |     "df = ConvertDataFrame(df)\n",
220 |     "print(df)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 58,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "# Write NBA player bios dataframe to a .csv file\n",
230 |     "df.to_csv(\"NBAPlayerBios.csv\")\n"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": []
239 |   }
240 |  ],
241 |  "metadata": {
242 |   "kernelspec": {
243 |    "display_name": "Python 3",
244 |    "language": "python",
245 |    "name": "python3"
246 |   },
247 |   "language_info": {
248 |    "codemirror_mode": {
249 |     "name": "ipython",
250 |     "version": 3
251 |    },
252 |    "file_extension": ".py",
253 |    "mimetype": "text/x-python",
254 |    "name": "python",
255 |    "nbconvert_exporter": "python",
256 |    "pygments_lexer": "ipython3",
257 |    "version": "3.7.3"
258 |   },
259 |   "toc": {
260 |    "base_numbering": 1,
261 |    "nav_menu": {},
262 |    "number_sections": true,
263 |    "sideBar": false,
264 |    "skip_h1_title": false,
265 |    "title_cell": "Table of Contents",
266 |    "title_sidebar": "Contents",
267 |    "toc_cell": true,
268 |    "toc_position": {},
269 |    "toc_section_display": true,
270 |    "toc_window_display": false
271 |   }
272 |  },
273 |  "nbformat": 4,
274 |  "nbformat_minor": 2
275 | }
276 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # nba-data-models
  2 | 
  3 | This repository contains explorations and models of NBA data. 
  4 | 
  5 | ## NBA Data Scraping
  6 | 
  7 | In "Data_Scraping" there are many notebooks which were used to amass datasets from a variety of basketball data websites. 
  8 | 
  9 | Note that only free data was used in this project; even the Synergy data used (advanced player tracking, etc.) is publicly available and scrapable using the custom tools contained in this repository.
 10 | 
 11 | Various data scraping tools were used: Scrapy (an open source Python web crawler library built using the Twisted framework), Selenium (a web browser automation tool), GeckoDriver/ChromeDriver for Firefox/Chrome browser launching, and the lxml HTML parsing library.
 12 | 
 13 | To scrape HTML data from simple, static webpages, Scrapy is used. This is done for the following sites:
 14 | 
 15 | * https://www.basketball-reference.com (for some basic and advanced stats not obtained from NBA.com)
 16 | * http://www.espn.com/nba/statistics/rpm (for ESPN Real Plus-Minus advanced player stats)
 17 | * http://insider.espn.com/nba/hollinger/statistics/_/qualified/false (for advanced Hollinger player stats)
 18 | 
 19 | The data obtained from these webpages are stored in JSON format as player-season dictionaries containing per-season stats.
 20 | 
 21 | 
 22 | For interactive, dynamic pages served using Javascript, web drivers are used to launch headless browser instances and Selenium and lxml are used to make table selections and parse the HTML:
 23 | 
 24 | * https://stats.nba.com/draft/combine-anthro (for draft combine data on drafted players)
 25 | * https://stats.nba.com/players (for aggregate player data and player bios)
 26 | * https://stats.nba.com/lineups (for aggregate lineup data)
 27 | * https://stats.nba.com/game (for play-by-play data for individual games)
 28 | 
 29 | The data obtained from these webpages are stored in CSV format. 
 30 | 
 31 | 
 32 | Player data is combined using a cleanup script which merges Pandas dataframes into a comprehensive player-season database (this combines draft combine, bio, basic, and advanced stats). Player, lineup, and play-by-play datasets are stored in separate CSV files.
 33 | 
 34 | 
 35 | ## Exploration of Player, Lineup, and Play-by-Play Data
 36 | 
 37 | The "Player_Data_Exploration" notebook breaks down and explores a vast number of player stats. 
 38 | 
 39 | In particular, a basic clustering is performed for players of each general position (Guard, Forward, Center).
 40 | K-means clustering is used to lump players into categories based on a few key traits:
 41 | 
 42 | * Passing/play-making ('AST_PH', 'ASTR', 'ATR')
 43 | * Frequency of 3 point shots ('3PR')
 44 | * Defensive specialization ('BLK_PH', 'DFGP_3PT_PG', 'DFGP_PG')
 45 | * Usage rate ('USG')
 46 | 
 47 | The stats used for clustering are chosen for each position through trial-and-error, using silhouette scores to evaluate the best training features. This silhouette analysis also gives the optimal number of clusters to use at each position.
 48 | 
 49 | This notebook examines offensive and defensive performance metrics (including the relationship between different efficiency stats), shot selection (including distance from basket, number of dribbles, and defender proximity), and offensive and defensive play types used.
 50 | 
 51 | 
 52 | ## A Bokeh Application for Exploring Player Data
 53 | 
 54 | A primitive Bokeh application has been developed for interactively filtering and plotting player data, which expedites the exploration of player stats. Note that Bokeh software is needed as a prerequisite to run this application (see https://bokeh.org/).
 55 | 
 56 | To run the local Bokeh server (which should automatically open at http://localhost:5006/bokeh_app in your browser), run the following command:
 57 | 
 58 | > bokeh serve --show bokeh_app/
 59 | 
 60 | Shown below are some examples of how the player, lineup, and play-by-play tabs can be used:
 61 | ![alt text](https://github.com/jecutter/nba-data-models/blob/master/img/nba_stats_explorer_curry_player_ex.png?raw=true)
 62 | ![alt text](https://github.com/jecutter/nba-data-models/blob/master/img/nba_stats_explorer_curry_lineup_ex.png?raw=true)
 63 | ![alt text](https://github.com/jecutter/nba-data-models/blob/master/img/nba_stats_explorer_curry_pbp_ex.png?raw=true)
 64 | 
 65 | NOTE: Data files may have been moved or removed from the repo; if so, the necessary data can be regenerated using the scraping tools provided in this repo.
 66 | 
 67 | 
 68 | ## Modeling for Player and Lineup Evaluation
 69 | 
 70 | The "Data_Modeling" directory contains some NBA models in various stages of development.
 71 | 
 72 | ### Play Style Modeling
 73 | 
 74 | The "Player_Comparison_Analysis" notebook contains a simple classification model for finding the best player comp for first-year/rookie players. Veteran players (defined as players who played in each of the last 4 full NBA seasons) are divided into player-seasons, with the first 3 seasons used for model training and the last season used for testing and validation.
 75 | 
 76 | The classification model is built using key stylistic player stats, specifically:
 77 | * 'height' - player height in inches
 78 | * 'weight' - player weight in lbs
 79 | * 'FG_FREQ_05FT' - percentage of shots from 0-5 ft. from the basket
 80 | * 'FG_FREQ_59FT' - percentage of shots from 5-9 ft. from the basket
 81 | * 'FG_FREQ_1014FT' - percentage of shots from 10-14 ft. from the basket
 82 | * 'FG_FREQ_1519FT' - percentage of shots from 15-19 ft. from the basket
 83 | * 'FG_FREQ_2024FT' - percentage of shots from 20-24 ft. from the basket
 84 | * 'FG_FREQ_GT24FT' - percentage of shots from > 24 ft. from the basket
 85 | * 'FG_FREQ_CANDS' - percentage of shots that are catch-and-shoot (no dribbles)
 86 | * 'FTR' - free throw rate
 87 | * 'ASTR' - assist rate
 88 | * 'TOR' - turnover rate
 89 | * 'ORR' - offensive rebounding rate
 90 | * 'DRR' - defensive rebounding rate
 91 | * 'BLK_PH' - shot blocks (per 100 possessions)
 92 | * 'STL_PH' - steals (per 100 possessions)
 93 | * 'DFGP_PG' - defensive/opponent field goal percentage
 94 | 
 95 | A variety of permutations were attempted to optimize the testing results; however, the accuracy of player classification (identifying a veteran player by their 4th season using the model) maxed out at ~75%. This is reasonable, given the variance of player stats from season to season.
 96 | 
 97 | Predictive results are then shown by classifying rookies/first-year players from the 2018-2019 season. This demonstrates the usefulness of this algorithm as a scouting tool for assessing a player's play style.
 98 | 
 99 | ### Player Impact Evaluation
100 | 
101 | The "RAPM_Ridge_Regression" model uses lineup matchup data over 3 full seasons of NBA games to calculate a player's lineup-independent impact. This can be done with ridge regression to calculate a known quantity called "RAPM" (Regularized Adjusted Plus-Minus).
102 | 
103 | A player's +/- is defined as the team's point differential (relative to the opposing team) while that player is on the floor. A player's offensive and defensive impact both affect their raw +/-, but it is a lineup-dependent quantity since it depends on the player's supporting cast as well as the opposing lineups. 
104 | 
105 | The way to take into account all players on the floor is to calculate the "APM" (Adjusted Plus-Minus). This is obtained by creating a matchup matrix *M*, where each row is a lineup matchup and each column is a player. A matrix entry is set to "1" if the player is on offense, "-1" if the player is on defense, and "0" if the player is not involved in the matchup. The point differential per 100 possessions is calculated for each lineup matchup, which forms an array *y*. We may then solve the equation *M* *x* = *y* for player coefficients *x*, which represent the players' adjusted contributions.
106 | 
107 | The problem with this method is that there is enormous variance in the coefficients due to multicollinearity between players. We therefore perform a modified/perturbed regression, a Bayesian filtering process which introduces bias but greatly reduces the variance by penalizing (regularizing) outliers. The result is a set of player RAPM coefficients, which give a relative ranking of the player's impact. More mathematical details are given in the notebook.
108 | 
109 | Validation of the RAPM model is difficult, since it incorporates a global dataset to produce relative (but biased) player rankings. This means that it is not particularly useful for predicting true lineup matchup results. However, it is useful for scouting players whose impact may be underrated by their environment or who have winning qualities that are intangible.
110 | 
111 | ### Lineup Optimization
112 | 
113 | Work in progress.
114 | 


--------------------------------------------------------------------------------
/bokeh_app/main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | import matplotlib.pyplot as plt
 6 | 
 7 | from bokeh.io import curdoc
 8 | from bokeh.layouts import column, layout
 9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel
10 | from bokeh.models.widgets import Tabs
11 | from bokeh.plotting import figure
12 | #from bokeh import mpl
13 | 
14 | from tabs.players import player_tab
15 | from tabs.lineups import lineup_tab
16 | from tabs.playbyplay import playbyplay_tab
17 | 
18 | 
# Locate the datasets, which sit one directory above this script's folder
current_file = os.path.abspath(os.path.dirname(__file__))
player_csv_file, lineup_csv_file, pbp_csv_file = [
	os.path.join(current_file, relative_path)
	for relative_path in (
		'../CompleteNBAPlayerStats.csv',
		'../NBALineupStats_preInsight.csv',
		'../NBA_PBP_Data_PlusMinus.csv',
	)
]

# Read the player, lineup, and play-by-play CSV files into dataframes
df_player, df_lineup, df_pbp = [
	pd.read_csv(csv_path)
	for csv_path in (player_csv_file, lineup_csv_file, pbp_csv_file)
]

# Build one tab per dataset, in display order
tab1 = player_tab(df_player)
tab2 = lineup_tab(df_lineup)
tab3 = playbyplay_tab(df_pbp)

# Gather the tabs into a single tabbed widget
tabs = Tabs(tabs=[tab1, tab2, tab3])

# Register the tabbed widget as the document root and set the page title
curdoc().add_root(tabs)
curdoc().title = "NBAStats"
42 | 
43 | 


--------------------------------------------------------------------------------
/bokeh_app/tabs/lineups.py:
--------------------------------------------------------------------------------
  1 | from os.path import dirname, join
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | 
  7 | from bokeh.io import curdoc
  8 | from bokeh.layouts import column, layout
  9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel
 10 | from bokeh.models.widgets import Tabs, DataTable, DateFormatter, TableColumn
 11 | from bokeh.plotting import figure
 12 | 
 13 | def ControlUpdate(df, source, controls, plot, table):
 14 | 	mask = LineupMask(df, controls)
 15 | 	x_name = controls[4].value
 16 | 	y_name = controls[5].value
 17 | 
 18 | 	#df["color"] = np.where(mask, "red", "grey")
 19 | 	#df["alpha"] = np.where(mask, 0.9, 0.25)
 20 | 	df_mask = df[mask]
 21 | 
 22 | 	# Title 
 23 | 	#plot.title.text = "%d lineups selected" % len(df_mask)
 24 | 	#plot.title.text_font_size = '20pt'
 25 | 	#plot.title.text_font = 'serif'
 26 | 	#plot.title.align = 'center'
 27 | 
 28 |   # Axis titles
 29 | 	plot.xaxis.axis_label = x_name 
 30 | 	plot.yaxis.axis_label = y_name 
 31 | 	plot.xaxis.axis_label_text_font_size = '12pt'
 32 | 	plot.xaxis.axis_label_text_font_style = 'bold'
 33 | 	plot.yaxis.axis_label_text_font_size = '12pt'
 34 | 	plot.yaxis.axis_label_text_font_style = 'bold'
 35 | 
 36 |   # Tick labels
 37 | 	plot.xaxis.major_label_text_font_size = '12pt'
 38 | 	plot.yaxis.major_label_text_font_size = '12pt'
 39 | 
 40 | 	table.columns = [
 41 | 		TableColumn(field="name", title='Lineup'),
 42 | 		TableColumn(field="year", title='Season'),
 43 | 		TableColumn(field="x", title=x_name),
 44 | 		TableColumn(field="y", title=y_name),
 45 | 	]
 46 | 	
 47 | 	source.data = dict(
 48 | 		x = df_mask[x_name],
 49 | 		y = df_mask[y_name],
 50 | 		name = df_mask['lineup_name'],
 51 | 		year = df_mask['year'],
 52 | 		team = df_mask['team']
 53 | 	)
 54 | 
 55 | def LineupMask(df, controls):
 56 | 	player = controls[0].value.lower()
 57 | 	team = controls[1].value
 58 | 	
 59 | 	if player == '':
 60 | 		mask_player = np.ones(len(df.index), dtype=bool)
 61 | 	else:
 62 | 		mask_player = df.lineup_name.str.lower().str.contains(player)
 63 | 	
 64 | 	if team == 'All':
 65 | 		mask_team = np.ones(len(df.index), dtype=bool)
 66 | 	else:
 67 | 		if team == 'Multiple':
 68 | 			mask_team = (df.team == 'TOT')
 69 | 		else:
 70 | 			mask_team = (df.team == team)
 71 | 
 72 | 	mask_year = ((df.year >= controls[2].value) & (df.year <= controls[3].value))
 73 | 	
 74 | 	mask = np.logical_and(mask_player, mask_team)
 75 | 	mask = np.logical_and(mask, mask_year)
 76 | 
 77 | 	return mask
 78 | 
 79 | 
 80 | def lineup_tab(dfl):
 81 | 	# Grab list of stats/columns from dataframe
 82 | 	stats = list(dfl.columns.values)
 83 | 
 84 | 	# Grab list of teams
 85 | 	teams = np.unique(dfl.team.values)
 86 | 	teams = np.insert(teams, 0, 'All')
 87 | 	teams = np.insert(teams, 1, 'Multiple')
 88 | 	teams = teams[teams != 'TOT']
 89 | 	teams = list(teams)
 90 | 
 91 | 	player_sel = TextInput(title="Lineup Contains Player:")
 92 | 	team_sel = Select(title="Team:", options=teams, value='All')
 93 | 	min_year = Slider(title="Starting Season", start=2016, end=2020, value=2016, step=1)
 94 | 	max_year = Slider(title="Ending Season", start=2016, end=2020, value=2019, step=1)
 95 | 	x_axis = Select(title="X Axis", options=stats, value="GPT")
 96 | 	y_axis = Select(title="Y Axis", options=stats, value="NETRTGT")
 97 | 
 98 | 	# Create a data source dictionary for storing data with each update
 99 | 	source = ColumnDataSource(data=dict(x=[], y=[], name=[], year=[], team=[]))
100 | 
101 | 	# Create tooltips object for hover variables,
102 | 	# and create figure for scatterplot
103 | 	TOOLTIPS=[
104 | 		("Name", "@name"),
105 | 		("Year", "@year"),
106 | 		("Team", "@team")
107 | 	]
108 | 
109 | 	p = figure(plot_height=600, plot_width=700, title="", tooltips=TOOLTIPS, sizing_mode="scale_both")
110 | 	p.circle(x="x", y="y", source=source, size=7, line_color=None, fill_alpha=0.8)
111 | 
112 | 	columns = [
113 | 		TableColumn(field="name", title='Name'),
114 | 		TableColumn(field="year", title='Season'),
115 | 		TableColumn(field="x", title=x_axis.value),
116 | 		TableColumn(field="y", title=y_axis.value),
117 | 	]
118 | 	data_table = DataTable(source=source, columns=columns, width=275, height=550)
119 | 
120 | 	# Create controls for filtering plotted/table data
121 | 	controls = [ player_sel, team_sel, min_year, max_year, x_axis, y_axis ]
122 | 	for control in controls:
123 | 			control.on_change('value', lambda attr, old, new: ControlUpdate(dfl, source, controls, p, data_table))
124 | 
125 | 	# Do a preliminary update of plot and table
126 | 	ControlUpdate(dfl, source, controls, p, data_table)
127 | 
128 | 	# Create layout by column
129 | 	inputs = column(*controls, width=250, height=600)
130 | 	inputs.sizing_mode = "fixed"
131 | 	l = layout([
132 | 			[inputs, p, data_table],
133 | 	#], sizing_mode="scale_both")
134 | 	])
135 | 
136 | 	# Make a tab with the layout 
137 | 	tab = Panel(child=l, title = 'Lineup Stats')
138 | 		
139 | 	return tab
140 | 
141 | 


--------------------------------------------------------------------------------
/bokeh_app/tabs/playbyplay.py:
--------------------------------------------------------------------------------
  1 | from os.path import dirname, join
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | 
  7 | from bokeh.io import curdoc
  8 | from bokeh.layouts import column, layout
  9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel, BoxAnnotation
 10 | from bokeh.models.widgets import Tabs, DataTable, DateFormatter, TableColumn
 11 | from bokeh.plotting import figure
 12 | 
 13 | 
 14 | global_boxes = []
 15 | 
 16 | def ControlUpdate(df, source, controls, plot):
 17 | 	global global_boxes
 18 | 
 19 | 	mask, starts, ends = PBPMask(df, controls)
 20 | 	team = controls[0].value
 21 | 	game = controls[2].value
 22 | 	y_name = controls[4].value
 23 | 
 24 | 	df_mask = df[mask]
 25 | 
 26 | 	if len(starts) > 0:
 27 | 		keep_starts = np.ones(len(starts), dtype=bool)
 28 | 		keep_ends = np.ones(len(ends), dtype=bool)
 29 | 		prev_end = -1
 30 | 		for i in np.arange(len(starts)):
 31 | 			#print(starts[i], ends[i])
 32 | 			if starts[i] == ends[i]:
 33 | 				#print('throwing out equal starts ends:', starts[i], ends[i])
 34 | 				keep_starts[i] = False
 35 | 				if ends[i] != 2880.:
 36 | 					keep_ends[i] = False
 37 | 			if i > 0:
 38 | 				if starts[i] - prev_end <= 2.:
 39 | 					#print('throwing out overlapping intervals, deleting:', ends[i-1], starts[i])
 40 | 					keep_starts[i] = False
 41 | 					keep_ends[i-1] = False
 42 | 			#print('setting previous end to', ends[i])
 43 | 			prev_end = ends[i]
 44 | 		#print(keep_starts, keep_ends)
 45 | 		#print(np.array(starts)[keep_starts])
 46 | 		#print(np.array(ends)[keep_ends])
 47 | 		starts = list(np.array(starts)[keep_starts])
 48 | 		ends = list(np.array(ends)[keep_ends])
 49 | 
 50 | 	boxes = []
 51 | 	for start,end in zip(starts, ends):
 52 | 		#print('made it', start, end)
 53 | 		box = BoxAnnotation(left=start, right=end,
 54 | 											line_width=1, line_color='black', line_dash='dashed',
 55 | 											fill_alpha=0.2, fill_color='orange')
 56 | 		print('box', box)
 57 | 		plot.add_layout(box)
 58 | 		boxes.append(box)
 59 | 
 60 | 	#if len(boxes) != 0:
 61 | 	#	plot.renderers.extend(boxes)
 62 | 	#	global_boxes += boxes
 63 | 	#else:
 64 | 	#	if len(global_boxes) > 0:
 65 | 	#		plot.renderers.remove(global_boxes)
 66 | 	#	#for i, r in enumerate(plot.renderers):
 67 | 	#	#	print(r)
 68 | 	#	#	if i > 0:
 69 | 	#	#		plot.renderers.remove(r)
 70 | 
 71 | 	# Title
 72 | 	if df_mask.home_team.values[0] == team:
 73 | 		title = "Game " + game + " for " + team + ": At Home (vs. " + df_mask.vis_team.values[0] + ")"
 74 | 	else:
 75 | 		title = "Game " + game + " for " + team + ": Away Team (vs. " + df_mask.home_team.values[0] + ")"
 76 | 
 77 | 	plot.title.text = title
 78 | 	plot.title.text_font_size = '20pt'
 79 | 	plot.title.text_font = 'serif'
 80 | 	plot.title.align = 'center'
 81 | 
 82 |   # Axis titles
 83 | 	plot.xaxis.axis_label = 'Time (seconds)' 
 84 | 	plot.yaxis.axis_label = y_name 
 85 | 	plot.xaxis.axis_label_text_font_size = '12pt'
 86 | 	plot.xaxis.axis_label_text_font_style = 'bold'
 87 | 	plot.yaxis.axis_label_text_font_size = '12pt'
 88 | 	plot.yaxis.axis_label_text_font_style = 'bold'
 89 | 
 90 |   # Tick labels
 91 | 	plot.xaxis.major_label_text_font_size = '12pt'
 92 | 	plot.yaxis.major_label_text_font_size = '12pt'
 93 | 
 94 | 	# Update source data
 95 | 	source.data = dict(
 96 | 		x = df_mask['time_sec'],
 97 | 		y = df_mask[y_name],
 98 | 		home_play = df_mask['ht_play'],
 99 | 		away_play = df_mask['vt_play']
100 | 		#color = df_mask["color"],
101 | 		#alpha = df_mask["alpha"]
102 | 	)
103 | 
def PBPMask(df, controls):
	"""Select one game's play-by-play rows and find a player's stints in it.

	Args:
		df: play-by-play DataFrame with 'home_team', 'vis_team', 'year',
			'game', 'time_sec', 'ht_lineup' and 'vt_lineup' columns.
		controls: sequence of widgets exposing `.value`:
			[0] team code, [1] season (text), [2] 1-based index into the
			sorted games that team played that season (text),
			[3] player text ('' disables stint detection).

	Returns:
		(mask, player_starts, player_ends): `mask` is a boolean row mask for
		the selected game (all False when the team has no such game);
		`player_starts` / `player_ends` are lists of 'time_sec' values taken
		from the first / last row of every run of unchanged lineups in which
		the selected team's lineup contains the player text.
	"""
	# Rows in which the selected team is either the home or the visiting team
	team = controls[0].value
	mask_team = ((df.home_team == team) | (df.vis_team == team))

	year = int(controls[1].value)
	mask_year = (df.year == year)

	# Sorted unique game ids for that team and season. Read straight from the
	# 'game' column: averaging every column through groupby().mean() only to
	# list the ids was wasteful and raises on text columns in pandas >= 2.0.
	game_idx = int(controls[2].value)-1
	team_games = np.unique(df.loc[mask_team & mask_year, 'game'].dropna().values)
	if 0 <= game_idx < len(team_games):
		mask_game = (df.game == team_games[game_idx])
	else:
		# The team has fewer games than requested: select nothing
		mask_game = np.zeros(len(df.index), dtype=bool)

	mask = np.logical_and(mask_team, mask_year)
	mask = np.logical_and(mask, mask_game)

	df_masked = df[mask]

	# Create player mask
	player = controls[3].value.lower()
	if player == '' or len(df_masked) == 0:
		return mask, [], []

	# Number each run of consecutive rows sharing the same pair of lineups;
	# the first and last row of a run mark where that stint starts and ends
	lineups = df_masked[['ht_lineup','vt_lineup']]
	stint_id = (lineups != lineups.shift(1)).any(axis=1).cumsum()
	stints = df_masked.groupby(stint_id)
	dfhead = stints.head(1).reset_index(drop=True)
	dftail = stints.tail(1).reset_index(drop=True)

	def on_floor(rows):
		# The player text is matched literally (regex=False) against the
		# selected team's own lineup; missing lineups never match (na=False)
		in_home = rows.ht_lineup.str.lower().str.contains(player, regex=False, na=False)
		in_away = rows.vt_lineup.str.lower().str.contains(player, regex=False, na=False)
		return ((in_home & (rows.home_team == team)) | (in_away & (rows.vis_team == team)))

	player_starts = list(dfhead[on_floor(dfhead)].time_sec.values)
	player_ends = list(dftail[on_floor(dftail)].time_sec.values)

	return mask, player_starts, player_ends
136 | 
137 | 
def playbyplay_tab(dft):
	"""Assemble the 'Game Play-By-Play' tab: selectors plus a line plot.

	Args:
		dft: play-by-play DataFrame. Its column names populate the Y-axis
			selector, 'home_team' populates the team selector and 'year'
			populates the season selector.

	Returns:
		A Panel titled 'Game Play-By-Play' wrapping the controls and plot.
	"""
	# Every dataframe column is offered as a plottable stat
	stat_names = list(dft.columns.values)

	# Team codes seen as home team, with any 'TOT' entry removed
	home_codes = np.unique(dft.home_team.values)
	team_names = list(home_codes[home_codes != 'TOT'])

	# Seasons present in the data, as text for the selector
	season_names = list(np.unique(dft.year.values).astype(str))

	# Game numbers 1 through 82, as text for the selector
	game_numbers = list(np.arange(1,83).astype(str))

	# Selector widgets
	team_sel = Select(title="Team:", options=team_names, value='ATL')
	year_sel = Select(title="Season:", options=season_names, value='2017')
	game_sel = Select(title="Game:", options=game_numbers, value='1')
	stint_sel = TextInput(title="Stints Containing Player:")
	y_axis = Select(title="Y Axis", options=stat_names, value="ht_margin")

	# Data source backing the line; filled in by ControlUpdate
	source = ColumnDataSource(data={key: [] for key in ('x', 'y', 'home_play', 'away_play')})

	# Hover tooltips and the line plot itself
	TOOLTIPS = [
		("H.T. Play:", "@home_play"),
		("A.T. Play:", "@away_play"),
	]
	p = figure(plot_height=550, plot_width=1000, title="", tooltips=TOOLTIPS, sizing_mode="scale_both")
	p.line(x="x", y="y", source=source, line_width=2, color='black')

	# Any widget change re-selects the game and refreshes the plot
	controls = [team_sel, year_sel, game_sel, stint_sel, y_axis]

	def refresh(attr, old, new):
		ControlUpdate(dft, source, controls, p)

	for widget in controls:
		widget.on_change('value', refresh)

	# Populate the plot once before the first user interaction
	ControlUpdate(dft, source, controls, p)

	# Controls in a fixed-size column, followed by the plot
	inputs = column(*controls, width=250, height=600)
	inputs.sizing_mode = "fixed"
	page = layout([
		[inputs, p],
	])

	return Panel(child=page, title='Game Play-By-Play')
194 | 
195 | 


--------------------------------------------------------------------------------
/bokeh_app/tabs/players.py:
--------------------------------------------------------------------------------
  1 | from os.path import dirname, join
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | 
  7 | from bokeh.io import curdoc
  8 | from bokeh.layouts import column, layout
  9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel
 10 | from bokeh.models.widgets import Tabs, DataTable, DateFormatter, TableColumn
 11 | from bokeh.plotting import figure
 12 | 
 13 | def ControlUpdate(df, psource, tsource, controls, plot, table):
 14 | 	mask = PlayerMask(df, controls)
 15 | 	x_name = controls[6].value
 16 | 	y_name = controls[7].value
 17 | 
 18 | 	df["color"] = np.where(mask, "red", "grey")
 19 | 	df["alpha"] = np.where(mask, 0.9, 0.25)
 20 | 	df_mask = df[mask]
 21 | 
 22 | 	# Title 
 23 | 	#plot.title.text = "%d players selected" % len(dfp)
 24 | 	#plot.title.text_font_size = '20pt'
 25 | 	#plot.title.text_font = 'serif'
 26 | 	#plot.title.align = 'center'
 27 | 
 28 |   # Axis titles
 29 | 	plot.xaxis.axis_label = x_name 
 30 | 	plot.yaxis.axis_label = y_name 
 31 | 	plot.xaxis.axis_label_text_font_size = '12pt'
 32 | 	plot.xaxis.axis_label_text_font_style = 'bold'
 33 | 	plot.yaxis.axis_label_text_font_size = '12pt'
 34 | 	plot.yaxis.axis_label_text_font_style = 'bold'
 35 | 
 36 |   # Tick labels
 37 | 	plot.xaxis.major_label_text_font_size = '12pt'
 38 | 	plot.yaxis.major_label_text_font_size = '12pt'
 39 | 
 40 | 	table.columns = [
 41 | 		TableColumn(field="name", title='Name'),
 42 | 		TableColumn(field="year", title='Season'),
 43 | 		TableColumn(field="x", title=x_name),
 44 | 		TableColumn(field="y", title=y_name),
 45 | 	]
 46 | 	
 47 | 	psource.data = dict(
 48 | 		x = df[x_name],
 49 | 		y = df[y_name],
 50 | 		name = df['name'],
 51 | 		year = df['year'],
 52 | 		team = df['team'],
 53 | 		color = df['color'],
 54 | 		alpha = df['alpha'] 
 55 | 	)
 56 | 
 57 | 	tsource.data = dict(
 58 | 		x = df_mask[x_name],
 59 | 		y = df_mask[y_name],
 60 | 		name = df_mask['name'],
 61 | 		year = df_mask['year'],
 62 | 		team = df_mask['team'],
 63 | 		color = df_mask['color'],
 64 | 		alpha = df_mask['alpha'] 
 65 | 	)
 66 | 
 67 | def PlayerMask(df, controls):
 68 | 	player = controls[0].value
 69 | 	team = controls[3].value
 70 | 	
 71 | 	if player == '':
 72 | 		mask_player = np.ones(len(df.index), dtype=bool)
 73 | 	else:
 74 | 		mask_player = df.name.str.lower().str.contains(player)
 75 | 	
 76 | 	if team == 'All':
 77 | 		mask_team = np.ones(len(df.index), dtype=bool)
 78 | 	else:
 79 | 		if team == 'Multiple':
 80 | 			mask_team = (df.team == 'TOT')
 81 | 		else:
 82 | 			mask_team = (df.team == team)
 83 | 
 84 | 	mask_year = ((df.year >= controls[4].value) & (df.year <= controls[5].value))
 85 | 	mask_age = ((df.age >= controls[1].value) & (df.age <= controls[2].value))
 86 | 	
 87 | 	mask = np.logical_and(mask_player, mask_team)
 88 | 	mask = np.logical_and(mask, mask_year)
 89 | 	mask = np.logical_and(mask, mask_age)
 90 | 
 91 | 	return mask
 92 | 
 93 | 
 94 | def player_tab(dfp):
 95 | 	# Grab list of stats/columns from dataframe
 96 | 	#stats = sorted(dfp.columns.values)
 97 | 	stats = list(dfp.columns.values)
 98 | 
 99 | 	# Grab list of teams
100 | 	teams = np.unique(dfp.team.values)
101 | 	teams = np.insert(teams, 0, 'All')
102 | 	teams = np.insert(teams, 1, 'Multiple')
103 | 	teams = teams[teams != 'TOT']
104 | 	teams = list(teams)
105 | 
106 | 	# Grab the minimum and maximum player ages
107 | 	age_low = min(dfp.age.values)
108 | 	age_high = max(dfp.age.values)
109 | 
110 | 	#axis_map = {
111 | 	#	"2 PT %": "2PP_PH",
112 | 	#	"3 PT %": "3PP_PH",
113 | 	#}
114 | 
115 | 	player_sel = TextInput(title="Player Names:")
116 | 	min_age = Slider(title="Min Age", start=age_low, end=age_high, value=age_low, step=1)
117 | 	max_age = Slider(title="Max Age", start=age_low, end=age_high, value=age_high, step=1)
118 | 	team_sel = Select(title="Team:", options=teams, value='All')
119 | 	min_year = Slider(title="Starting Season", start=2016, end=2020, value=2016, step=1)
120 | 	max_year = Slider(title="Ending Season", start=2016, end=2020, value=2019, step=1)
121 | 	#x_axis = Select(title="X Axis", options=sorted(axis_map.keys()), value="2 PT %")
122 | 	#y_axis = Select(title="Y Axis", options=sorted(axis_map.keys()), value="3 PT %")
123 | 	x_axis = Select(title="X Axis", options=stats, value="FG_FREQ_05FT")
124 | 	y_axis = Select(title="Y Axis", options=stats, value="FG_FREQ_GT24FT")
125 | 
126 | 	# Create a data source dictionary for storing data with each update
127 | 	psource = ColumnDataSource(data=dict(x=[], y=[], name=[], year=[], team=[], color=[], alpha=[]))
128 | 	tsource = ColumnDataSource(data=dict(x=[], y=[], name=[], year=[], team=[], color=[], alpha=[]))
129 | 
130 | 	# Create tooltips object for hover variables,
131 | 	# and create figure for scatterplot
132 | 	TOOLTIPS=[
133 | 		("Name", "@name"),
134 | 		("Year", "@year"),
135 | 		("Team", "@team")
136 | 	]
137 | 
138 | 	p = figure(plot_height=600, plot_width=700, title="", tooltips=TOOLTIPS, sizing_mode="scale_both")
139 | 	p.circle(x="x", y="y", source=psource, size=7, line_color=None, color='color', fill_alpha='alpha')
140 | 
141 | 	columns = [
142 | 		TableColumn(field="name", title='Name'),
143 | 		TableColumn(field="year", title='Season'),
144 | 		TableColumn(field="x", title=x_axis.value),
145 | 		TableColumn(field="y", title=y_axis.value),
146 | 	]
147 | 	data_table = DataTable(source=tsource, columns=columns, width=275, height=550)
148 | 
149 | 	# Create controls for filtering plotted/table data
150 | 	controls = [ player_sel, min_age, max_age, team_sel, min_year, max_year, x_axis, y_axis ]
151 | 	for control in controls:
152 | 			control.on_change('value', lambda attr, old, new: ControlUpdate(dfp, psource, tsource, controls, p, data_table))
153 | 
154 | 	# Do a preliminary update of plot and table
155 | 	ControlUpdate(dfp, psource, tsource, controls, p, data_table)
156 | 
157 | 	# Create layout by column
158 | 	inputs = column(*controls, width=250, height=600)
159 | 	inputs.sizing_mode = "fixed"
160 | 	l = layout([
161 | 			[inputs, p, data_table],
162 | 	#], sizing_mode="scale_both")
163 | 	])
164 | 
165 | 	# Make a tab with the layout 
166 | 	tab = Panel(child=l, title = 'Player Stats')
167 | 		
168 | 	return tab
169 | 
170 | 


--------------------------------------------------------------------------------
/bokeh_app/templates/index.html:
--------------------------------------------------------------------------------
 1 | {% extends base %}
 2 | 
 3 | {% block preamble %}
 4 | 	<!-- <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"> -->
 5 | 	<link href="https://fonts.googleapis.com/css?family=Alegreya Sans SC" rel='stylesheet'>
 6 | {% endblock %}
 7 | 
 8 | {% block postamble %}
 9 |   <style>
10 |     {% include 'styles.css' %}
11 |   </style>
12 | {% endblock %}
13 | 
14 | {% block contents %}
15 |   <div class="page-header">
16 |     <h1>NBA Stats Explorer</h1>
17 |     <p>An interactive plotter for NBA data</p>
18 | 	</div>
19 | 	
20 | 	<div class="content">
21 |     <!-- <p>An interactive plotter for NBA data.</p> -->
22 |     {{ super() }}
23 |   </div>
24 | {% endblock %}
25 | 


--------------------------------------------------------------------------------
/bokeh_app/templates/styles.css:
--------------------------------------------------------------------------------
 1 | .page-header {
 2 | 	/*
 3 | 	background-color: #C25100;
 4 | 	*/
 5 | 	background-color: #ffffff;
 6 | 	width: 100%;
 7 | 	margin-top: 0;
 8 | 	padding: 0px 20px 10px 30px;
 9 | }
10 | .page-header h1 {
11 | 	/*
12 | 	color: #ffffff;
13 | 	font-family: 'Julius Sans One', serif;
14 | 	*/
15 | 	font-family: 'Alegreya Sans SC';
16 | 	font-size: 24pt;
17 | 	text-decoration: none;
18 | }
19 | .page-header p {
20 | 	/*
21 | 	color: #ffffff;
22 | 	font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
23 | 	*/
24 | 	font-family: 'Alegreya Sans SC';
25 | 	font-size: 15pt;
26 | 	font-style: italic;
27 | 	text-align: justify;
28 | 	text-justify: inter-word;
29 | }
30 | .content {
31 | 	width: 100%;
32 | 	margin-left: 15px;
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/img/nba_stats_explorer_curry_lineup_ex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jecutter/nba-data-models/63a81fa04eb4eb3a1b0caa5da2c105a4442d32f9/img/nba_stats_explorer_curry_lineup_ex.png


--------------------------------------------------------------------------------
/img/nba_stats_explorer_curry_pbp_ex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jecutter/nba-data-models/63a81fa04eb4eb3a1b0caa5da2c105a4442d32f9/img/nba_stats_explorer_curry_pbp_ex.png


--------------------------------------------------------------------------------
/img/nba_stats_explorer_curry_player_ex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jecutter/nba-data-models/63a81fa04eb4eb3a1b0caa5da2c105a4442d32f9/img/nba_stats_explorer_curry_player_ex.png


--------------------------------------------------------------------------------
	year	game	home_team	vis_team	time_sec	ht_score	vt_score	points_scored	ht_play	vt_play	ht_poss
0	2017	1	CLE	NYK	20.0	0	2	2		Rose 1' Driving Layup (2 PTS) (Noah 1 AST)	DEF
1	2017	1	CLE	NYK	37.0	0	2	0		Noah REBOUND (Off:0 Def:1)	OFF
2	2017	1	CLE	NYK	45.0	0	4	2		Porzingis 2' Tip Layup Shot (2 PTS)	DEF
3	2017	1	CLE	NYK	61.0	2	4	2	James 11' Jump Shot (2 PTS) (Irving 1 AST)		OFF
4	2017	1	CLE	NYK	62.0	2	4	0		Rose Out of Bounds Lost Ball Turnover (P1.T1)	DEF
...	...	...	...	...	...	...	...	...	...	...	...
720766	2019	1230	POR	SAC	2832.0	135	131	2	Simons 1' Tip Layup Shot (36 PTS)		OFF
720767	2019	1230	POR	SAC	2846.0	135	131	0	Layman REBOUND (Off:0 Def:4)		DEF
720768	2019	1230	POR	SAC	2859.0	136	131	0		Swanigan REBOUND (Off:3 Def:4)	OFF
720769	2019	1230	POR	SAC	2869.0	136	131	0	Labissiere REBOUND (Off:4 Def:11)		DEF
720770	2019	1230	POR	SAC	2880.0	136	131	0	EOQ	EOQ	OFF