├── .gitignore ├── Data_Exploration ├── Lineup_Data_Exploration.ipynb ├── Lineup_Matchup_Exploration.ipynb ├── PlayByPlay_Data_Exploration.ipynb └── Player_Data_Exploration.ipynb ├── Data_Modeling ├── CompThePlayer.ipynb ├── RAPM_Model_NoPriors.ipynb └── Scratch │ ├── RegressionTest_Lineups.ipynb │ ├── RegressionTest_OffensiveRatings.ipynb │ └── RegressionTest_Players.ipynb ├── Data_Scraping ├── Cleanup_NBA_Player_Data.ipynb ├── Generate_Matchup_Data.ipynb ├── Scrape_BBallRefAdvanced_Stats.ipynb ├── Scrape_BBallRef_Stats.ipynb ├── Scrape_ESPN_RealPM_Stats.ipynb ├── Scrape_Hollinger_Stats.ipynb ├── Scrape_NBA_AdvancedStats_Data.ipynb ├── Scrape_NBA_DraftCombine_Data.ipynb ├── Scrape_NBA_Lineup_Stats.ipynb ├── Scrape_NBA_PlayType_Data.ipynb ├── Scrape_NBA_PlayerBios_Data.ipynb └── Scrape_PlayByPlay_Data.ipynb ├── README.md ├── bokeh_app ├── main.py ├── tabs │ ├── lineups.py │ ├── playbyplay.py │ └── players.py └── templates │ ├── index.html │ └── styles.css └── img ├── nba_stats_explorer_curry_lineup_ex.png ├── nba_stats_explorer_curry_pbp_ex.png └── nba_stats_explorer_curry_player_ex.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore data files 2 | *.json 3 | *.csv 4 | 5 | # Ignore backup and checkpoint files 6 | .ipynb_checkpoints 7 | .swp 8 | -------------------------------------------------------------------------------- /Data_Modeling/Scratch/RegressionTest_OffensiveRatings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Import modules and packages\n", 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import matplotlib.cm as cm\n", 23 | "import json\n", 24 | "import pandas as pd\n", 25 | "import seaborn as sns\n", 26 | "import re\n", 27 | "from scipy.stats import gaussian_kde\n", 28 | "from sklearn import metrics\n", 29 | "from sklearn.metrics import confusion_matrix\n", 30 | "from sklearn.preprocessing import StandardScaler\n", 31 | "from sklearn.preprocessing import LabelEncoder\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from sklearn.model_selection import cross_val_score\n", 34 | "from sklearn.model_selection import GridSearchCV\n", 35 | "from sklearn import svm\n", 36 | "from sklearn.neural_network import MLPRegressor\n", 37 | "from sklearn.linear_model import LinearRegression\n", 38 | "from sklearn.linear_model import Ridge\n", 39 | "from sklearn.ensemble import RandomForestClassifier\n", 40 | "from xgboost import XGBClassifier\n", 41 | "import unidecode\n", 42 | "import unicodedata\n", 43 | "%matplotlib inline" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "sns.set_style(\"whitegrid\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Read in compiled NBA player data\n", 62 | "df_orig = pd.read_csv('../CompleteNBAPlayerStats.csv')\n", 63 | "print(\"Table of BBall Player Stats:\\n\\n\", df_orig)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Dataframes after cutting on GP/MPG/other parameters\n", 73 | "df = df_orig.copy()\n", 74 | "df = df[df.GP > min_num_games]\n", 75 | "df = df[df.MPG > min_MPG]\n", 76 | "\n", 77 | "# Add some additional variables\n", 78 | "## To convert to \"per 36-min\" stats\n", 79 | "df['2PA_PG'] = df['2PA_PT']*df.MPG/36.\n", 80 | "df['3PA_PG'] = df['3PA_PT']*df.MPG/36.\n", 81 | "df['FGA_PG'] = df.FGA_PT*df.MPG/36.\n", 82 | "\n", 83 | "df[\"2PR\"] = df[\"2PA_PH\"]/df[\"FGA_PH\"]\n", 84 | "df[\"3PR\"] = df[\"3PA_PH\"]/df[\"FGA_PH\"]\n", 85 | "\n", 86 | "df['FG_FREQ_RIM'] = (df.FGA_RA)/df.FGA_PG # restricted area\n", 87 | "df['FG_FREQ_MR_AND_PT'] = (df.FGA_MR + df.FGA_NONRA)/df.FGA_PG # combined paint and midrange\n", 88 | "df['FG_FREQ_MR'] = (df.FGA_MR)/df.FGA_PG\n", 89 | "df['FG_FREQ_CORNERS'] = (df.FGA_LC + df.FGA_RC)/df.FGA_PG\n", 90 | "df['FG_FREQ_AB'] = df.FGA_AB/df.FGA_PG\n", 91 | "df['FG_FREQ_01DRIB'] = (df['FGA_0DRIB'] + df['FGA_1DRIB'])/df.FGA_PG\n", 92 | "df['FG_FREQ_GT1DRIB'] = (df['FGA_2DRIB'] + df['FGA_36DRIB'] + df['FGA_GT7DRIB'])/df.FGA_PG\n", 93 | "df['FG_FREQ_CANDS'] = df['FGA_CANDS']/df.FGA_PG\n", 94 | "\n", 95 | "df[\"FG_FREQ_RIM\"].fillna(0, inplace=True)\n", 96 | "df[\"FG_FREQ_MR_AND_PT\"].fillna(0, inplace=True)\n", 97 | "df[\"FG_FREQ_MR\"].fillna(0, inplace=True)\n", 98 | "df[\"FG_FREQ_CORNERS\"].fillna(0, inplace=True)\n", 99 | "df[\"FG_FREQ_AB\"].fillna(0, inplace=True)\n", 100 | "df[\"FG_FREQ_01DRIB\"].fillna(0, inplace=True)\n", 101 | "df[\"FG_FREQ_GT1DRIB\"].fillna(0, inplace=True)\n", 102 | "df[\"FG_FREQ_CANDS\"].fillna(0, inplace=True)\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# Various dataframes separating rookies from established veterans,\n", 112 | "# as well as median data to exclude outliers for veteran players\n", 113 | "df_vets = df[df[\"name\"].isin(df[\"name\"].value_counts()[df[\"name\"].value_counts()>1].index)]\n", 114 | "df_rooks = df[df[\"name\"].isin(df[\"name\"].value_counts()[df[\"name\"].value_counts()==1].index)]\n", 115 | "df_med = df_vets.groupby(\"name\").median().reset_index()\n", 116 | "\n", 117 | "# Dataframes by player position\n", 118 | "# Centers\n", 119 | "dfc = df[df['pos'].str.contains('C')]\n", 120 | "# Forwards\n", 121 | "dff = df[df['pos'].str.contains('F')]\n", 122 | "# Guards\n", 123 | "dfg = df[df['pos'].str.contains('G')]\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "x_train = df[df.year != 2020][['2PM_PH', '3PM_PH', 'FTM_PH', 'AST_PH', 'ORB_PH', 'DRB_PH']]\n", 133 | "y_train = df[df.year != 2020][['OFFRTG']]\n", 134 | "\n", 135 | "reg = LinearRegression()\n", 136 | "reg.fit(x_train, y_train)\n", 137 | "\n", 138 | "x_test = df[df.year == 2020][['2PM_PH', '3PM_PH', 'FTM_PH', 'AST_PH', 'ORB_PH', 'DRB_PH']]\n", 139 | "y_test_tot = df[df.year == 2020].OFFRTG\n", 140 | "y_pred_tot = reg.predict(x_test)\n", 141 | "\n", 142 | "#print(df[df.year == 2020].name.values[i])\n", 143 | "#for i,name in enumerate(df[df.year == 2020].name):\n", 144 | "# print(name, ':', y_pred[:,0][i], df[df.year == 2020].iloc[i,:].OFFRTG)\n", 145 | " \n", 146 | "print('Made', len(y_pred_tot), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test_tot, y_pred_tot)))\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "x_train = df[df.year != 2020][['EFGP', '3PR', 'FTM_PH', 'AST_PH', 'TOR', 'REBR']]\n", 156 | "y_train = df[df.year != 2020][['OFFRTG']]\n", 157 | "\n", 158 | "reg = LinearRegression()\n", 159 | "reg.fit(x_train, y_train)\n", 160 | "\n", 161 | "x_test = df[df.year == 2020][['EFGP', '3PR', 'FTM_PH', 'AST_PH', 'TOR', 'REBR']]\n", 162 | "y_test_eff = df[df.year == 2020].OFFRTG\n", 163 | "y_pred_eff = reg.predict(x_test)\n", 164 | "\n", 165 | "#print(df[df.year == 2020].name.values[i])\n", 166 | "#for i,name in enumerate(df[df.year == 2020].name):\n", 167 | "# print(name, ':', y_pred[:,0][i], df[df.year == 2020].iloc[i,:].OFFRTG)\n", 168 | " \n", 169 | "print('Made', len(y_pred_eff), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test_eff, y_pred_eff)))\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "fig, ax = plt.subplots(figsize=(10, 8))\n", 179 | "plt.xlabel(\"Offensive Rating\")\n", 180 | "plt.ylabel(\"Predicted Offensive Rating\")\n", 181 | "plt.scatter(y_test_tot, y_pred_tot)\n", 182 | "plt.scatter(y_test_eff, y_pred_eff)\n", 183 | "#plt.scatter(y_test_eff, df[df.year == 2020].PTS_PH)\n", 184 | "xmin, xmax = ax.get_xlim()\n", 185 | "#ymin, ymax = ax.get_ylim()\n", 186 | "ymin = 90\n", 187 | "ymax = 125\n", 188 | "plt.plot([90,130], [90,130], 'r--')\n", 189 | "ax.set_xlim(xmin, xmax)\n", 190 | "ax.set_ylim(ymin, ymax)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "scrolled": false 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "mlp_reg = MLPRegressor()\n", 202 | "mlp_reg.fit(x_train, y_train.values.ravel())\n", 203 | "y_mlp_pred = mlp_reg.predict(x_test)\n", 204 | "for i,pred in enumerate(y_mlp_pred):\n", 205 | " print(str(df_test.name.values[i]) + ' has a predicted +/- of ' + str(pred) + ' compared to real +/- of ' + str(y_test.values[i][0]))\n", 206 | "\n", 207 | "print('Made', len(y_mlp_pred), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test, y_mlp_pred)))\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "Python 3", 221 | "language": "python", 222 | "name": "python3" 223 | }, 224 | "language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.7.3" 235 | }, 236 | "toc": { 237 | "base_numbering": 1, 238 | "nav_menu": {}, 239 | "number_sections": true, 240 | "sideBar": false, 241 | "skip_h1_title": false, 242 | "title_cell": "Table of Contents", 243 | "title_sidebar": "Contents", 244 | "toc_cell": true, 245 | "toc_position": {}, 246 | "toc_section_display": true, 247 | "toc_window_display": false 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 2 252 | } 253 | -------------------------------------------------------------------------------- /Data_Scraping/Cleanup_NBA_Player_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Import modules and packages\n", 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import matplotlib.cm as cm\n", 23 | "import json\n", 24 | "import pandas as pd\n", 25 | "import seaborn as sns\n", 26 | "from sklearn.metrics import confusion_matrix\n", 27 | "from sklearn.preprocessing import StandardScaler\n", 28 | "from sklearn.preprocessing import LabelEncoder\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from sklearn.model_selection import cross_val_score\n", 31 | "from sklearn.model_selection import GridSearchCV\n", 32 | "from sklearn import svm\n", 33 | "from sklearn.linear_model import LinearRegression\n", 34 | "from sklearn.ensemble import RandomForestClassifier\n", 35 | "from xgboost import XGBClassifier\n", 36 | "import unidecode\n", 37 | "import unicodedata\n", 38 | "import difflib\n", 39 | "from fuzzywuzzy import fuzz \n", 40 | "from fuzzywuzzy import process\n", 41 | "%matplotlib inline" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Modify dataframe columns to specified types (float, int, string)\n", 51 | "def ConvertDataFrame(df, str_list, int_list):\n", 52 | " cols = df.columns.drop(str_list)\n", 53 | " df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)\n", 54 | " for string in str_list:\n", 55 | " df[string] = df[string].astype('str')\n", 56 | " for integer in int_list:\n", 57 | " df[integer] = df[integer].astype('int')\n", 58 | " return df\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "# Load and Convert Types for Each Player Dataset" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Table of NBA Player Bios Over the Last 20 Years:\n", 78 | "\n", 79 | " name height weight college country \\\n", 80 | "0 AC Green 81.0 225.0 Oregon State USA \n", 81 | "1 AJ Guyton 73.0 180.0 Indiana USA \n", 82 | "2 Aaron McKie 77.0 209.0 Temple USA \n", 83 | "3 Aaron Williams 81.0 225.0 Xavier USA \n", 84 | "4 Adam Keefe 81.0 230.0 Stanford USA \n", 85 | "... ... ... ... ... ... \n", 86 | "9339 Vincent Poirier 84.0 235.0 None France \n", 87 | "9340 Vlatko Cancar 80.0 236.0 None Slovenia \n", 88 | "9343 Wenyen Gabriel 81.0 205.0 None Sudan \n", 89 | "9354 Zach Norvell Jr 77.0 205.0 None USA \n", 90 | "9355 Zylan Cheatham 77.0 220.0 None USA \n", 91 | "\n", 92 | " actual_draft_year draft_round draft_number draft nationality \n", 93 | "0 1985.0 1.0 23.0 drafted domestic \n", 94 | "1 2000.0 2.0 32.0 drafted domestic \n", 95 | "2 1994.0 1.0 17.0 drafted domestic \n", 96 | "3 NaN NaN NaN undrafted domestic \n", 97 | "4 1992.0 1.0 10.0 drafted domestic \n", 98 | "... ... ... ... ... ... \n", 99 | "9339 NaN NaN NaN undrafted foreign \n", 100 | "9340 2017.0 2.0 49.0 drafted foreign \n", 101 | "9343 NaN NaN NaN undrafted foreign \n", 102 | "9354 NaN NaN NaN undrafted domestic \n", 103 | "9355 NaN NaN NaN undrafted domestic \n", 104 | "\n", 105 | "[1934 rows x 10 columns]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "# Grab NBA player bios (including height/weight)\n", 111 | "df_bios = pd.read_csv('../NBAPlayerBios.csv', index_col=0)\n", 112 | "df_bios = ConvertDataFrame(df_bios, ['name', 'college', 'country', 'draft', 'nationality'], [])\n", 113 | "#print(df_bios[df_bios.duplicated(subset=['name'], keep='first')])\n", 114 | "#print(df_bios[df_bios.duplicated(subset=['name'], keep='last')])\n", 115 | "df_bios = df_bios.drop_duplicates(subset=['name'], keep=False)\n", 116 | "df_bios['name'] = df_bios['name'].str.replace('.', '')\n", 117 | "print(\"Table of NBA Player Bios Over the Last 20 Years:\\n\\n\", df_bios)\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Table of NBA Draft Player Measurements Over the Last 20 Years:\n", 130 | "\n", 131 | " name draft_year wingspan\n", 132 | "0 Malik Allen 2001 86.50\n", 133 | "1 Harold Arceneaux 2001 80.50\n", 134 | "2 Lamont Barnes 2001 87.50\n", 135 | "3 Mario Bland 2001 84.00\n", 136 | "4 Primoz Brezec 2001 86.00\n", 137 | "... ... ... ...\n", 138 | "1312 Quinndary Weatherspoon 2020 81.00\n", 139 | "1313 Coby White 2020 77.00\n", 140 | "1314 Kris Wilkes 2020 82.75\n", 141 | "1315 Grant Williams 2020 81.75\n", 142 | "1316 Dylan Windler 2020 82.00\n", 143 | "\n", 144 | "[1294 rows x 3 columns]\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# Grab physical player measurements from Draft Combine data\n", 150 | "#df_comb = pd.read_csv('NBACombineStats.csv', usecols=['name', 'draft_year', 'height', 'weight', 'wingspan'])\n", 151 | "df_comb = pd.read_csv('../NBACombineStats.csv', usecols=['name', 'draft_year', 'wingspan'])\n", 152 | "df_comb = ConvertDataFrame(df_comb, ['name'], ['draft_year'])\n", 153 | "#print(df_comb[df_comb.duplicated(subset=['name'], keep='first')])\n", 154 | "#print(df_comb[df_comb.duplicated(subset=['name'], keep='last')])\n", 155 | "df_comb = df_comb.drop_duplicates(subset=['name'], keep='last')\n", 156 | "df_comb['name'] = df_comb['name'].str.replace('.', '')\n", 157 | "print(\"Table of NBA Draft Player Measurements Over the Last 20 Years:\\n\\n\", df_comb)\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": { 164 | "scrolled": false 165 | }, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "Table of basic and advanced player stats:\n", 172 | "\n", 173 | " name year age W L WLR PTS_PT FGM_PT \\\n", 174 | "0 AJ Hammons 2017 24.0 4.0 18.0 0.222222 10.6 3.7 \n", 175 | "1 AJ Price 2015 28.0 11.0 15.0 0.733333 14.8 5.7 \n", 176 | "2 Aaron Brooks 2015 30.0 50.0 32.0 1.562500 18.2 6.6 \n", 177 | "3 Aaron Brooks 2016 31.0 36.0 33.0 1.090909 16.0 6.1 \n", 178 | "4 Aaron Brooks 2017 32.0 36.0 29.0 1.241379 13.0 4.9 \n", 179 | "... ... ... ... ... ... ... ... ... \n", 180 | "3003 Zhou Qi 2019 23.0 0.0 1.0 0.000000 75.4 37.7 \n", 181 | "3004 Zoran Dragic 2015 26.0 6.0 10.0 0.600000 13.4 5.3 \n", 182 | "3005 Zylan Cheatham 2020 24.0 1.0 1.0 1.000000 3.6 1.8 \n", 183 | "3006 Antonius Cleveland 2020 NaN NaN NaN NaN NaN NaN \n", 184 | "3007 Kyle Guy 2020 NaN NaN NaN NaN NaN NaN \n", 185 | "\n", 186 | " FGA_PT FGP_PT ... CHARGE_DRAWN_PT CONTESTS_2PT_PT CONTESTS_3PT_PT \\\n", 187 | "0 9.3 0.405 ... 0.0 9.7 1.8 \n", 188 | "1 15.2 0.372 ... NaN NaN NaN \n", 189 | "2 15.6 0.421 ... NaN NaN NaN \n", 190 | "3 15.2 0.401 ... NaN NaN NaN \n", 191 | "4 12.1 0.403 ... 0.3 4.1 4.1 \n", 192 | "... ... ... ... ... ... ... \n", 193 | "3003 37.7 1.000 ... 0.0 0.0 0.0 \n", 194 | "3004 14.4 0.367 ... NaN NaN NaN \n", 195 | "3005 7.3 0.250 ... 0.0 10.8 3.6 \n", 196 | "3006 NaN NaN ... NaN NaN NaN \n", 197 | "3007 NaN NaN ... NaN NaN NaN \n", 198 | "\n", 199 | " CONTESTS_PT 2PM_PH 2PA_PH 2PP_PH 2PM_PT 2PA_PT 2PP_PT \n", 200 | "0 11.5 3.7 9.7 0.381443 2.6 7.1 0.366197 \n", 201 | "1 NaN 5.7 12.8 0.445312 4.0 8.9 0.449438 \n", 202 | "2 NaN 6.0 13.6 0.441176 4.3 9.6 0.447917 \n", 203 | "3 NaN 5.5 12.7 0.433071 4.0 9.2 0.434783 \n", 204 | "4 8.1 4.0 9.5 0.421053 3.0 6.9 0.434783 \n", 205 | "... ... ... ... ... ... ... ... \n", 206 | "3003 0.0 33.3 33.3 1.000000 37.7 37.7 1.000000 \n", 207 | "3004 NaN 5.1 10.2 0.500000 3.9 7.7 0.506494 \n", 208 | "3005 14.4 2.6 7.9 0.329114 1.8 5.5 0.327273 \n", 209 | "3006 NaN NaN NaN NaN NaN NaN NaN \n", 210 | "3007 NaN NaN NaN NaN NaN NaN NaN \n", 211 | "\n", 212 | "[3006 rows x 416 columns]\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# Grab player basic and advanced stats from NBA.com\n", 218 | "df1 = pd.read_csv('../NBAAdvancedStats.csv', index_col=0)\n", 219 | "df1 = ConvertDataFrame(df1, ['name', 'red_pos'], ['year'])\n", 220 | "#print(df1[df1.duplicated(subset=['name', 'year'], keep='first')])\n", 221 | "#print(df1[df1.duplicated(subset=['name', 'year'], keep='last')])\n", 222 | "df1 = df1.drop_duplicates(subset=['name', 'year'], keep='last')\n", 223 | "#df1 = df1.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n", 224 | "\n", 225 | "# Add useful additional variables, particularly\n", 226 | "# 2-point shots and win-to-loss ratio\n", 227 | "df1.insert(df1.columns.get_loc('L')+1, 'WLR', df1['W']/df1['L'])\n", 228 | "df1[\"2PM_PH\"] = df1[\"FGM_PH\"] - df1[\"3PM_PH\"]\n", 229 | "df1[\"2PA_PH\"] = df1[\"FGA_PH\"] - df1[\"3PA_PH\"]\n", 230 | "df1[\"2PP_PH\"] = df1[\"2PM_PH\"]/df1[\"2PA_PH\"]\n", 231 | "df1[\"2PM_PT\"] = df1[\"FGM_PT\"] - df1[\"3PM_PT\"]\n", 232 | "df1[\"2PA_PT\"] = df1[\"FGA_PT\"] - df1[\"3PA_PT\"]\n", 233 | "df1[\"2PP_PT\"] = df1[\"2PM_PT\"]/df1[\"2PA_PT\"]\n", 234 | "\n", 235 | "df1['name'] = df1['name'].str.replace('.', '')\n", 236 | "\n", 237 | "print(\"Table of basic and advanced player stats:\\n\\n\", df1)\n", 238 | "#print(df1.red_pos)\n", 239 | "#print(df1[df1.red_pos == 'nan'])\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 6, 245 | "metadata": { 246 | "scrolled": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "Table of basic BBall Reference stats:\n", 254 | "\n", 255 | " name year pos team GP GS\n", 256 | "0 AJ Hammons 2017 C DAL 22.0 0.0\n", 257 | "1 AJ Price 2010 PG IND 56.0 2.0\n", 258 | "2 AJ Price 2011 PG IND 50.0 0.0\n", 259 | "3 AJ Price 2012 PG IND 44.0 1.0\n", 260 | "4 AJ Price 2013 PG WAS 57.0 22.0\n", 261 | "... ... ... .. ... ... ...\n", 262 | "4550 Zhaire Smith 2019 SG PHI 6.0 2.0\n", 263 | "4551 Zhou Qi 2018 C HOU 18.0 0.0\n", 264 | "4552 Zoran Dragic 2015 SG TOT 16.0 1.0\n", 265 | "4553 Zydrunas Ilgauskas 2010 C CLE 64.0 6.0\n", 266 | "4554 Zydrunas Ilgauskas 2011 C MIA 72.0 51.0\n", 267 | "\n", 268 | "[4555 rows x 6 columns]\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "# Grab basic player stats from BBall Reference\n", 274 | "with open('../BBallRefStats.json') as f:\n", 275 | " json_data = json.load(f)\n", 276 | "\n", 277 | "df2 = pd.DataFrame(data=json_data, dtype=float)\n", 278 | "df2['name'] = df2['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n", 279 | "df2.name = df2.name.astype(str)\n", 280 | "df2.pos = df2.pos.astype(str)\n", 281 | "df2.team = df2.team.astype(str)\n", 282 | "df2.year = df2.year.astype(int)\n", 283 | "#df2 = df2.drop_duplicates(subset=['name', 'year'], keep=False)\n", 284 | "df2 = df2.groupby(['name', 'year', 'pos', 'team']).mean(numeric_only=True).reset_index()\n", 285 | "df2['name'] = df2['name'].str.replace('.', '')\n", 286 | "print(\"Table of basic BBall Reference stats:\\n\\n\", df2)\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 7, 292 | "metadata": { 293 | "scrolled": false 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "Table of advanced BBall Reference stats:\n", 301 | "\n", 302 | " name year FTR OWS DWS WS WS48 OBPM DBPM BPM \\\n", 303 | "0 AJ Hammons 2017 0.476 -0.2 0.2 0.0 -0.001 -7.5 2.0 -5.6 \n", 304 | "1 AJ Price 2010 0.212 0.4 0.8 1.2 0.065 0.2 -2.0 -1.8 \n", 305 | "2 AJ Price 2011 0.253 -0.4 0.7 0.3 0.020 -1.1 -2.3 -3.4 \n", 306 | "3 AJ Price 2012 0.201 0.2 0.5 0.7 0.063 -0.2 -1.7 -1.9 \n", 307 | "4 AJ Price 2013 0.150 1.0 1.2 2.2 0.084 -0.1 -1.7 -1.8 \n", 308 | "... ... ... ... ... ... ... ... ... ... ... \n", 309 | "5271 Zhou Qi 2019 0.000 0.0 0.0 0.0 1.261 22.1 -12.6 9.5 \n", 310 | "5272 Zoran Dragic 2015 0.167 -0.1 0.0 -0.1 -0.042 -2.5 -4.0 -6.5 \n", 311 | "5273 Zydrunas Ilgauskas 2010 0.231 0.5 2.0 2.5 0.088 -3.3 0.2 -3.2 \n", 312 | "5274 Zydrunas Ilgauskas 2011 0.144 1.0 1.9 2.9 0.122 -2.6 1.0 -1.5 \n", 313 | "5275 Zylan Cheatham 2020 0.000 -0.1 0.0 -0.1 -0.091 -8.8 0.2 -8.6 \n", 314 | "\n", 315 | " VORP \n", 316 | "0 -0.1 \n", 317 | "1 0.0 \n", 318 | "2 -0.3 \n", 319 | "3 0.0 \n", 320 | "4 0.1 \n", 321 | "... ... \n", 322 | "5271 0.0 \n", 323 | "5272 -0.1 \n", 324 | "5273 -0.4 \n", 325 | "5274 0.1 \n", 326 | "5275 -0.1 \n", 327 | "\n", 328 | "[5276 rows x 11 columns]\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "# Grab advanced player stats from BBall Reference\n", 334 | "with open('../BBallRefAdvancedStats.json') as f:\n", 335 | " json_data = json.load(f)\n", 336 | "\n", 337 | "df3 = pd.DataFrame(data=json_data, dtype=float)\n", 338 | "df3['name'] = df3['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n", 339 | "df3.name = df3.name.astype(str)\n", 340 | "df3.year = df3.year.astype(int)\n", 341 | "#df3 = df3.drop_duplicates(subset=['name', 'year'], keep=False)\n", 342 | "df3 = df3.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n", 343 | "df3['name'] = df3['name'].str.replace('.', '')\n", 344 | "print(\"Table of advanced BBall Reference stats:\\n\\n\", df3)\n" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 8, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "Table of advanced Hollinger stats:\n", 357 | "\n", 358 | " name year MPG TS ASTR TOR USG ORR DRR \\\n", 359 | "0 AJ Price 2010 15.4 0.530 19.2 10.7 21.3 1.5 9.7 \n", 360 | "1 AJ Price 2011 15.9 0.454 21.4 10.2 21.8 2.3 7.8 \n", 361 | "2 AJ Price 2012 12.9 0.454 28.0 10.4 17.7 2.6 9.4 \n", 362 | "3 AJ Price 2013 22.4 0.501 28.9 9.0 17.9 1.7 8.2 \n", 363 | "4 AJ Price 2014 3.5 0.469 19.4 10.5 22.7 1.1 10.2 \n", 364 | "... ... ... ... ... ... ... ... ... ... \n", 365 | "5320 Zhou Qi 2019 1.0 1.000 0.0 0.0 40.8 0.0 0.0 \n", 366 | "5321 Zoran Dragic 2015 4.7 0.435 11.8 11.8 19.5 8.0 4.7 \n", 367 | "5322 Zydrunas Ilgauskas 2010 20.9 0.491 8.1 10.6 17.1 10.7 19.6 \n", 368 | "5323 Zydrunas Ilgauskas 2011 15.9 0.531 6.2 12.5 14.2 11.9 17.5 \n", 369 | "5324 Zylan Cheatham 2020 10.3 0.400 20.0 30.0 10.9 3.3 17.5 \n", 370 | "\n", 371 | " REBR PER VA EWA ATR ODRR \n", 372 | "0 5.6 14.06 42.1 1.4 1.794393 0.154639 \n", 373 | "1 5.0 10.74 -0.7 0.0 2.098039 0.294872 \n", 374 | "2 6.0 11.54 6.2 0.2 2.692308 0.276596 \n", 375 | "3 4.9 12.45 27.7 0.9 3.211111 0.207317 \n", 376 | "4 5.5 9.72 0.0 0.0 1.847619 0.107843 \n", 377 | "... ... ... ... ... ... ... \n", 378 | "5320 0.0 80.61 0.0 0.0 NaN NaN \n", 379 | "5321 6.3 8.21 -2.6 -0.1 1.000000 1.702128 \n", 380 | "5322 15.3 11.99 27.8 0.9 0.764151 0.545918 \n", 381 | "5323 14.9 12.84 38.3 1.3 0.496000 0.680000 \n", 382 | "5324 10.2 0.23 -4.8 -0.2 0.666667 0.188571 \n", 383 | "\n", 384 | "[5325 rows x 15 columns]\n" 385 | ] 386 | } 387 | ], 388 | "source": [ 389 | "# Grab advanced player stats from BBall Reference\n", 390 | "with open('../Hollinger.json') as f:\n", 391 | " json_data = json.load(f)\n", 392 | "\n", 393 | "df4 = pd.DataFrame(data=json_data, dtype=float)\n", 394 | "df4['name'] = df4['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n", 395 | "df4.name = df4.name.astype(str)\n", 396 | "df4.year = df4.year.astype(int)\n", 397 | "df4 = df4.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n", 398 | "\n", 399 | "# Create variables for assist-to-turnover ratio\n", 400 | "# and offensive-to-defensive rebounding ratio\n", 401 | "df4[\"ATR\"] = df4[\"ASTR\"]/df4[\"TOR\"]\n", 402 | "df4[\"ODRR\"] = df4[\"ORR\"]/df4[\"DRR\"]\n", 403 | "\n", 404 | "df4['name'] = df4['name'].str.replace('.', '')\n", 405 | "\n", 406 | "print(\"Table of advanced Hollinger stats:\\n\\n\", df4)\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 9, 412 | "metadata": { 413 | "scrolled": false 414 | }, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "Table of play-type stats:\n", 421 | "\n", 422 | " name year OFF_POSS_TR OFF_FREQ_TR OFF_PPP_TR \\\n", 423 | "0 Aaron Brooks 2016 0.9 0.104 0.95 \n", 424 | "1 Aaron Brooks 2017 0.6 0.102 0.72 \n", 425 | "2 Aaron Gordon 2016 1.6 0.174 1.08 \n", 426 | "3 Aaron Gordon 2017 2.6 0.199 1.03 \n", 427 | "4 Aaron Gordon 2018 3.7 0.201 0.96 \n", 428 | "... ... ... ... ... ... \n", 429 | "2028 Devin Robinson 2019 0.0 0.000 0.00 \n", 430 | "2029 Jared Cunningham 2016 0.0 0.000 0.00 \n", 431 | "2030 Jarrod Uthoff 2017 0.0 0.000 0.00 \n", 432 | "2031 Alex Stepheson 2016 0.0 0.000 0.00 \n", 433 | "2032 Jack Cooley 2018 0.0 0.000 0.00 \n", 434 | "\n", 435 | " OFF_PTS_TR OFF_FGM_TR OFF_FGA_TR OFF_FGP_TR OFF_EFGP_TR ... \\\n", 436 | "0 0.8 0.4 0.7 0.392 0.520 ... \n", 437 | "1 0.4 0.3 0.4 0.333 0.463 ... \n", 438 | "2 1.8 0.5 1.1 0.558 0.581 ... \n", 439 | "3 2.7 0.9 1.9 0.536 0.565 ... \n", 440 | "4 3.6 1.5 2.8 0.473 0.527 ... \n", 441 | "... ... ... ... ... ... ... \n", 442 | "2028 0.0 0.0 0.0 0.000 0.000 ... \n", 443 | "2029 0.0 0.0 0.0 0.000 0.000 ... \n", 444 | "2030 0.0 0.0 0.0 0.000 0.000 ... \n", 445 | "2031 0.0 0.0 0.0 0.000 0.000 ... \n", 446 | "2032 0.0 0.0 0.0 0.000 0.000 ... \n", 447 | "\n", 448 | " OFF_FT_FREQ_MISC OFF_TO_FREQ_MISC OFF_SF_FREQ_MISC \\\n", 449 | "0 0.207 0.379 0.034 \n", 450 | "1 0.095 0.381 0.000 \n", 451 | "2 0.283 0.358 0.057 \n", 452 | "3 0.195 0.415 0.049 \n", 453 | "4 0.195 0.488 0.098 \n", 454 | "... ... ... ... \n", 455 | "2028 0.000 0.000 0.000 \n", 456 | "2029 0.000 0.000 0.000 \n", 457 | "2030 0.000 0.000 0.000 \n", 458 | "2031 0.000 0.000 0.000 \n", 459 | "2032 0.000 0.000 0.000 \n", 460 | "\n", 461 | " OFF_AND1_FREQ_MISC OFF_SCORE_FREQ_MISC OFF_PERC_MISC AVG_OPP_FGP \\\n", 462 | "0 0.00 0.276 0.311 0.472429 \n", 463 | "1 0.00 0.190 0.382 0.394286 \n", 464 | "2 0.02 0.377 0.729 0.365571 \n", 465 | "3 0.00 0.366 0.755 0.406857 \n", 466 | "4 0.07 0.268 0.547 0.366429 \n", 467 | "... ... ... ... ... \n", 468 | "2028 0.00 0.000 0.000 0.033000 \n", 469 | "2029 0.00 0.000 0.000 0.031429 \n", 470 | "2030 0.00 0.000 0.000 0.081429 \n", 471 | "2031 0.00 0.000 0.000 0.000000 \n", 472 | "2032 0.00 0.000 0.000 0.000000 \n", 473 | "\n", 474 | " WAVG_OPP_FGP AVG_OPP_PPH WAVG_OPP_PPH \n", 475 | "0 0.397859 1.082857 90.242180 \n", 476 | "1 0.431259 0.894286 99.282497 \n", 477 | "2 0.362971 0.842857 83.828897 \n", 478 | "3 0.394403 0.902857 87.363439 \n", 479 | "4 0.394326 0.781429 84.444848 \n", 480 | "... ... ... ... \n", 481 | "2028 0.231000 0.104286 73.000000 \n", 482 | "2029 0.220000 0.071429 50.000000 \n", 483 | "2030 0.570000 0.224286 157.000000 \n", 484 | "2031 0.000000 0.000000 0.000000 \n", 485 | "2032 0.000000 0.000000 0.000000 \n", 486 | "\n", 487 | "[2033 rows x 258 columns]\n" 488 | ] 489 | }, 490 | { 491 | "name": "stderr", 492 | "output_type": "stream", 493 | "text": [ 494 | "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in true_divide\n", 495 | " \n", 496 | "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:18: RuntimeWarning: invalid value encountered in true_divide\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "# Grab player basic and advanced stats from NBA.com\n", 502 | "df5 = pd.read_csv('../NBAPlayTypeStats.csv', index_col=0)\n", 503 | "df5 = ConvertDataFrame(df5, ['name'], ['year'])\n", 504 | "#print(df5[df5.duplicated(subset=['name', 'year'], keep='first')])\n", 505 | "#print(df5[df5.duplicated(subset=['name', 'year'], keep='last')])\n", 506 | "\n", 507 | "# Create new defensive variables from existing play-type data\n", 508 | "def_fgp_cols = [col for col in df5.columns if 'DEF_FGP_' in col]\n", 509 | "def_freq_cols = [col for col in df5.columns if 'DEF_FREQ_' in col]\n", 510 | "def_pph_cols = [col for col in df5.columns if 'DEF_PPP_' in col]\n", 511 | "# Mean field goal percentage using mean of all play-type columns\n", 512 | "df5['AVG_OPP_FGP'] = df5[def_fgp_cols].mean(axis=1)\n", 513 | "# Weighted average which takes into account relative frequency of defensive play type\n", 514 | "df5['WAVG_OPP_FGP'] = (df5[def_fgp_cols].values*df5[def_freq_cols].values).sum(axis=1)/df5[def_freq_cols].values.sum(axis=1)\n", 515 | "# Mean points per 100 possession using mean all play-type columns\n", 516 | "df5['AVG_OPP_PPH'] = df5[def_pph_cols].mean(axis=1)\n", 517 | "# Weighted average PPH which takes into account relative frequency of defensive play type\n", 518 | "df5['WAVG_OPP_PPH'] = (df5[def_pph_cols].values*100.*df5[def_freq_cols].values).sum(axis=1)/df5[def_freq_cols].values.sum(axis=1)\n", 519 | "\n", 520 | "# Replace NaNs with zeros\n", 521 | "df5 = df5.fillna(0)\n", 522 | "\n", 523 | "df5['name'] = df5['name'].str.replace('.', '')\n", 524 | "\n", 525 | "print(\"Table of play-type stats:\\n\\n\", df5)\n" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 10, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "Table of ESPN real plus-minus stats:\n", 538 | "\n", 539 | " name year ORPM DRPM RPM RPM_WINS\n", 540 | "0 AJ Price 2014 0.03 -0.12 -0.09 0.14\n", 541 | "1 AJ Price 2015 -0.75 -2.42 -3.17 -0.13\n", 542 | "2 AJ Hammons 2017 -2.77 1.27 -1.50 0.16\n", 543 | "3 Aaron Brooks 2014 0.70 -3.84 -3.14 -0.79\n", 544 | "4 Aaron Brooks 2015 1.25 -2.33 -1.08 1.46\n", 545 | "... ... ... ... ... ... ...\n", 546 | "3218 Zaza Pachulia 2019 -2.87 3.06 0.19 1.76\n", 547 | "3219 Zhaire Smith 2019 -1.51 -0.58 -2.09 0.07\n", 548 | "3220 Zhou Qi 2019 -1.87 0.90 -0.97 0.00\n", 549 | "3221 Zoran Dragic 2015 -1.52 -1.92 -3.44 -0.05\n", 550 | "3222 Zylan Cheatham 2020 -1.04 -0.74 -1.77 -0.02\n", 551 | "\n", 552 | "[3223 rows x 6 columns]\n" 553 | ] 554 | } 555 | ], 556 | "source": [ 557 | "# Grab advanced real plus-minus stats from ESPN\n", 558 | "with open('../ESPN_RealPM.json') as f:\n", 559 | " json_data = json.load(f)\n", 560 | "\n", 561 | "df6 = pd.DataFrame(data=json_data, dtype=float)\n", 562 | "df6['name'] = df6['name'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode())\n", 563 | "df6.name = df6.name.astype(str)\n", 564 | "df6.year = df6.year.astype(int)\n", 565 | "df6 = df6.groupby(['name', 'year']).mean(numeric_only=True).reset_index()\n", 566 | "\n", 567 | "df6['name'] = df6['name'].str.replace('.', '')\n", 568 | "\n", 569 | "print(\"Table of ESPN real plus-minus stats:\\n\\n\", df6)\n" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "# Merge All Dataframes and Rearrange Columns" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 11, 582 | "metadata": { 583 | "scrolled": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "# Merge all dataframes\n", 588 | "df = pd.merge(df_bios, df_comb, on=['name'], how='outer')\n", 589 | "df = pd.merge(df, df1, on=['name'])\n", 590 | "df = pd.merge(df, df2, on=['name', 'year'])\n", 591 | "cols_to_move = ['pos', 'team', 'year', 'GP', 'GS']\n", 592 | "#col_to_move_after = 'wingspan' # only relevant for players from draft combine\n", 593 | "col_to_move_after = 'name'\n", 594 | "befbef_cols = [c for c in df if df.columns.get_loc(c)<=df.columns.get_loc(col_to_move_after) and c not in cols_to_move]\n", 595 | "before_cols = [c for c in df if df.columns.get_loc(c)>df.columns.get_loc(col_to_move_after) and df.columns.get_loc(c)df.columns.get_loc(cols_to_move[-1]) and c not in cols_to_move]\n", 597 | "df = df[befbef_cols+cols_to_move+before_cols+after_cols]\n", 598 | "df = pd.merge(df, df3, on=['name', 'year'])\n", 599 | "df = pd.merge(df, df4, on=['name', 'year'])\n", 600 | "df = pd.merge(df, df5, on=['name', 'year'])\n", 601 | "df = pd.merge(df, df6, on=['name', 'year'])\n" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "# Create New Variables and Resolve Infs/NaNs" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 12, 614 | "metadata": {}, 615 | "outputs": [], 616 | "source": [ 617 | "# Column modifications\n", 618 | "df = df.replace([np.inf, -np.inf], np.nan)\n", 619 | "\n", 620 | "# Convert some per-36 min stats into per-game stats\n", 621 | "df['2PA_PG'] = df['2PA_PT']*df.MPG/36.\n", 622 | "df['3PA_PG'] = df['3PA_PT']*df.MPG/36.\n", 623 | "df['FGA_PG'] = df.FGA_PT*df.MPG/36.\n", 624 | "###\n", 625 | "df[\"2PR\"] = df[\"2PA_PH\"]/df[\"FGA_PH\"]\n", 626 | "df[\"3PR\"] = df[\"3PA_PH\"]/df[\"FGA_PH\"]\n", 627 | "\n", 628 | "# Create new variables for frequency of various shot types\n", 629 | "df['FG_FREQ_RIM'] = (df.FGA_RA)/df.FGA_PG # restricted area\n", 630 | "df['FG_FREQ_MR_AND_PT'] = (df.FGA_MR + df.FGA_NONRA)/df.FGA_PG # combined paint and midrange\n", 631 | "df['FG_FREQ_MR'] = df.FGA_MR/df.FGA_PG\n", 632 | "df['FG_FREQ_CORNERS'] = (df.FGA_LC + df.FGA_RC)/df.FGA_PG\n", 633 | "df['FG_FREQ_AB'] = df.FGA_AB/df.FGA_PG\n", 634 | "df['FG_FREQ_01DRIB'] = (df['FGA_0DRIB'] + df['FGA_1DRIB'])/df.FGA_PG\n", 635 | "df['FG_FREQ_GT1DRIB'] = (df['FGA_2DRIB'] + df['FGA_36DRIB'] + df['FGA_GT7DRIB'])/df.FGA_PG\n", 636 | "df['FG_FREQ_CANDS'] = df['FGA_CANDS']/df.FGA_PG\n", 637 | "\n", 638 | "df['FG_FREQ_05FT'] = (df.FGA_05FT)/df.FGA_PG\n", 639 | "df['FG_FREQ_59FT'] = (df.FGA_59FT)/df.FGA_PG\n", 640 | "df['FG_FREQ_1014FT'] = (df.FGA_1014FT)/df.FGA_PG\n", 641 | "df['FG_FREQ_1519FT'] = (df.FGA_1519FT)/df.FGA_PG\n", 642 | "df['FG_FREQ_2024FT'] = (df.FGA_2024FT)/df.FGA_PG\n", 643 | "df['FG_FREQ_GT24FT'] = (df.FGA_GT24FT)/df.FGA_PG\n", 644 | "\n", 645 | "# Replace NaN values with zeros\n", 646 | "df[\"ATR\"].fillna(0, inplace=True)\n", 647 | "df[\"ODRR\"].fillna(0, inplace=True)\n", 648 | "df[\"2PP_PH\"].fillna(0, inplace=True)\n", 649 | "df[\"2PP_PT\"].fillna(0, inplace=True)\n", 650 | "df[\"2PR\"].fillna(0, inplace=True)\n", 651 | "df[\"3PR\"].fillna(0, inplace=True)\n", 652 | "df[\"WAVG_OPP_FGP\"].fillna(0, inplace=True)\n", 653 | "df[\"FG_FREQ_RIM\"].fillna(0, inplace=True)\n", 654 | "df[\"FG_FREQ_MR_AND_PT\"].fillna(0, inplace=True)\n", 655 | "df[\"FG_FREQ_MR\"].fillna(0, inplace=True)\n", 656 | "df[\"FG_FREQ_CORNERS\"].fillna(0, inplace=True)\n", 657 | "df[\"FG_FREQ_AB\"].fillna(0, inplace=True)\n", 658 | "df[\"FG_FREQ_01DRIB\"].fillna(0, inplace=True)\n", 659 | "df[\"FG_FREQ_GT1DRIB\"].fillna(0, inplace=True)\n", 660 | "df[\"FG_FREQ_CANDS\"].fillna(0, inplace=True)\n", 661 | "df['FG_FREQ_05FT'].fillna(0, inplace=True)\n", 662 | "df['FG_FREQ_59FT'].fillna(0, inplace=True)\n", 663 | "df['FG_FREQ_1014FT'].fillna(0, inplace=True)\n", 664 | "df['FG_FREQ_1519FT'].fillna(0, inplace=True)\n", 665 | "df['FG_FREQ_2024FT'].fillna(0, inplace=True)\n", 666 | "df['FG_FREQ_GT24FT'].fillna(0, inplace=True)\n", 667 | "\n", 668 | "# Match team name acronyms match between BBall Reference and NBA.com\n", 669 | "df['team'] = df['team'].replace('BRK', 'BKN')\n", 670 | "df['team'] = df['team'].replace('CHO', 'CHA')\n", 671 | "df['team'] = df['team'].replace('PHO', 'PHX')\n" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 13, 677 | "metadata": {}, 678 | "outputs": [ 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "Combined Table of BBall Player Stats:\n", 684 | "\n", 685 | " name pos team year GP GS height weight college \\\n", 686 | "0 Andre Miller PG TOT 2016 39.0 4.0 74.0 200.0 Utah \n", 687 | "1 Dirk Nowitzki PF DAL 2016 75.0 75.0 83.0 237.0 None \n", 688 | "2 Dirk Nowitzki PF DAL 2017 54.0 54.0 83.0 237.0 None \n", 689 | "3 Dirk Nowitzki C DAL 2018 77.0 77.0 83.0 237.0 None \n", 690 | "4 Dirk Nowitzki PF DAL 2019 51.0 20.0 83.0 237.0 None \n", 691 | "... ... .. ... ... ... ... ... ... ... \n", 692 | "1742 Rui Hachimura PF WAS 2020 25.0 25.0 80.0 230.0 Gonzaga \n", 693 | "1743 Sekou Doumbouya SF DET 2020 14.0 7.0 80.0 230.0 None \n", 694 | "1744 Terence Davis SG TOR 2020 39.0 1.0 76.0 201.0 None \n", 695 | "1745 Ty Jerome PG PHX 2020 10.0 0.0 77.0 195.0 Virginia \n", 696 | "1746 Tyler Herro SG MIA 2020 38.0 3.0 77.0 195.0 Kentucky \n", 697 | "\n", 698 | " country ... FG_FREQ_AB FG_FREQ_01DRIB FG_FREQ_GT1DRIB FG_FREQ_CANDS \\\n", 699 | "0 USA ... 0.000000 0.000000 0.000000 0.073514 \n", 700 | "1 Germany ... 0.033812 0.000000 0.000000 0.540997 \n", 701 | "2 Germany ... 0.023923 0.853270 0.000000 0.606061 \n", 702 | "3 Germany ... 0.030364 0.890688 0.000000 0.668016 \n", 703 | "4 Germany ... 0.054299 0.923077 0.000000 0.733032 \n", 704 | "... ... ... ... ... ... ... \n", 705 | "1742 Japan ... 0.057534 0.682192 0.304110 0.230137 \n", 706 | "1743 Guinea ... 0.196253 0.642284 0.000000 0.303301 \n", 707 | "1744 USA ... 0.145653 0.710059 0.291306 0.473373 \n", 708 | "1745 USA ... 0.020353 0.508820 0.488467 0.386703 \n", 709 | "1746 USA ... 0.076567 0.467908 0.535967 0.289252 \n", 710 | "\n", 711 | " FG_FREQ_05FT FG_FREQ_59FT FG_FREQ_1014FT FG_FREQ_1519FT \\\n", 712 | "0 0.514601 0.110272 0.183786 0.110272 \n", 713 | "1 0.087912 0.054100 0.202874 0.290786 \n", 714 | "2 0.071770 0.047847 0.151515 0.334928 \n", 715 | "3 0.050607 0.020243 0.172065 0.222672 \n", 716 | "4 0.013575 0.054299 0.122172 0.190045 \n", 717 | "... ... ... ... ... \n", 718 | "1742 0.468493 0.073973 0.123288 0.147945 \n", 719 | "1743 0.410348 0.089206 0.035682 0.017841 \n", 720 | "1744 0.364133 0.036413 0.018207 0.018207 \n", 721 | "1745 0.183175 0.081411 0.244233 0.040706 \n", 722 | "1746 0.161641 0.059552 0.110596 0.144626 \n", 723 | "\n", 724 | " FG_FREQ_2024FT FG_FREQ_GT24FT \n", 725 | "0 0.036757 0.036757 \n", 726 | "1 0.250211 0.311074 \n", 727 | "2 0.271132 0.311005 \n", 728 | "3 0.222672 0.445344 \n", 729 | "4 0.217195 0.542986 \n", 730 | "... ... ... \n", 731 | "1742 0.098630 0.147945 \n", 732 | "1743 0.124888 0.338983 \n", 733 | "1744 0.236686 0.564406 \n", 734 | "1745 0.101764 0.386703 \n", 735 | "1746 0.195671 0.459401 \n", 736 | "\n", 737 | "[1747 rows x 732 columns]\n" 738 | ] 739 | } 740 | ], 741 | "source": [ 742 | "# Write complete set of combined stats to .csv file and print\n", 743 | "df.to_csv(\"../CompleteNBAPlayerStats.csv\", index=False)\n", 744 | "print(\"Combined Table of BBall Player Stats:\\n\\n\", df)\n" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [] 753 | } 754 | ], 755 | "metadata": { 756 | "kernelspec": { 757 | "display_name": "Python 3", 758 | "language": "python", 759 | "name": "python3" 760 | }, 761 | "language_info": { 762 | "codemirror_mode": { 763 | "name": "ipython", 764 | "version": 3 765 | }, 766 | "file_extension": ".py", 767 | "mimetype": "text/x-python", 768 | "name": "python", 769 | "nbconvert_exporter": "python", 770 | "pygments_lexer": "ipython3", 771 | "version": "3.7.3" 772 | }, 773 | "toc": { 774 | "base_numbering": 1, 775 | "nav_menu": {}, 776 | "number_sections": true, 777 | "sideBar": true, 778 | "skip_h1_title": false, 779 | "title_cell": "Table of Contents", 780 | "title_sidebar": "Contents", 781 | "toc_cell": true, 782 | "toc_position": { 783 | "height": "calc(100% - 180px)", 784 | "left": "10px", 785 | "top": "150px", 786 | "width": "165px" 787 | }, 788 | "toc_section_display": true, 789 | "toc_window_display": true 790 | } 791 | }, 792 | "nbformat": 4, 793 | "nbformat_minor": 2 794 | } 795 | -------------------------------------------------------------------------------- /Data_Scraping/Generate_Matchup_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import time\n", 21 | "import sys\n", 22 | "import os\n", 23 | "import pandas as pd\n", 24 | "from functools import reduce\n", 25 | "from operator import itemgetter\n", 26 | "import itertools\n", 27 | "import re\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import matplotlib\n", 30 | "from matplotlib.pyplot import cm\n", 31 | "from fuzzywuzzy import fuzz \n", 32 | "from fuzzywuzzy import process" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 10, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Use the player lineups recorded for each possession in \n", 42 | "# play-by-play data to calculate the cumulative plus-minus\n", 43 | "# for each lineup matchup \"stint\". A stint is defined as an \n", 44 | "# offensive or defensive sequence of possessions.\n", 45 | "def ExtractPlusMinus(df):\n", 46 | " ht_pm = []\n", 47 | " vt_pm = []\n", 48 | " ht_poss = []\n", 49 | " vt_poss = []\n", 50 | " pts_scored = []\n", 51 | " for num,game in df.groupby(['game', 'year'], sort=False, as_index=False):\n", 52 | " ht_plus_minus = 0\n", 53 | " vt_plus_minus = 0\n", 54 | " ht_stint_poss = 0\n", 55 | " vt_stint_poss = 0\n", 56 | " ht_last_margin = 0\n", 57 | " prev_hl = []\n", 58 | " prev_vl = []\n", 59 | " for ht,vt,hmarg in zip(game.ht_lineup,game.vt_lineup,game.ht_margin):\n", 60 | " hta = sorted(ht.split(','))\n", 61 | " vta = sorted(vt.split(','))\n", 62 | " # If either home or away lineup has changed\n", 63 | " if hta != prev_hl or vta != prev_vl:\n", 64 | " #print('New home lineup:', hta)\n", 65 | " #print('New visiting lineup:', vta)\n", 66 | " ht_plus_minus = 0\n", 67 | " vt_plus_minus = 0\n", 68 | " ht_stint_poss = 0\n", 69 | " vt_stint_poss = 0\n", 70 | " \n", 71 | " pts_scored.append(abs(hmarg-ht_last_margin))\n", 72 | " ht_plus_minus += (hmarg-ht_last_margin)\n", 73 | " vt_plus_minus += -(hmarg-ht_last_margin)\n", 74 | " #print(ht_last_margin, hmarg, ht_plus_minus, pts_scored)\n", 75 | " ht_pm.append(ht_plus_minus)\n", 76 | " vt_pm.append(vt_plus_minus)\n", 77 | "\n", 78 | " ht_stint_poss += 1\n", 79 | " vt_stint_poss += 1\n", 80 | " ht_poss.append(ht_stint_poss)\n", 81 | " vt_poss.append(vt_stint_poss)\n", 82 | " \n", 83 | " prev_hl = hta\n", 84 | " prev_vl = vta\n", 85 | " ht_last_margin = hmarg\n", 86 | " \n", 87 | " df['ht_stint_pm'] = ht_pm\n", 88 | " df['vt_stint_pm'] = vt_pm\n", 89 | " df['ht_stint_poss'] = ht_poss\n", 90 | " df['vt_stint_poss'] = vt_poss\n", 91 | " df['points_scored'] = pts_scored\n", 92 | " return df\n", 93 | "\n", 94 | "# Grab the data corresponding to the first possession of each lineup matchup stint\n", 95 | "def GetMatchupStarts(df):\n", 96 | " dfhead = df.groupby((df[['game', 'year', 'ht_lineup', 'vt_lineup']] != df[['game','year','ht_lineup','vt_lineup']].shift(1)).any(axis=1).cumsum()).head(1).reset_index(drop=True)\n", 97 | " return dfhead\n", 98 | "\n", 99 | "# Grab the data corresponding to the last possession of each lineup matchup stint\n", 100 | "# (includes the final plus-minus of the lineup matchup)\n", 101 | "def GetMatchupEnds(df):\n", 102 | " dftail = df.groupby((df[['game', 'year', 'ht_lineup', 'vt_lineup']] != df[['game','year','ht_lineup','vt_lineup']].shift(1)).any(axis=1).cumsum()).tail(1).reset_index(drop=True)\n", 103 | " return dftail\n", 104 | "\n", 105 | "# Grab the data corresponding to final row for EACH off/def possession\n", 106 | "def GetPossessionEnds(df):\n", 107 | " dftail = df.groupby((df[['game', 'year', 'ht_poss']] != df[['game','year','ht_poss']].shift(1)).any(axis=1).cumsum()).tail(1).reset_index(drop=True)\n", 108 | " return dftail\n", 109 | "\n", 110 | "# DEPRECATED -- extremely slow method\n", 111 | "#def PlayerPlusMinus(df, player):\n", 112 | "# df_ht = df[df.ht_lineup.str.contains(player)].time_sec.values\n", 113 | "# df_hpm = df[df.ht_lineup.str.contains(player)].ht_stint_pm.values\n", 114 | "# df_vt = df[df.vt_lineup.str.contains(player)].time_sec.values\n", 115 | "# df_vpm = df[df.vt_lineup.str.contains(player)].vt_stint_pm.values\n", 116 | "# return df_ht, df_hpm, df_vt, df_vpm\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "# Load Play-By-Play Data" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | " year game home_team vis_team Q time_sec ht_score vt_score \\\n", 136 | "0 2017 1 CLE NYK 1 0.0 0 0 \n", 137 | "1 2017 1 CLE NYK 1 20.0 0 2 \n", 138 | "2 2017 1 CLE NYK 1 34.0 0 2 \n", 139 | "3 2017 1 CLE NYK 1 37.0 0 2 \n", 140 | "4 2017 1 CLE NYK 1 44.0 0 2 \n", 141 | "... ... ... ... ... .. ... ... ... \n", 142 | "1675613 2019 1230 POR SAC 4 2859.0 136 131 \n", 143 | "1675614 2019 1230 POR SAC 4 2859.0 136 131 \n", 144 | "1675615 2019 1230 POR SAC 4 2866.0 136 131 \n", 145 | "1675616 2019 1230 POR SAC 4 2869.0 136 131 \n", 146 | "1675617 2019 1230 POR SAC 4 2880.0 136 131 \n", 147 | "\n", 148 | " ht_margin vt_margin ... ht_flagrants vt_flagrants ht_2PTA \\\n", 149 | "0 0 0 ... 0 0 0 \n", 150 | "1 -2 2 ... 0 0 0 \n", 151 | "2 -2 2 ... 0 0 1 \n", 152 | "3 -2 2 ... 0 0 1 \n", 153 | "4 -2 2 ... 0 0 1 \n", 154 | "... ... ... ... ... ... ... \n", 155 | "1675613 5 -5 ... 0 0 64 \n", 156 | "1675614 5 -5 ... 0 0 64 \n", 157 | "1675615 5 -5 ... 0 0 64 \n", 158 | "1675616 5 -5 ... 0 0 64 \n", 159 | "1675617 5 -5 ... 0 0 64 \n", 160 | "\n", 161 | " vt_2PTA ht_3PTA vt_3PTA ht_2PTM vt_2PTM ht_3PTM vt_3PTM \n", 162 | "0 0 0 0 0 0 0 0 \n", 163 | "1 1 0 0 0 1 0 0 \n", 164 | "2 1 0 0 0 1 0 0 \n", 165 | "3 1 0 0 0 1 0 0 \n", 166 | "4 2 0 0 0 1 0 0 \n", 167 | "... ... ... ... ... ... ... ... \n", 168 | "1675613 51 24 42 38 31 14 18 \n", 169 | "1675614 51 24 42 38 31 14 18 \n", 170 | "1675615 51 24 43 38 31 14 18 \n", 171 | "1675616 51 24 43 38 31 14 18 \n", 172 | "1675617 51 24 43 38 31 14 18 \n", 173 | "\n", 174 | "[1675618 rows x 36 columns]\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "# Load and concatenate all play-by-play data over the last 3 full seasons\n", 180 | "df1 = pd.read_csv('../NBA_PBP_Data_2016_2017.csv', index_col=0)\n", 181 | "df2 = pd.read_csv('../NBA_PBP_Data_2017_2018.csv', index_col=0)\n", 182 | "df3 = pd.read_csv('../NBA_PBP_Data_2018_2019.csv', index_col=0)\n", 183 | "\n", 184 | "df = pd.concat([df1, df2, df3], ignore_index=True)\n", 185 | "\n", 186 | "df.fillna('', inplace=True)\n", 187 | "\n", 188 | "print(df)\n" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "# Extract +/- Information for Lineup Matchups and Apply Cuts" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 6, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | " year game home_team vis_team Q time_sec ht_score vt_score \\\n", 208 | "0 2017 1 CLE NYK 1 0.0 0 0 \n", 209 | "1 2017 1 CLE NYK 1 20.0 0 2 \n", 210 | "2 2017 1 CLE NYK 1 34.0 0 2 \n", 211 | "3 2017 1 CLE NYK 1 37.0 0 2 \n", 212 | "4 2017 1 CLE NYK 1 44.0 0 2 \n", 213 | "... ... ... ... ... .. ... ... ... \n", 214 | "1675613 2019 1230 POR SAC 4 2859.0 136 131 \n", 215 | "1675614 2019 1230 POR SAC 4 2859.0 136 131 \n", 216 | "1675615 2019 1230 POR SAC 4 2866.0 136 131 \n", 217 | "1675616 2019 1230 POR SAC 4 2869.0 136 131 \n", 218 | "1675617 2019 1230 POR SAC 4 2880.0 136 131 \n", 219 | "\n", 220 | " ht_margin vt_margin ... vt_3PTA ht_2PTM vt_2PTM ht_3PTM vt_3PTM \\\n", 221 | "0 0 0 ... 0 0 0 0 0 \n", 222 | "1 -2 2 ... 0 0 1 0 0 \n", 223 | "2 -2 2 ... 0 0 1 0 0 \n", 224 | "3 -2 2 ... 0 0 1 0 0 \n", 225 | "4 -2 2 ... 0 0 1 0 0 \n", 226 | "... ... ... ... ... ... ... ... ... \n", 227 | "1675613 5 -5 ... 42 38 31 14 18 \n", 228 | "1675614 5 -5 ... 42 38 31 14 18 \n", 229 | "1675615 5 -5 ... 43 38 31 14 18 \n", 230 | "1675616 5 -5 ... 43 38 31 14 18 \n", 231 | "1675617 5 -5 ... 43 38 31 14 18 \n", 232 | "\n", 233 | " ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss points_scored \n", 234 | "0 0 0 1 1 0 \n", 235 | "1 -2 2 2 2 2 \n", 236 | "2 -2 2 3 3 0 \n", 237 | "3 -2 2 4 4 0 \n", 238 | "4 -2 2 5 5 0 \n", 239 | "... ... ... ... ... ... \n", 240 | "1675613 0 0 41 41 0 \n", 241 | "1675614 0 0 42 42 0 \n", 242 | "1675615 0 0 43 43 0 \n", 243 | "1675616 0 0 44 44 0 \n", 244 | "1675617 0 0 45 45 0 \n", 245 | "\n", 246 | "[1675618 rows x 41 columns]\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# Comb over the PBP dataframe and calculate +/- data \n", 252 | "# for each new lineup stint, for each game\n", 253 | "df_new = ExtractPlusMinus(df)\n", 254 | "print(df_new)\n" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 7, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | " year game home_team vis_team Q time_sec ht_score vt_score \\\n", 267 | "0 2017 1 CLE NYK 1 0.0 0 0 \n", 268 | "1 2017 1 CLE NYK 1 20.0 0 2 \n", 269 | "2 2017 1 CLE NYK 1 34.0 0 2 \n", 270 | "3 2017 1 CLE NYK 1 37.0 0 2 \n", 271 | "4 2017 1 CLE NYK 1 44.0 0 2 \n", 272 | "... ... ... ... ... .. ... ... ... \n", 273 | "1675613 2019 1230 POR SAC 4 2859.0 136 131 \n", 274 | "1675614 2019 1230 POR SAC 4 2859.0 136 131 \n", 275 | "1675615 2019 1230 POR SAC 4 2866.0 136 131 \n", 276 | "1675616 2019 1230 POR SAC 4 2869.0 136 131 \n", 277 | "1675617 2019 1230 POR SAC 4 2880.0 136 131 \n", 278 | "\n", 279 | " ht_margin vt_margin ... vt_3PTA ht_2PTM vt_2PTM ht_3PTM vt_3PTM \\\n", 280 | "0 0 0 ... 0 0 0 0 0 \n", 281 | "1 -2 2 ... 0 0 1 0 0 \n", 282 | "2 -2 2 ... 0 0 1 0 0 \n", 283 | "3 -2 2 ... 0 0 1 0 0 \n", 284 | "4 -2 2 ... 0 0 1 0 0 \n", 285 | "... ... ... ... ... ... ... ... ... \n", 286 | "1675613 5 -5 ... 42 38 31 14 18 \n", 287 | "1675614 5 -5 ... 42 38 31 14 18 \n", 288 | "1675615 5 -5 ... 43 38 31 14 18 \n", 289 | "1675616 5 -5 ... 43 38 31 14 18 \n", 290 | "1675617 5 -5 ... 43 38 31 14 18 \n", 291 | "\n", 292 | " ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss points_scored \n", 293 | "0 0 0 1 1 0 \n", 294 | "1 -2 2 2 2 2 \n", 295 | "2 -2 2 3 3 0 \n", 296 | "3 -2 2 4 4 0 \n", 297 | "4 -2 2 5 5 0 \n", 298 | "... ... ... ... ... ... \n", 299 | "1675613 0 0 41 41 0 \n", 300 | "1675614 0 0 42 42 0 \n", 301 | "1675615 0 0 43 43 0 \n", 302 | "1675616 0 0 44 44 0 \n", 303 | "1675617 0 0 45 45 0 \n", 304 | "\n", 305 | "[1591826 rows x 41 columns]\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "# Cut out PBP rows where there are not 5 players in both the home and away lineup\n", 311 | "df_new = df_new[df_new.ht_lineup.str.split(',').str.len() == 5]\n", 312 | "df_new = df_new[df_new.vt_lineup.str.split(',').str.len() == 5]\n", 313 | "\n", 314 | "df_new.to_csv('../NBA_PBP_Data_PlusMinus.csv')\n", 315 | "print(df_new)\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 15, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/html": [ 326 | "
\n", 327 | "\n", 340 | "\n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | "
yeargamehome_teamvis_teamtime_secht_scorevt_scorepoints_scoredht_playvt_playht_poss
020171CLENYK20.0022Rose 1' Driving Layup (2 PTS) (Noah 1 AST)DEF
120171CLENYK37.0020Noah REBOUND (Off:0 Def:1)OFF
220171CLENYK45.0042Porzingis 2' Tip Layup Shot (2 PTS)DEF
320171CLENYK61.0242James 11' Jump Shot (2 PTS) (Irving 1 AST)OFF
420171CLENYK62.0240Rose Out of Bounds Lost Ball Turnover (P1.T1)DEF
....................................
72076620191230PORSAC2832.01351312Simons 1' Tip Layup Shot (36 PTS)OFF
72076720191230PORSAC2846.01351310Layman REBOUND (Off:0 Def:4)DEF
72076820191230PORSAC2859.01361310Swanigan REBOUND (Off:3 Def:4)OFF
72076920191230PORSAC2869.01361310Labissiere REBOUND (Off:4 Def:11)DEF
72077020191230PORSAC2880.01361310EOQEOQOFF
\n", 514 | "

720771 rows × 11 columns

\n", 515 | "
" 516 | ], 517 | "text/plain": [ 518 | " year game home_team vis_team time_sec ht_score vt_score \\\n", 519 | "0 2017 1 CLE NYK 20.0 0 2 \n", 520 | "1 2017 1 CLE NYK 37.0 0 2 \n", 521 | "2 2017 1 CLE NYK 45.0 0 4 \n", 522 | "3 2017 1 CLE NYK 61.0 2 4 \n", 523 | "4 2017 1 CLE NYK 62.0 2 4 \n", 524 | "... ... ... ... ... ... ... ... \n", 525 | "720766 2019 1230 POR SAC 2832.0 135 131 \n", 526 | "720767 2019 1230 POR SAC 2846.0 135 131 \n", 527 | "720768 2019 1230 POR SAC 2859.0 136 131 \n", 528 | "720769 2019 1230 POR SAC 2869.0 136 131 \n", 529 | "720770 2019 1230 POR SAC 2880.0 136 131 \n", 530 | "\n", 531 | " points_scored ht_play \\\n", 532 | "0 2 \n", 533 | "1 0 \n", 534 | "2 2 \n", 535 | "3 2 James 11' Jump Shot (2 PTS) (Irving 1 AST) \n", 536 | "4 0 \n", 537 | "... ... ... \n", 538 | "720766 2 Simons 1' Tip Layup Shot (36 PTS) \n", 539 | "720767 0 Layman REBOUND (Off:0 Def:4) \n", 540 | "720768 0 \n", 541 | "720769 0 Labissiere REBOUND (Off:4 Def:11) \n", 542 | "720770 0 EOQ \n", 543 | "\n", 544 | " vt_play ht_poss \n", 545 | "0 Rose 1' Driving Layup (2 PTS) (Noah 1 AST) DEF \n", 546 | "1 Noah REBOUND (Off:0 Def:1) OFF \n", 547 | "2 Porzingis 2' Tip Layup Shot (2 PTS) DEF \n", 548 | "3 OFF \n", 549 | "4 Rose Out of Bounds Lost Ball Turnover (P1.T1) DEF \n", 550 | "... ... ... \n", 551 | "720766 OFF \n", 552 | "720767 DEF \n", 553 | "720768 Swanigan REBOUND (Off:3 Def:4) OFF \n", 554 | "720769 DEF \n", 555 | "720770 EOQ OFF \n", 556 | "\n", 557 | "[720771 rows x 11 columns]" 558 | ] 559 | }, 560 | "execution_count": 15, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "# Break play-by-play data into units of possessions, then\n", 567 | "# store in a file to be loaded for modeling\n", 568 | "df_poss = GetPossessionEnds(df_new)\n", 569 | "df_poss.to_csv('../NBA_PBP_Data_Possessions.csv')\n", 570 | "df_poss[['year', 'game', 'home_team', 'vis_team', 'time_sec', 'ht_score', 'vt_score', 'points_scored', 'ht_play', 'vt_play', 'ht_poss']]\n" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 6, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | " year game home_team vis_team Q time_sec ht_score vt_score \\\n", 583 | "0 2017 1 CLE NYK 1 37.0 0 2 \n", 584 | "1 2017 1 CLE NYK 1 415.0 14 12 \n", 585 | "2 2017 1 CLE NYK 1 491.0 17 14 \n", 586 | "3 2017 1 CLE NYK 1 557.0 19 16 \n", 587 | "4 2017 1 CLE NYK 1 632.0 23 16 \n", 588 | "... ... ... ... ... .. ... ... ... \n", 589 | "98072 2019 1230 POR SAC 4 2256.0 104 115 \n", 590 | "98073 2019 1230 POR SAC 4 2400.0 113 117 \n", 591 | "98074 2019 1230 POR SAC 4 2429.0 116 117 \n", 592 | "98075 2019 1230 POR SAC 4 2565.0 122 117 \n", 593 | "98076 2019 1230 POR SAC 4 2624.0 124 119 \n", 594 | "\n", 595 | " ht_margin vt_margin ... vt_3PTA ht_2PTM vt_2PTM ht_3PTM vt_3PTM \\\n", 596 | "0 -2 2 ... 0 0 1 0 0 \n", 597 | "1 2 -2 ... 2 5 4 1 1 \n", 598 | "2 3 -3 ... 2 6 5 1 1 \n", 599 | "3 3 -3 ... 2 7 6 1 1 \n", 600 | "4 7 -7 ... 2 9 6 1 1 \n", 601 | "... ... ... ... ... ... ... ... ... \n", 602 | "98072 -11 11 ... 32 28 26 11 17 \n", 603 | "98073 -4 4 ... 33 31 27 12 17 \n", 604 | "98074 -1 1 ... 34 31 27 13 17 \n", 605 | "98075 5 -5 ... 38 34 27 13 17 \n", 606 | "98076 5 -5 ... 39 35 28 13 17 \n", 607 | "\n", 608 | " ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss total_poss \n", 609 | "0 -2 2 4 4 99 \n", 610 | "1 0 0 4 4 12 \n", 611 | "2 0 0 4 4 11 \n", 612 | "3 0 0 4 4 14 \n", 613 | "4 0 0 4 4 4 \n", 614 | "... ... ... ... ... ... \n", 615 | "98072 2 -2 4 4 17 \n", 616 | "98073 2 -2 4 4 6 \n", 617 | "98074 0 0 4 4 18 \n", 618 | "98075 2 -2 4 4 7 \n", 619 | "98076 0 0 4 4 45 \n", 620 | "\n", 621 | "[98077 rows x 41 columns] year game home_team vis_team Q time_sec ht_score vt_score \\\n", 622 | "0 2017 1 CLE NYK 1 395.0 14 12 \n", 623 | "1 2017 1 CLE NYK 1 455.0 15 12 \n", 624 | "2 2017 1 CLE NYK 1 536.0 19 16 \n", 625 | "3 2017 1 CLE NYK 1 620.0 23 16 \n", 626 | "4 2017 1 CLE NYK 1 632.0 23 16 \n", 627 | "... ... ... ... ... .. ... ... ... \n", 628 | "98072 2019 1230 POR SAC 4 2371.0 111 117 \n", 629 | "98073 2019 1230 POR SAC 4 2407.0 116 117 \n", 630 | "98074 2019 1230 POR SAC 4 2517.0 120 117 \n", 631 | "98075 2019 1230 POR SAC 4 2579.0 122 117 \n", 632 | "98076 2019 1230 POR SAC 4 2880.0 136 131 \n", 633 | "\n", 634 | " ht_margin vt_margin ... vt_3PTA ht_2PTM vt_2PTM ht_3PTM vt_3PTM \\\n", 635 | "0 2 -2 ... 2 5 4 1 1 \n", 636 | "1 3 -3 ... 2 5 4 1 1 \n", 637 | "2 3 -3 ... 2 7 6 1 1 \n", 638 | "3 7 -7 ... 2 9 6 1 1 \n", 639 | "4 7 -7 ... 2 9 6 1 1 \n", 640 | "... ... ... ... ... ... ... ... ... \n", 641 | "98072 -6 6 ... 32 30 27 12 17 \n", 642 | "98073 -1 1 ... 33 31 27 13 17 \n", 643 | "98074 3 -3 ... 37 33 27 13 17 \n", 644 | "98075 5 -5 ... 38 34 27 13 17 \n", 645 | "98076 5 -5 ... 43 38 31 14 18 \n", 646 | "\n", 647 | " ht_stint_pm vt_stint_pm ht_stint_poss vt_stint_poss total_poss \n", 648 | "0 2 -2 63 63 99 \n", 649 | "1 1 -1 12 12 12 \n", 650 | "2 0 0 11 11 11 \n", 651 | "3 4 -4 14 14 14 \n", 652 | "4 0 0 4 4 4 \n", 653 | "... ... ... ... ... ... \n", 654 | "98072 7 -7 17 17 17 \n", 655 | "98073 5 -5 6 6 6 \n", 656 | "98074 4 -4 18 18 18 \n", 657 | "98075 2 -2 7 7 7 \n", 658 | "98076 0 0 45 45 45 \n", 659 | "\n", 660 | "[98077 rows x 41 columns]\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "# For a single game, this gives the start and ends of offensive possessions for the home team\n", 666 | "#htm_off_starts = np.unique(df[df['ht_poss'] == 'OFF'].time_sec.values.astype(float) - df[df['ht_poss'] == 'OFF'].ht_time_off.values.astype(float))\n", 667 | "#vtm_off_starts = np.unique(df[df['vt_poss'] == 'OFF'].time_sec.values.astype(float) - df[df['vt_poss'] == 'OFF'].vt_time_off.values.astype(float))\n", 668 | "#print(htm_off_starts, vtm_off_starts)\n", 669 | "\n", 670 | "# PBP data with lineup matchups with too few possessions\n", 671 | "# (substitution-only plays, etc.) are removed for each game\n", 672 | "min_num_poss = 3\n", 673 | "df_newnew = df_new\n", 674 | "df_newnew['total_poss'] = df_new.groupby(['game', 'year', 'ht_lineup', 'vt_lineup'], sort=False)['ht_stint_poss'].transform('count')\n", 675 | "df_newnew = df_newnew[df_newnew['ht_stint_poss'] > min_num_poss]\n", 676 | "df = df_newnew\n", 677 | "#print(df)\n", 678 | "\n", 679 | "# Get rows corresponding to start of new lineup matchup (+/- = 0)\n", 680 | "dfs = GetMatchupStarts(df)\n", 681 | "# Get rows corresponding to end of lineup matchup (final +/-)\n", 682 | "dfe = GetMatchupEnds(df)\n", 683 | "# Ensure each stint meets minimum possession requirement\n", 684 | "dfs = dfs[dfe.ht_stint_poss > min_num_poss]\n", 685 | "dfe = dfe[dfe.ht_stint_poss > min_num_poss]\n", 686 | "print(dfs, dfe)\n", 687 | "\n", 688 | "# Time average of all play-by-play data (over all games and lineups)\n", 689 | "dfm = df.groupby(['time_sec'], as_index=False).mean()\n", 690 | "#print(dfm)\n", 691 | "\n", 692 | "# Time average of play-by-play data for each home,away lineup matchup\n", 693 | "dfta = df.groupby(['time_sec', 'game', 'year', 'ht_lineup', 'vt_lineup'], as_index=False).mean()\n", 694 | "#print(dfta)\n", 695 | "\n", 696 | "# Play-by-play data grouped averaged by quarter\n", 697 | "df_by_quarter = df.groupby(['Q'], as_index=False).mean()\n", 698 | "#print(df_by_quarter)\n", 699 | "\n", 700 | "# PBP data grouped by home and away teams for team-specific time analysis\n", 701 | "dfht = df.groupby(['home_team', 'time_sec'], as_index=False).mean()\n", 702 | "dfvt = df.groupby(['vis_team', 'time_sec'], as_index=False).mean()\n", 703 | "#print(dfht, dfht.ht_margin)\n", 704 | "#print(dfvt, dfvt.vt_margin)\n", 705 | "#print(dfht.ht_margin-dfvt.vt_margin)\n", 706 | "#print(dfht, dfvt)\n", 707 | "#for name in np.unique(dfht.home_team.values): \n", 708 | "# print(name)\n", 709 | "\n", 710 | "########## DEPRECATED\n", 711 | "# Find the PBP rows corresponding to the last possession of each\n", 712 | "# lineup matchup, for each game (yielding the cumulative +/- for that lineup)\n", 713 | "##dfma = df_new.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).tail(1)\n", 714 | "#idx = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], sort=False)['ht_stint_poss'].transform('max') == df_newnew['ht_stint_poss']\n", 715 | "#dfma = df_newnew[idx]\n", 716 | "#print(dfma)\n", 717 | "#print(dfma.ht_lineup)\n", 718 | "#print(dfma.vt_lineup)\n", 719 | "#print(dfma.ht_stint_pm)\n", 720 | "\n", 721 | "# Mean, first row, and last row of lineups for each game\n", 722 | "#dfl = df_newnew.groupby(['ht_lineup', 'vt_lineup'], as_index=False).mean()\n", 723 | "#dfhl = df_newnew.groupby(['ht_lineup'], as_index=False).mean()\n", 724 | "#dfvl = df_newnew.groupby(['vt_lineup'], as_index=False).mean()\n", 725 | "#dfhs = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).head(1)\n", 726 | "#dfhs = dfhs.groupby(['ht_lineup'], as_index=False).mean()\n", 727 | "#dfvs = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).head(1)\n", 728 | "#dfvs = dfvs.groupby(['vt_lineup'], as_index=False).mean()\n", 729 | "#dfhe = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).tail(1)\n", 730 | "#dfhe = dfhe.groupby(['ht_lineup'], as_index=False).mean()\n", 731 | "#dfve = df_newnew.groupby(['game', 'ht_lineup', 'vt_lineup'], as_index=False).tail(1)\n", 732 | "#dfve = dfve.groupby(['vt_lineup'], as_index=False).mean()\n", 733 | "##########\n" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "# Dump Matchup Data to Files" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 7, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "name": "stderr", 750 | "output_type": "stream", 751 | "text": [ 752 | "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n", 753 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 754 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 755 | "\n", 756 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 757 | " # Remove the CWD from sys.path while we load stuff.\n", 758 | "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n", 759 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 760 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 761 | "\n", 762 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 763 | " # This is added back by InteractiveShellApp.init_path()\n" 764 | ] 765 | }, 766 | { 767 | "name": "stdout", 768 | "output_type": "stream", 769 | "text": [ 770 | " year game ht_lineup \\\n", 771 | "0 2017 1 JR Smith,Kevin Love,Kyrie Irving,LeBron James,... \n", 772 | "1 2017 1 JR Smith,Kyrie Irving,LeBron James,Richard Jef... \n", 773 | "2 2017 1 Iman Shumpert,Kyrie Irving,LeBron James,Richar... \n", 774 | "3 2017 1 Iman Shumpert,Kevin Love,Mike Dunleavy,Richard... \n", 775 | "4 2017 1 Iman Shumpert,Kevin Love,Mike Dunleavy,Richard... \n", 776 | "... ... ... ... \n", 777 | "98072 2019 1230 Jake Layman,Meyers Leonard,Gary Trent Jr.,Anfe... \n", 778 | "98073 2019 1230 Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey... \n", 779 | "98074 2019 1230 Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey... \n", 780 | "98075 2019 1230 Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey... \n", 781 | "98076 2019 1230 Anfernee Simons,Gary Trent Jr.,Jake Layman,Mey... \n", 782 | "\n", 783 | " vt_lineup ht_stint_pm \\\n", 784 | "0 Carmelo Anthony,Courtney Lee,Derrick Rose,Joak... 2 \n", 785 | "1 Brandon Jennings,Carmelo Anthony,Courtney Lee,... 1 \n", 786 | "2 Brandon Jennings,Carmelo Anthony,Courtney Lee,... 0 \n", 787 | "3 Brandon Jennings,Carmelo Anthony,Courtney Lee,... 4 \n", 788 | "4 Brandon Jennings,Courtney Lee,Justin Holiday,K... 0 \n", 789 | "... ... ... \n", 790 | "98072 Bogdan Bogdanovic,Corey Brewer,Frank Mason,Mar... 7 \n", 791 | "98073 Bogdan Bogdanovic,Corey Brewer,Frank Mason,Mar... 5 \n", 792 | "98074 Bogdan Bogdanovic,Corey Brewer,Frank Mason,Mar... 4 \n", 793 | "98075 BJ Johnson,Corey Brewer,Frank Mason,Marvin Bag... 2 \n", 794 | "98076 BJ Johnson,Caleb Swanigan,Corey Brewer,Frank M... 0 \n", 795 | "\n", 796 | " vt_stint_pm ht_stint_poss ht_pm_ph vt_pm_ph \n", 797 | "0 -2 63 3.174603 -3.174603 \n", 798 | "1 -1 12 8.333333 -8.333333 \n", 799 | "2 0 11 0.000000 0.000000 \n", 800 | "3 -4 14 28.571429 -28.571429 \n", 801 | "4 0 4 0.000000 0.000000 \n", 802 | "... ... ... ... ... \n", 803 | "98072 -7 17 41.176471 -41.176471 \n", 804 | "98073 -5 6 83.333333 -83.333333 \n", 805 | "98074 -4 18 22.222222 -22.222222 \n", 806 | "98075 -2 7 28.571429 -28.571429 \n", 807 | "98076 0 45 0.000000 0.000000 \n", 808 | "\n", 809 | "[98077 rows x 9 columns]\n" 810 | ] 811 | } 812 | ], 813 | "source": [ 814 | "# Write lineup matchup +/- data to files\n", 815 | "dfpm = dfe.copy()\n", 816 | "dfpm['ht_pm_ph'] = dfpm.ht_stint_pm.values*100./dfpm.ht_stint_poss.values\n", 817 | "dfpm['vt_pm_ph'] = dfpm.vt_stint_pm.values*100./dfpm.ht_stint_poss.values\n", 818 | "#dfpm = dfpm.groupby(['ht_lineup', 'vt_lineup'], as_index=False).mean()\n", 819 | "dfpm.to_csv('../NBA_Full_Matchup_Data.csv')\n", 820 | "\n", 821 | "# More simplified dataframe\n", 822 | "dfpm = dfe[['year', 'game', 'ht_lineup', 'vt_lineup', 'ht_stint_pm', 'vt_stint_pm', 'ht_stint_poss']]\n", 823 | "dfpm['ht_pm_ph'] = dfpm.ht_stint_pm.values*100./dfpm.ht_stint_poss.values\n", 824 | "dfpm['vt_pm_ph'] = dfpm.vt_stint_pm.values*100./dfpm.ht_stint_poss.values\n", 825 | "#dfpm = dfpm.groupby(['ht_lineup', 'vt_lineup'], as_index=False).mean()\n", 826 | "dfpm.to_csv('../NBA_Matchup_PlusMinus.csv')\n", 827 | "print(dfpm)\n" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [] 836 | } 837 | ], 838 | "metadata": { 839 | "kernelspec": { 840 | "display_name": "Python 3", 841 | "language": "python", 842 | "name": "python3" 843 | }, 844 | "language_info": { 845 | "codemirror_mode": { 846 | "name": "ipython", 847 | "version": 3 848 | }, 849 | "file_extension": ".py", 850 | "mimetype": "text/x-python", 851 | "name": "python", 852 | "nbconvert_exporter": "python", 853 | "pygments_lexer": "ipython3", 854 | "version": "3.7.7" 855 | }, 856 | "toc": { 857 | "base_numbering": 1, 858 | "nav_menu": {}, 859 | "number_sections": true, 860 | "sideBar": true, 861 | "skip_h1_title": false, 862 | "title_cell": "Table of Contents", 863 | "title_sidebar": "Contents", 864 | "toc_cell": true, 865 | "toc_position": { 866 | "height": "calc(100% - 180px)", 867 | "left": "10px", 868 | "top": "150px", 869 | "width": "166px" 870 | }, 871 | "toc_section_display": true, 872 | "toc_window_display": true 873 | } 874 | }, 875 | "nbformat": 4, 876 | "nbformat_minor": 2 877 | } 878 | -------------------------------------------------------------------------------- /Data_Scraping/Scrape_NBA_DraftCombine_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import urllib.request\n", 11 | "from selenium import webdriver\n", 12 | "from selenium.webdriver.support.ui import Select\n", 13 | "from selenium.webdriver.support import expected_conditions as EC\n", 14 | "from selenium.webdriver.common.by import By\n", 15 | "from selenium.webdriver.support.ui import WebDriverWait\n", 16 | "from selenium.common.exceptions import TimeoutException\n", 17 | "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n", 18 | "from selenium.webdriver.chrome.options import Options as ChromeOptions\n", 19 | "import lxml.html\n", 20 | "from lxml import etree\n", 21 | "import time\n", 22 | "import pandas as pd\n", 23 | "from functools import reduce" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Modify the dataframe to have appropriate data types\n", 33 | "def ConvertDataFrame(df):\n", 34 | " new_df = df.loc[:, df.columns != 'name'].astype(float)\n", 35 | " df[new_df.columns] = new_df\n", 36 | " df['name'] = df['name'].astype('str')\n", 37 | " df['draft_year'] = df['draft_year'].astype('int')\n", 38 | " df = df.drop_duplicates(subset=['name'], keep=False)\n", 39 | " return df" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Scrape draft combine data from NBA.com\n", 49 | "def FetchCombineAnthroTables(urls, years):\n", 50 | " # Create a headless Firefox browser instance\n", 51 | " opt = FirefoxOptions()\n", 52 | " opt.add_argument(\"--headless\")\n", 53 | " driver = webdriver.Firefox(options=opt)\n", 54 | " \n", 55 | " arr = []\n", 56 | " for i,url in enumerate(urls):\n", 57 | " year = years[i]\n", 58 | " print(\"Fetching NBA Combine measurements from Year\", year, \"...\")\n", 59 | " \n", 60 | " driver.get(url)\n", 61 | " wait = WebDriverWait(driver, 30)\n", 62 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")))\n", 63 | " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n", 64 | " results = driver.find_elements_by_xpath(\"//*[@class='nba-stat-table__overflow']/table/tbody/tr\")\n", 65 | " \n", 66 | " counter = 0\n", 67 | " for result in results:\n", 68 | " item = result.text\n", 69 | " data = item.split()\n", 70 | " #if counter == 0:\n", 71 | " # print(data)\n", 72 | " last_str = ''.join(data[-7:])\n", 73 | " if '-' in last_str:\n", 74 | " continue\n", 75 | " if len(data) != 13 and len(data) != 15:\n", 76 | " continue\n", 77 | " if len(data) == 15:\n", 78 | " #print(\"Deleting\", data[-7])\n", 79 | " del data[-7]\n", 80 | " #print(\"Deleting\", data[-6])\n", 81 | " del data[-6]\n", 82 | " #print(' '.join(data[0:-11]))\n", 83 | " data[0:-11] = [' '.join(data[0:-11])]\n", 84 | " del data[1]\n", 85 | " data = [s.strip('%') for s in data]\n", 86 | " data = [s.strip('\\'') for s in data]\n", 87 | " data[4:] = [float(f) for f in data[4:]]\n", 88 | " data[4:5] = [data[4]*12.+data[5]]\n", 89 | " del data[5]\n", 90 | " data[5:6] = [data[5]*12.+data[6]]\n", 91 | " del data[6]\n", 92 | " data[7:8] = [data[7]*12.+data[8]]\n", 93 | " del data[8]\n", 94 | " data.insert(1, int(year))\n", 95 | " arr.append(data)\n", 96 | " counter += 1\n", 97 | " #print(data)\n", 98 | " \n", 99 | " print(\"Fetched stats for\", counter, \"NBA draft players.\")\n", 100 | " time.sleep(1)\n", 101 | " \n", 102 | " time.sleep(1)\n", 103 | " driver.quit()\n", 104 | " #print(arr)\n", 105 | " return np.array(arr)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# Establish the years for which we want to fetch NBA Draft Combine player measurements\n", 115 | "ya = [str(n).zfill(2) for n in range(0, 20)]\n", 116 | "yb = [str(n).zfill(2) for n in range(1, 21)]\n", 117 | "years = [int(\"20\"+y) for y in yb]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Fetching NBA Combine measurements from Year 2001 ...\n", 130 | "Fetched stats for 64 NBA draft players.\n", 131 | "Fetching NBA Combine measurements from Year 2002 ...\n", 132 | "Fetched stats for 78 NBA draft players.\n", 133 | "Fetching NBA Combine measurements from Year 2003 ...\n", 134 | "Fetched stats for 81 NBA draft players.\n", 135 | "Fetching NBA Combine measurements from Year 2004 ...\n", 136 | "Fetched stats for 78 NBA draft players.\n", 137 | "Fetching NBA Combine measurements from Year 2005 ...\n", 138 | "Fetched stats for 79 NBA draft players.\n", 139 | "Fetching NBA Combine measurements from Year 2006 ...\n", 140 | "Fetched stats for 78 NBA draft players.\n", 141 | "Fetching NBA Combine measurements from Year 2007 ...\n", 142 | "Fetched stats for 76 NBA draft players.\n", 143 | "Fetching NBA Combine measurements from Year 2008 ...\n", 144 | "Fetched stats for 76 NBA draft players.\n", 145 | "Fetching NBA Combine measurements from Year 2009 ...\n", 146 | "Fetched stats for 74 NBA draft players.\n", 147 | "Fetching NBA Combine measurements from Year 2010 ...\n", 148 | "Fetched stats for 46 NBA draft players.\n", 149 | "Fetching NBA Combine measurements from Year 2011 ...\n", 150 | "Fetched stats for 52 NBA draft players.\n", 151 | "Fetching NBA Combine measurements from Year 2012 ...\n", 152 | "Fetched stats for 54 NBA draft players.\n", 153 | "Fetching NBA Combine measurements from Year 2013 ...\n", 154 | "Fetched stats for 61 NBA draft players.\n", 155 | "Fetching NBA Combine measurements from Year 2014 ...\n", 156 | "Fetched stats for 61 NBA draft players.\n", 157 | "Fetching NBA Combine measurements from Year 2015 ...\n", 158 | "Fetched stats for 54 NBA draft players.\n", 159 | "Fetching NBA Combine measurements from Year 2016 ...\n", 160 | "Fetched stats for 57 NBA draft players.\n", 161 | "Fetching NBA Combine measurements from Year 2017 ...\n", 162 | "Fetched stats for 54 NBA draft players.\n", 163 | "Fetching NBA Combine measurements from Year 2018 ...\n", 164 | "Fetched stats for 59 NBA draft players.\n", 165 | "Fetching NBA Combine measurements from Year 2019 ...\n", 166 | "Fetched stats for 69 NBA draft players.\n", 167 | "Fetching NBA Combine measurements from Year 2020 ...\n", 168 | "Fetched stats for 66 NBA draft players.\n", 169 | " name draft_year BFP hand_length hand_width height \\\n", 170 | "0 Malik Allen 2001 - - - 80.25 \n", 171 | "1 Harold Arceneaux 2001 - - - 76.5 \n", 172 | "2 Lamont Barnes 2001 - - - 80.5 \n", 173 | "3 Mario Bland 2001 - - - 77.5 \n", 174 | "4 Primoz Brezec 2001 - - - 84.75 \n", 175 | "... ... ... ... ... ... ... \n", 176 | "1312 Quinndary Weatherspoon 2020 6.10 8.50 9.00 75.0 \n", 177 | "1313 Coby White 2020 4.30 7.75 9.00 75.5 \n", 178 | "1314 Kris Wilkes 2020 4.90 8.50 9.50 78.25 \n", 179 | "1315 Grant Williams 2020 5.40 9.00 10.50 77.75 \n", 180 | "1316 Dylan Windler 2020 4.60 8.25 9.50 78.25 \n", 181 | "\n", 182 | " reach weight wingspan \n", 183 | "0 109.0 271.0 86.5 \n", 184 | "1 103.0 219.0 80.5 \n", 185 | "2 108.0 235.5 87.5 \n", 186 | "3 103.0 287.0 84.0 \n", 187 | "4 110.0 243.0 86.0 \n", 188 | "... ... ... ... \n", 189 | "1312 100.0 206.6 81.0 \n", 190 | "1313 97.5 191.4 77.0 \n", 191 | "1314 103.0 208.8 82.75 \n", 192 | "1315 104.5 240.2 81.75 \n", 193 | "1316 104.5 195.8 82.0 \n", 194 | "\n", 195 | "[1317 rows x 9 columns]\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# Create URLs for the available years of NBA Combine data, fetch the data in 2D array format,\n", 201 | "# put into a Pandas dataframe, and store the data in a .csv file format\n", 202 | "#urls = [ 'https://stats.nba.com/draft/combine-anthro/?SeasonYear=2006-07' ]\n", 203 | "urls = [ \"https://stats.nba.com/draft/combine-anthro/?SeasonYear=20{0}-{1}\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 204 | "#print(urls)\n", 205 | "np_arr = FetchCombineAnthroTables(urls, years)\n", 206 | "#print(np_arr)\n", 207 | "df = pd.DataFrame(np_arr, columns=['name', 'draft_year', 'BFP', 'hand_length', 'hand_width', 'height', 'reach', 'weight', 'wingspan'])\n", 208 | "print(df)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "df.to_csv(\"NBACombineStats.csv\")" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.7.3" 238 | }, 239 | "toc": { 240 | "base_numbering": 1, 241 | "nav_menu": {}, 242 | "number_sections": true, 243 | "sideBar": false, 244 | "skip_h1_title": false, 245 | "title_cell": "Table of Contents", 246 | "title_sidebar": "Contents", 247 | "toc_cell": true, 248 | "toc_position": {}, 249 | "toc_section_display": true, 250 | "toc_window_display": false 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 2 255 | } 256 | -------------------------------------------------------------------------------- /Data_Scraping/Scrape_NBA_PlayType_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import urllib.request\n", 21 | "from selenium import webdriver\n", 22 | "from selenium.webdriver.support.ui import Select\n", 23 | "from selenium.webdriver.support import expected_conditions as EC\n", 24 | "from selenium.webdriver.common.by import By\n", 25 | "from selenium.webdriver.support.ui import WebDriverWait\n", 26 | "from selenium.common.exceptions import TimeoutException\n", 27 | "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n", 28 | "from selenium.webdriver.chrome.options import Options as ChromeOptions\n", 29 | "import lxml.html\n", 30 | "from lxml import etree\n", 31 | "import re\n", 32 | "import time\n", 33 | "import pandas as pd\n", 34 | "from functools import reduce" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Modify dataframe to have appropriate data types\n", 44 | "def ConvertDataFrame(df):\n", 45 | " new_df = df.loc[:, df.columns != 'name'].astype(float)\n", 46 | " df[new_df.columns] = new_df\n", 47 | " df['name'] = df['name'].astype('str')\n", 48 | " df['year'] = df['year'].astype('int')\n", 49 | " df = df.groupby(['name', 'year']).mean().reset_index()\n", 50 | " return df\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Scrape play-type data tables from NBA.com webpages, grabbing only\n", 60 | "# the specified columns (by index) and for the specified seasons\n", 61 | "def FetchPlayTypeTables(urls, years, button):\n", 62 | " # Create a headless Firefox browser instance\n", 63 | " opt = FirefoxOptions()\n", 64 | " opt.add_argument(\"--headless\")\n", 65 | " driver = webdriver.Firefox(options=opt)\n", 66 | " \n", 67 | " arr = []\n", 68 | " for i,url in enumerate(urls):\n", 69 | " year = years[i]\n", 70 | " print(\"Fetching play type data from Year\", year, \"...\")\n", 71 | " \n", 72 | " driver.get(url)\n", 73 | " wait = WebDriverWait(driver, 30)\n", 74 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//select[@name='TypeGrouping']\")))\n", 75 | " sel = Select(driver.find_element_by_name('TypeGrouping'))\n", 76 | " sel.select_by_visible_text(button)\n", 77 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//select[contains(@class, 'stats-table-pagination__select')]\")))\n", 78 | " sel = Select(driver.find_element_by_class_name('stats-table-pagination__select'))\n", 79 | " sel.select_by_visible_text(\"All\")\n", 80 | " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n", 81 | " \n", 82 | " root = lxml.html.fromstring(driver.page_source)\n", 83 | " results = root.xpath(\"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")\n", 84 | " \n", 85 | " # Process the table text and break into columns, \n", 86 | " # stripping extraneous newline characters and inserting the season year\n", 87 | " counter = 0\n", 88 | " for result in results:\n", 89 | " item = result.xpath(\"./td//text()\")\n", 90 | " item = [re.sub('\\n +', '', x) for x in item]\n", 91 | " data = [x for x in item if x != '' and x != '\\n'] \n", 92 | " if len(data) != 17:\n", 93 | " continue\n", 94 | " del data[-15]\n", 95 | " del data[-15]\n", 96 | " #% on indices 2, 7-14\n", 97 | " data = [s.strip('%') for s in data]\n", 98 | " data.insert(1, int(year))\n", 99 | " arr.append(data)\n", 100 | " counter += 1\n", 101 | " #print(data)\n", 102 | " \n", 103 | " print(\"Fetched stats for\", counter, \"NBA players.\")\n", 104 | " \n", 105 | " driver.quit()\n", 106 | " #print(arr)\n", 107 | " return np.array(arr)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Scrape play-type data tables from NBA.com webpages, grabbing only\n", 117 | "# the specified columns (by index) and for the specified seasons.\n", 118 | "# ADDITIONALLY: completely reset the web driver for each URL,\n", 119 | "# which is needed for specific webpages.\n", 120 | "def FetchPlayTypeTables_ResetPage(urls, years, button):\n", 121 | " # Create a headless Firefox browser instance\n", 122 | " opt = FirefoxOptions()\n", 123 | " opt.add_argument(\"--headless\")\n", 124 | " \n", 125 | " arr = []\n", 126 | " for i,url in enumerate(urls):\n", 127 | " driver = webdriver.Firefox(options=opt)\n", 128 | " year = years[i]\n", 129 | " print(\"Fetching play type data from Year\", year, \"...\")\n", 130 | " \n", 131 | " driver.get(url)\n", 132 | " wait = WebDriverWait(driver, 30)\n", 133 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//select[@name='TypeGrouping']\")))\n", 134 | " sel = Select(driver.find_element_by_name('TypeGrouping'))\n", 135 | " sel.select_by_visible_text(button)\n", 136 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//select[contains(@class, 'stats-table-pagination__select')]\")))\n", 137 | " sel = Select(driver.find_element_by_class_name('stats-table-pagination__select'))\n", 138 | " sel.select_by_visible_text(\"All\")\n", 139 | " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n", 140 | " \n", 141 | " root = lxml.html.fromstring(driver.page_source)\n", 142 | " results = root.xpath(\"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")\n", 143 | " \n", 144 | " # Process the table text and break into columns, \n", 145 | " # stripping extraneous newline characters and inserting the season year\n", 146 | " counter = 0\n", 147 | " for result in results:\n", 148 | " item = result.xpath(\"./td//text()\")\n", 149 | " item = [re.sub('\\n +', '', x) for x in item]\n", 150 | " data = [x for x in item if x != '' and x != '\\n'] \n", 151 | " if len(data) != 17:\n", 152 | " continue\n", 153 | " del data[-15]\n", 154 | " del data[-15]\n", 155 | " #% on indices 2, 7-14\n", 156 | " data = [s.strip('%') for s in data]\n", 157 | " data.insert(1, int(year))\n", 158 | " arr.append(data)\n", 159 | " counter += 1\n", 160 | " #print(data)\n", 161 | " \n", 162 | " print(\"Fetched stats for\", counter, \"NBA players.\")\n", 163 | " driver.quit()\n", 164 | "\n", 165 | " #print(arr)\n", 166 | " return np.array(arr)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "# Scrape Player Data for Transition Plays" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 6, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "#off_play_types = ['TR', 'CUT', 'PB', 'MISC', 'ISO', 'PRBH', 'PRRM', 'PU', 'SU', 'HO', 'OS']\n", 183 | "#def_play_types = ['ISO', 'PRBH', 'PRRM', 'PU', 'SU', 'HO', 'OS']\n", 184 | "cols = ['name', 'year', '_POSS_', '_FREQ_', '_PPP_', '_PTS_', '_FGM_', '_FGA_', '_FGP_', '_EFGP_', '_FT_FREQ_', '_TO_FREQ_', '_SF_FREQ_', '_AND1_FREQ_', '_SCORE_FREQ_', '_PERC_']\n", 185 | "#for play_type in play_types:\n", 186 | "# off_cols = ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 187 | "# def_cols = ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 188 | "# print(off_cols, '\\n', def_cols)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 7, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Establish the years for which we want to fetch play-type data from NBA.com\n", 198 | "ya = [str(n).zfill(2) for n in range(15, 20)]\n", 199 | "yb = [str(n).zfill(2) for n in range(16, 21)]\n", 200 | "years = [int(\"20\"+y) for y in yb]" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 8, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "Fetching play type data from Year 2016 ...\n", 213 | "Fetched stats for 396 NBA players.\n", 214 | "Fetching play type data from Year 2017 ...\n", 215 | "Fetched stats for 399 NBA players.\n", 216 | "Fetching play type data from Year 2018 ...\n", 217 | "Fetched stats for 423 NBA players.\n", 218 | "Fetching play type data from Year 2019 ...\n", 219 | "Fetched stats for 458 NBA players.\n", 220 | "Fetching play type data from Year 2020 ...\n", 221 | "Fetched stats for 347 NBA players.\n", 222 | " name year OFF_POSS_TR OFF_FREQ_TR OFF_PPP_TR OFF_PTS_TR \\\n", 223 | "0 Aaron Brooks 2016 0.9 10.4 0.95 0.8 \n", 224 | "1 Aaron Brooks 2017 0.6 10.2 0.72 0.4 \n", 225 | "2 Aaron Gordon 2016 1.6 17.4 1.08 1.8 \n", 226 | "3 Aaron Gordon 2017 2.6 19.9 1.03 2.7 \n", 227 | "4 Aaron Gordon 2018 3.7 20.1 0.96 3.6 \n", 228 | "... ... ... ... ... ... ... \n", 229 | "1892 Zach Randolph 2018 0.7 4.5 1.15 0.8 \n", 230 | "1893 Zaza Pachulia 2016 0.4 4.1 1.17 0.4 \n", 231 | "1894 Zaza Pachulia 2017 0.6 9.1 1.39 0.8 \n", 232 | "1895 Zaza Pachulia 2018 0.7 12.6 1.17 0.8 \n", 233 | "1896 Zaza Pachulia 2019 0.2 5.2 1.63 0.4 \n", 234 | "\n", 235 | " OFF_FGM_TR OFF_FGA_TR OFF_FGP_TR OFF_EFGP_TR OFF_FT_FREQ_TR \\\n", 236 | "0 0.4 0.7 39.2 52.0 3.3 \n", 237 | "1 0.3 0.4 33.3 46.3 5.1 \n", 238 | "2 0.5 1.1 55.8 58.1 24.4 \n", 239 | "3 0.9 1.9 53.6 56.5 17.2 \n", 240 | "4 1.5 2.8 47.3 52.7 12.1 \n", 241 | "... ... ... ... ... ... \n", 242 | "1892 0.3 0.6 55.6 65.3 2.4 \n", 243 | "1893 0.1 0.3 45.0 45.0 31.0 \n", 244 | "1894 0.2 0.5 67.6 67.6 14.6 \n", 245 | "1895 0.2 0.5 61.8 61.8 18.8 \n", 246 | "1896 0.0 0.2 76.9 76.9 25.0 \n", 247 | "\n", 248 | " OFF_TO_FREQ_TR OFF_SF_FREQ_TR OFF_AND1_FREQ_TR OFF_SCORE_FREQ_TR \\\n", 249 | "0 11.7 3.3 0.0 36.7 \n", 250 | "1 25.6 5.1 0.0 28.2 \n", 251 | "2 12.6 21.3 4.7 54.3 \n", 252 | "3 12.4 16.7 2.9 51.7 \n", 253 | "4 13.6 10.7 3.3 44.4 \n", 254 | "... ... ... ... ... \n", 255 | "1892 12.2 2.4 2.4 48.8 \n", 256 | "1893 0.0 31.0 0.0 58.6 \n", 257 | "1894 2.4 14.6 0.0 70.7 \n", 258 | "1895 14.6 18.8 4.2 58.3 \n", 259 | "1896 0.0 25.0 6.3 81.3 \n", 260 | "\n", 261 | " OFF_PERC_TR \n", 262 | "0 20.3 \n", 263 | "1 4.2 \n", 264 | "2 43.9 \n", 265 | "3 37.1 \n", 266 | "4 23.9 \n", 267 | "... ... \n", 268 | "1892 58.1 \n", 269 | "1893 64.4 \n", 270 | "1894 92.5 \n", 271 | "1895 62.4 \n", 272 | "1896 99.0 \n", 273 | "\n", 274 | "[1897 rows x 16 columns]\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "# Scrape transition play-type data, fetch the data in 2D array format,\n", 280 | "# and convert to a dataframe format with the appropriate data types\n", 281 | "play_type = \"TR\"\n", 282 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 283 | "#urls = [ \"https://stats.nba.com/players/transition/?SeasonYear=2018-19&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\" ]\n", 284 | "urls = [ \"https://stats.nba.com/players/transition/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 285 | "#print(urls)\n", 286 | "np_arr_trans = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 287 | "df_trans = pd.DataFrame(np_arr_trans, columns=off_cols)\n", 288 | "df_trans = ConvertDataFrame(df_trans)\n", 289 | "#print(df_trans.dtypes)\n", 290 | "print(df_trans)\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Scrape Player Data for Isolation Plays" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "Fetching play type data from Year 2016 ...\n", 310 | "Fetched stats for 287 NBA players.\n", 311 | "Fetching play type data from Year 2017 ...\n", 312 | "Fetched stats for 288 NBA players.\n", 313 | "Fetching play type data from Year 2018 ...\n", 314 | "Fetched stats for 277 NBA players.\n", 315 | "Fetching play type data from Year 2019 ...\n", 316 | "Fetched stats for 287 NBA players.\n", 317 | "Fetching play type data from Year 2020 ...\n", 318 | "Fetched stats for 165 NBA players.\n", 319 | "Fetching play type data from Year 2016 ...\n", 320 | "Fetched stats for 380 NBA players.\n", 321 | "Fetching play type data from Year 2017 ...\n", 322 | "Fetched stats for 387 NBA players.\n", 323 | "Fetching play type data from Year 2018 ...\n", 324 | "Fetched stats for 383 NBA players.\n", 325 | "Fetching play type data from Year 2019 ...\n", 326 | "Fetched stats for 411 NBA players.\n", 327 | "Fetching play type data from Year 2020 ...\n", 328 | "Fetched stats for 304 NBA players.\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "# Scrape isolation play-type data, fetch the data in 2D array format,\n", 334 | "# and convert to a dataframe format with the appropriate data types\n", 335 | "play_type = \"ISO\"\n", 336 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 337 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 338 | "urls = [ \"https://stats.nba.com/players/isolation/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 339 | "\n", 340 | "np_arr_iso = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 341 | "df_iso = pd.DataFrame(np_arr_iso, columns=off_cols)\n", 342 | "df_iso = ConvertDataFrame(df_iso)\n", 343 | "\n", 344 | "np_arr_iso = FetchPlayTypeTables(urls, years, \"Defensive\")\n", 345 | "df_iso2 = pd.DataFrame(np_arr_iso, columns=def_cols)\n", 346 | "df_iso2 = ConvertDataFrame(df_iso2)\n" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "# Scrape Player Data for Pick-and-Roll Plays" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 10, 359 | "metadata": { 360 | "scrolled": false 361 | }, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "Fetching play type data from Year 2016 ...\n", 368 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2015-16&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 369 | "Fetched stats for 272 NBA players.\n", 370 | "Fetching play type data from Year 2017 ...\n", 371 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2016-17&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 372 | "Fetched stats for 265 NBA players.\n", 373 | "Fetching play type data from Year 2018 ...\n", 374 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2017-18&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 375 | "Fetched stats for 302 NBA players.\n", 376 | "Fetching play type data from Year 2019 ...\n", 377 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2018-19&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 378 | "Fetched stats for 308 NBA players.\n", 379 | "Fetching play type data from Year 2020 ...\n", 380 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 381 | "Fetched stats for 241 NBA players.\n", 382 | "Fetching play type data from Year 2016 ...\n", 383 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2015-16&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 384 | "Fetched stats for 312 NBA players.\n", 385 | "Fetching play type data from Year 2017 ...\n", 386 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2016-17&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 387 | "Fetched stats for 327 NBA players.\n", 388 | "Fetching play type data from Year 2018 ...\n", 389 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2017-18&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 390 | "Fetched stats for 367 NBA players.\n", 391 | "Fetching play type data from Year 2019 ...\n", 392 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2018-19&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 393 | "Fetched stats for 395 NBA players.\n", 394 | "Fetching play type data from Year 2020 ...\n", 395 | "https://stats.nba.com/players/ball-handler/#!?SeasonYear=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\n", 396 | "Fetched stats for 302 NBA players.\n" 397 | ] 398 | } 399 | ], 400 | "source": [ 401 | "# Scrape pick-and-roll ball handler play-type data, fetch the data in 2D array format,\n", 402 | "# and convert to a dataframe format with the appropriate data types\n", 403 | "play_type = \"PRBH\"\n", 404 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 405 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 406 | "urls = [ \"https://stats.nba.com/players/ball-handler/#!?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 407 | "#print(urls)\n", 408 | "\n", 409 | "np_arr_prbh = FetchPlayTypeTables_ResetPage(urls, years, \"Offensive\")\n", 410 | "df_prbh = pd.DataFrame(np_arr_prbh, columns=off_cols)\n", 411 | "df_prbh = ConvertDataFrame(df_prbh)\n", 412 | "\n", 413 | "np_arr_prbh = FetchPlayTypeTables_ResetPage(urls, years, \"Defensive\")\n", 414 | "df_prbh2 = pd.DataFrame(np_arr_prbh, columns=def_cols)\n", 415 | "df_prbh2 = ConvertDataFrame(df_prbh2)\n" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 11, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "Fetching play type data from Year 2016 ...\n", 428 | "Fetched stats for 202 NBA players.\n", 429 | "Fetching play type data from Year 2017 ...\n", 430 | "Fetched stats for 196 NBA players.\n", 431 | "Fetching play type data from Year 2018 ...\n", 432 | "Fetched stats for 190 NBA players.\n", 433 | "Fetching play type data from Year 2019 ...\n", 434 | "Fetched stats for 201 NBA players.\n", 435 | "Fetching play type data from Year 2020 ...\n", 436 | "Fetched stats for 157 NBA players.\n", 437 | "Fetching play type data from Year 2016 ...\n", 438 | "Fetched stats for 272 NBA players.\n", 439 | "Fetching play type data from Year 2017 ...\n", 440 | "Fetched stats for 282 NBA players.\n", 441 | "Fetching play type data from Year 2018 ...\n", 442 | "Fetched stats for 293 NBA players.\n", 443 | "Fetching play type data from Year 2019 ...\n", 444 | "Fetched stats for 268 NBA players.\n", 445 | "Fetching play type data from Year 2020 ...\n", 446 | "Fetched stats for 161 NBA players.\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "# Scrape pick-and-roll roll man play-type data, fetch the data in 2D array format,\n", 452 | "# and convert to a dataframe format with the appropriate data types\n", 453 | "play_type = \"PRRM\"\n", 454 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 455 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 456 | "urls = [ \"https://stats.nba.com/players/roll-man/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 457 | "\n", 458 | "np_arr_prrm = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 459 | "df_prrm = pd.DataFrame(np_arr_prrm, columns=off_cols)\n", 460 | "df_prrm = ConvertDataFrame(df_prrm)\n", 461 | "\n", 462 | "np_arr_prrm = FetchPlayTypeTables(urls, years, \"Defensive\")\n", 463 | "df_prrm2 = pd.DataFrame(np_arr_prrm, columns=def_cols)\n", 464 | "df_prrm2 = ConvertDataFrame(df_prrm2)\n" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "# Scrape Player Data for Post-Up Plays" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 12, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "Fetching play type data from Year 2016 ...\n", 484 | "Fetched stats for 216 NBA players.\n", 485 | "Fetching play type data from Year 2017 ...\n", 486 | "Fetched stats for 184 NBA players.\n", 487 | "Fetching play type data from Year 2018 ...\n", 488 | "Fetched stats for 192 NBA players.\n", 489 | "Fetching play type data from Year 2019 ...\n", 490 | "Fetched stats for 195 NBA players.\n", 491 | "Fetching play type data from Year 2020 ...\n", 492 | "Fetched stats for 113 NBA players.\n", 493 | "Fetching play type data from Year 2016 ...\n", 494 | "Fetched stats for 374 NBA players.\n", 495 | "Fetching play type data from Year 2017 ...\n", 496 | "Fetched stats for 358 NBA players.\n", 497 | "Fetching play type data from Year 2018 ...\n", 498 | "Fetched stats for 371 NBA players.\n", 499 | "Fetching play type data from Year 2019 ...\n", 500 | "Fetched stats for 387 NBA players.\n", 501 | "Fetching play type data from Year 2020 ...\n", 502 | "Fetched stats for 250 NBA players.\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "# Scrape post up play-type data, fetch the data in 2D array format,\n", 508 | "# and convert to a dataframe format with the appropriate data types\n", 509 | "play_type = \"PU\"\n", 510 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 511 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 512 | "urls = [ \"https://stats.nba.com/players/playtype-post-up/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 513 | "\n", 514 | "np_arr_pu = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 515 | "df_pu = pd.DataFrame(np_arr_pu, columns=off_cols)\n", 516 | "df_pu = ConvertDataFrame(df_pu)\n", 517 | "\n", 518 | "np_arr_pu = FetchPlayTypeTables(urls, years, \"Defensive\")\n", 519 | "df_pu2 = pd.DataFrame(np_arr_pu, columns=def_cols)\n", 520 | "df_pu2 = ConvertDataFrame(df_pu2)\n" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "# Scrape Player Data for Spot-Up Plays" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 13, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "Fetching play type data from Year 2016 ...\n", 540 | "Fetched stats for 392 NBA players.\n", 541 | "Fetching play type data from Year 2017 ...\n", 542 | "Fetched stats for 396 NBA players.\n", 543 | "Fetching play type data from Year 2018 ...\n", 544 | "Fetched stats for 428 NBA players.\n", 545 | "Fetching play type data from Year 2019 ...\n", 546 | "Fetched stats for 458 NBA players.\n", 547 | "Fetching play type data from Year 2020 ...\n", 548 | "Fetched stats for 359 NBA players.\n", 549 | "Fetching play type data from Year 2016 ...\n", 550 | "Fetched stats for 417 NBA players.\n", 551 | "Fetching play type data from Year 2017 ...\n", 552 | "Fetched stats for 413 NBA players.\n", 553 | "Fetching play type data from Year 2018 ...\n", 554 | "Fetched stats for 438 NBA players.\n", 555 | "Fetching play type data from Year 2019 ...\n", 556 | "Fetched stats for 477 NBA players.\n", 557 | "Fetching play type data from Year 2020 ...\n", 558 | "Fetched stats for 382 NBA players.\n" 559 | ] 560 | } 561 | ], 562 | "source": [ 563 | "# Scrape spot up play-type data, fetch the data in 2D array format,\n", 564 | "# and convert to a dataframe format with the appropriate data types\n", 565 | "play_type = \"SU\"\n", 566 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 567 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 568 | "urls = [ \"https://stats.nba.com/players/spot-up/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 569 | "\n", 570 | "np_arr_su = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 571 | "df_su = pd.DataFrame(np_arr_su, columns=off_cols)\n", 572 | "df_su = ConvertDataFrame(df_su)\n", 573 | "\n", 574 | "np_arr_su = FetchPlayTypeTables(urls, years, \"Defensive\")\n", 575 | "df_su2 = pd.DataFrame(np_arr_su, columns=def_cols)\n", 576 | "df_su2 = ConvertDataFrame(df_su2)\n" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "# Scrape Player Data for Hand-Off Plays" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 14, 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "name": "stdout", 593 | "output_type": "stream", 594 | "text": [ 595 | "Fetching play type data from Year 2016 ...\n", 596 | "Fetched stats for 233 NBA players.\n", 597 | "Fetching play type data from Year 2017 ...\n", 598 | "Fetched stats for 235 NBA players.\n", 599 | "Fetching play type data from Year 2018 ...\n", 600 | "Fetched stats for 256 NBA players.\n", 601 | "Fetching play type data from Year 2019 ...\n", 602 | "Fetched stats for 280 NBA players.\n", 603 | "Fetching play type data from Year 2020 ...\n", 604 | "Fetched stats for 198 NBA players.\n", 605 | "Fetching play type data from Year 2016 ...\n", 606 | "Fetched stats for 269 NBA players.\n", 607 | "Fetching play type data from Year 2017 ...\n", 608 | "Fetched stats for 279 NBA players.\n", 609 | "Fetching play type data from Year 2018 ...\n", 610 | "Fetched stats for 302 NBA players.\n", 611 | "Fetching play type data from Year 2019 ...\n", 612 | "Fetched stats for 340 NBA players.\n", 613 | "Fetching play type data from Year 2020 ...\n", 614 | "Fetched stats for 239 NBA players.\n" 615 | ] 616 | } 617 | ], 618 | "source": [ 619 | "# Scrape hand off play-type data, fetch the data in 2D array format,\n", 620 | "# and convert to a dataframe format with the appropriate data types\n", 621 | "play_type = \"HO\"\n", 622 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 623 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 624 | "urls = [ \"https://stats.nba.com/players/hand-off/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 625 | "\n", 626 | "np_arr_ho = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 627 | "df_ho = pd.DataFrame(np_arr_ho, columns=off_cols)\n", 628 | "df_ho = ConvertDataFrame(df_ho)\n", 629 | "\n", 630 | "np_arr_ho = FetchPlayTypeTables(urls, years, \"Defensive\")\n", 631 | "df_ho2 = pd.DataFrame(np_arr_ho, columns=def_cols)\n", 632 | "df_ho2 = ConvertDataFrame(df_ho2)\n" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "# Scrape Player Data for Cutting Plays" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 15, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "Fetching play type data from Year 2016 ...\n", 652 | "Fetched stats for 334 NBA players.\n", 653 | "Fetching play type data from Year 2017 ...\n", 654 | "Fetched stats for 334 NBA players.\n", 655 | "Fetching play type data from Year 2018 ...\n", 656 | "Fetched stats for 340 NBA players.\n", 657 | "Fetching play type data from Year 2019 ...\n", 658 | "Fetched stats for 360 NBA players.\n", 659 | "Fetching play type data from Year 2020 ...\n", 660 | "Fetched stats for 245 NBA players.\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "# Scrape cutting play-type data, fetch the data in 2D array format,\n", 666 | "# and convert to a dataframe format with the appropriate data types\n", 667 | "play_type = \"CUT\"\n", 668 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 669 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 670 | "urls = [ \"https://stats.nba.com/players/cut/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 671 | "\n", 672 | "np_arr_cut = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 673 | "df_cut = pd.DataFrame(np_arr_cut, columns=off_cols)\n", 674 | "df_cut = ConvertDataFrame(df_cut)\n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "# Scrape Player Data for Off-Screen Plays" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 18, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "name": "stdout", 691 | "output_type": "stream", 692 | "text": [ 693 | "Fetching play type data from Year 2016 ...\n", 694 | "Fetched stats for 238 NBA players.\n", 695 | "Fetching play type data from Year 2017 ...\n", 696 | "Fetched stats for 246 NBA players.\n", 697 | "Fetching play type data from Year 2018 ...\n", 698 | "Fetched stats for 247 NBA players.\n", 699 | "Fetching play type data from Year 2019 ...\n", 700 | "Fetched stats for 229 NBA players.\n", 701 | "Fetching play type data from Year 2020 ...\n", 702 | "Fetched stats for 172 NBA players.\n", 703 | "Fetching play type data from Year 2016 ...\n", 704 | "Fetched stats for 331 NBA players.\n", 705 | "Fetching play type data from Year 2017 ...\n", 706 | "Fetched stats for 336 NBA players.\n", 707 | "Fetching play type data from Year 2018 ...\n", 708 | "Fetched stats for 339 NBA players.\n", 709 | "Fetching play type data from Year 2019 ...\n", 710 | "Fetched stats for 351 NBA players.\n", 711 | "Fetching play type data from Year 2020 ...\n", 712 | "Fetched stats for 239 NBA players.\n" 713 | ] 714 | } 715 | ], 716 | "source": [ 717 | "# Scrape off screen play-type data, fetch the data in 2D array format,\n", 718 | "# and convert to a dataframe format with the appropriate data types\n", 719 | "play_type = \"OS\"\n", 720 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 721 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 722 | "urls = [ \"https://stats.nba.com/players/off-screen/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 723 | "\n", 724 | "np_arr_os = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 725 | "df_os = pd.DataFrame(np_arr_os, columns=off_cols)\n", 726 | "df_os = ConvertDataFrame(df_os)\n", 727 | "\n", 728 | "np_arr_os = FetchPlayTypeTables(urls, years, \"Defensive\")\n", 729 | "df_os2 = pd.DataFrame(np_arr_os, columns=def_cols)\n", 730 | "df_os2 = ConvertDataFrame(df_os2)\n" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": {}, 736 | "source": [ 737 | "# Scrape Player Data for Put-Back Plays" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 20, 743 | "metadata": {}, 744 | "outputs": [ 745 | { 746 | "name": "stdout", 747 | "output_type": "stream", 748 | "text": [ 749 | "Fetching play type data from Year 2016 ...\n", 750 | "Fetched stats for 287 NBA players.\n", 751 | "Fetching play type data from Year 2017 ...\n", 752 | "Fetched stats for 280 NBA players.\n", 753 | "Fetching play type data from Year 2018 ...\n", 754 | "Fetched stats for 282 NBA players.\n", 755 | "Fetching play type data from Year 2019 ...\n", 756 | "Fetched stats for 309 NBA players.\n", 757 | "Fetching play type data from Year 2020 ...\n", 758 | "Fetched stats for 202 NBA players.\n" 759 | ] 760 | } 761 | ], 762 | "source": [ 763 | "# Scrape put-back play-type data, fetch the data in 2D array format,\n", 764 | "# and convert to a dataframe format with the appropriate data types\n", 765 | "play_type = \"PB\"\n", 766 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 767 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 768 | "urls = [ \"https://stats.nba.com/players/putbacks/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 769 | "\n", 770 | "np_arr_pb = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 771 | "df_pb = pd.DataFrame(np_arr_pb, columns=off_cols)\n", 772 | "df_pb = ConvertDataFrame(df_pb)\n" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "# Scrape Player Data for Miscellaneous Play-Types" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 21, 785 | "metadata": {}, 786 | "outputs": [ 787 | { 788 | "name": "stdout", 789 | "output_type": "stream", 790 | "text": [ 791 | "Fetching play type data from Year 2016 ...\n", 792 | "Fetched stats for 366 NBA players.\n", 793 | "Fetching play type data from Year 2017 ...\n", 794 | "Fetched stats for 362 NBA players.\n", 795 | "Fetching play type data from Year 2018 ...\n", 796 | "Fetched stats for 361 NBA players.\n", 797 | "Fetching play type data from Year 2019 ...\n", 798 | "Fetched stats for 377 NBA players.\n", 799 | "Fetching play type data from Year 2020 ...\n", 800 | "Fetched stats for 284 NBA players.\n" 801 | ] 802 | } 803 | ], 804 | "source": [ 805 | "# Scrape miscellaneous play-type data, fetch the data in 2D array format,\n", 806 | "# and convert to a dataframe format with the appropriate data types\n", 807 | "play_type = \"MISC\"\n", 808 | "off_cols = ['name', 'year'] + ['OFF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 809 | "def_cols = ['name', 'year'] + ['DEF'+s+play_type for i,s in enumerate(cols) if i > 1]\n", 810 | "urls = [ \"https://stats.nba.com/players/playtype-misc/?SeasonYear=20{0}-{1}&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 811 | "\n", 812 | "np_arr_misc = FetchPlayTypeTables(urls, years, \"Offensive\")\n", 813 | "df_misc = pd.DataFrame(np_arr_misc, columns=off_cols)\n", 814 | "df_misc = ConvertDataFrame(df_misc)\n" 815 | ] 816 | }, 817 | { 818 | "cell_type": "markdown", 819 | "metadata": {}, 820 | "source": [ 821 | "# Merge and Store Dataframes" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": 22, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "# Create a complete dataframe, performing an outer merge so as\n", 831 | "# not to exclude players who play with a couple of play types exclusively\n", 832 | "df = pd.merge(df_trans, df_iso, on=['name', 'year'], how='outer')\n", 833 | "df = pd.merge(df, df_iso2, on=['name', 'year'], how='outer')\n", 834 | "df = pd.merge(df, df_prbh, on=['name', 'year'], how='outer')\n", 835 | "df = pd.merge(df, df_prbh2, on=['name', 'year'], how='outer')\n", 836 | "df = pd.merge(df, df_prrm, on=['name', 'year'], how='outer')\n", 837 | "df = pd.merge(df, df_prrm2, on=['name', 'year'], how='outer')\n", 838 | "df = pd.merge(df, df_pu, on=['name', 'year'], how='outer')\n", 839 | "df = pd.merge(df, df_pu2, on=['name', 'year'], how='outer')\n", 840 | "df = pd.merge(df, df_su, on=['name', 'year'], how='outer')\n", 841 | "df = pd.merge(df, df_su2, on=['name', 'year'], how='outer')\n", 842 | "df = pd.merge(df, df_ho, on=['name', 'year'], how='outer')\n", 843 | "df = pd.merge(df, df_ho2, on=['name', 'year'], how='outer')\n", 844 | "df = pd.merge(df, df_cut, on=['name', 'year'], how='outer')\n", 845 | "df = pd.merge(df, df_os, on=['name', 'year'], how='outer')\n", 846 | "df = pd.merge(df, df_os2, on=['name', 'year'], how='outer')\n", 847 | "df = pd.merge(df, df_pb, on=['name', 'year'], how='outer')\n", 848 | "df = pd.merge(df, df_misc, on=['name', 'year'], how='outer')\n", 849 | "\n", 850 | "# Scale percentage quantities to be in range 0-1 (for convenience)\n", 851 | "perc_cols = [col for col in df.columns if 'FGP' in col or '3PP' in col or 'FTP' in col or 'PERC' in col or 'PCT' in col or 'FREQ' in col]\n", 852 | "df[perc_cols] = df[perc_cols].astype(float)/100.\n" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": 23, 858 | "metadata": {}, 859 | "outputs": [ 860 | { 861 | "name": "stdout", 862 | "output_type": "stream", 863 | "text": [ 864 | " name year OFF_POSS_TR OFF_FREQ_TR OFF_PPP_TR \\\n", 865 | "0 Aaron Brooks 2016 0.9 0.104 0.95 \n", 866 | "1 Aaron Brooks 2017 0.6 0.102 0.72 \n", 867 | "2 Aaron Gordon 2016 1.6 0.174 1.08 \n", 868 | "3 Aaron Gordon 2017 2.6 0.199 1.03 \n", 869 | "4 Aaron Gordon 2018 3.7 0.201 0.96 \n", 870 | "... ... ... ... ... ... \n", 871 | "2028 Devin Robinson 2019 0.0 0.000 0.00 \n", 872 | "2029 Jared Cunningham 2016 0.0 0.000 0.00 \n", 873 | "2030 Jarrod Uthoff 2017 0.0 0.000 0.00 \n", 874 | "2031 Alex Stepheson 2016 0.0 0.000 0.00 \n", 875 | "2032 Jack Cooley 2018 0.0 0.000 0.00 \n", 876 | "\n", 877 | " OFF_PTS_TR OFF_FGM_TR OFF_FGA_TR OFF_FGP_TR OFF_EFGP_TR ... \\\n", 878 | "0 0.8 0.4 0.7 0.392 0.520 ... \n", 879 | "1 0.4 0.3 0.4 0.333 0.463 ... \n", 880 | "2 1.8 0.5 1.1 0.558 0.581 ... \n", 881 | "3 2.7 0.9 1.9 0.536 0.565 ... \n", 882 | "4 3.6 1.5 2.8 0.473 0.527 ... \n", 883 | "... ... ... ... ... ... ... \n", 884 | "2028 0.0 0.0 0.0 0.000 0.000 ... \n", 885 | "2029 0.0 0.0 0.0 0.000 0.000 ... \n", 886 | "2030 0.0 0.0 0.0 0.000 0.000 ... \n", 887 | "2031 0.0 0.0 0.0 0.000 0.000 ... \n", 888 | "2032 0.0 0.0 0.0 0.000 0.000 ... \n", 889 | "\n", 890 | " OFF_FGM_MISC OFF_FGA_MISC OFF_FGP_MISC OFF_EFGP_MISC \\\n", 891 | "0 0.1 0.2 0.167 0.167 \n", 892 | "1 0.1 0.2 0.180 0.227 \n", 893 | "2 0.2 0.3 0.400 0.425 \n", 894 | "3 0.1 0.2 0.440 0.469 \n", 895 | "4 0.2 0.3 0.375 0.438 \n", 896 | "... ... ... ... ... \n", 897 | "2028 0.0 0.0 0.000 0.000 \n", 898 | "2029 0.0 0.0 0.000 0.000 \n", 899 | "2030 0.0 0.0 0.000 0.000 \n", 900 | "2031 0.0 0.0 0.000 0.000 \n", 901 | "2032 0.0 0.0 0.000 0.000 \n", 902 | "\n", 903 | " OFF_FT_FREQ_MISC OFF_TO_FREQ_MISC OFF_SF_FREQ_MISC \\\n", 904 | "0 0.207 0.379 0.034 \n", 905 | "1 0.095 0.381 0.000 \n", 906 | "2 0.283 0.358 0.057 \n", 907 | "3 0.195 0.415 0.049 \n", 908 | "4 0.195 0.488 0.098 \n", 909 | "... ... ... ... \n", 910 | "2028 0.000 0.000 0.000 \n", 911 | "2029 0.000 0.000 0.000 \n", 912 | "2030 0.000 0.000 0.000 \n", 913 | "2031 0.000 0.000 0.000 \n", 914 | "2032 0.000 0.000 0.000 \n", 915 | "\n", 916 | " OFF_AND1_FREQ_MISC OFF_SCORE_FREQ_MISC OFF_PERC_MISC \n", 917 | "0 0.00 0.276 0.311 \n", 918 | "1 0.00 0.190 0.382 \n", 919 | "2 0.02 0.377 0.729 \n", 920 | "3 0.00 0.366 0.755 \n", 921 | "4 0.07 0.268 0.547 \n", 922 | "... ... ... ... \n", 923 | "2028 0.00 0.000 0.000 \n", 924 | "2029 0.00 0.000 0.000 \n", 925 | "2030 0.00 0.000 0.000 \n", 926 | "2031 0.00 0.000 0.000 \n", 927 | "2032 0.00 0.000 0.000 \n", 928 | "\n", 929 | "[2033 rows x 254 columns]\n" 930 | ] 931 | } 932 | ], 933 | "source": [ 934 | "# Fill all NaN values with 0, which is reasonable for play-type missing values\n", 935 | "df = df.fillna(0)\n", 936 | "print(df)" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": 24, 942 | "metadata": {}, 943 | "outputs": [], 944 | "source": [ 945 | "# Write overall play-type dataframe to a .csv file\n", 946 | "df.to_csv(\"NBAPlayTypeStats.csv\")" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "metadata": {}, 953 | "outputs": [], 954 | "source": [] 955 | } 956 | ], 957 | "metadata": { 958 | "kernelspec": { 959 | "display_name": "Python 3", 960 | "language": "python", 961 | "name": "python3" 962 | }, 963 | "language_info": { 964 | "codemirror_mode": { 965 | "name": "ipython", 966 | "version": 3 967 | }, 968 | "file_extension": ".py", 969 | "mimetype": "text/x-python", 970 | "name": "python", 971 | "nbconvert_exporter": "python", 972 | "pygments_lexer": "ipython3", 973 | "version": "3.8.2" 974 | }, 975 | "toc": { 976 | "base_numbering": 1, 977 | "nav_menu": {}, 978 | "number_sections": true, 979 | "sideBar": false, 980 | "skip_h1_title": false, 981 | "title_cell": "Table of Contents", 982 | "title_sidebar": "Contents", 983 | "toc_cell": true, 984 | "toc_position": {}, 985 | "toc_section_display": true, 986 | "toc_window_display": false 987 | } 988 | }, 989 | "nbformat": 4, 990 | "nbformat_minor": 2 991 | } 992 | -------------------------------------------------------------------------------- /Data_Scraping/Scrape_NBA_PlayerBios_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import urllib.request\n", 21 | "from selenium import webdriver\n", 22 | "from selenium.webdriver.support.ui import Select\n", 23 | "from selenium.webdriver.support import expected_conditions as EC\n", 24 | "from selenium.webdriver.common.by import By\n", 25 | "from selenium.webdriver.support.ui import WebDriverWait\n", 26 | "from selenium.common.exceptions import TimeoutException\n", 27 | "from selenium.webdriver.firefox.options import Options as FirefoxOptions\n", 28 | "from selenium.webdriver.chrome.options import Options as ChromeOptions\n", 29 | "import lxml.html\n", 30 | "from lxml import etree\n", 31 | "import re\n", 32 | "import time\n", 33 | "import pandas as pd\n", 34 | "from functools import reduce\n", 35 | "from operator import itemgetter" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 56, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Modify dataframe to have appropriate data types\n", 45 | "def ConvertDataFrame(df):\n", 46 | " cols = df.columns.drop(['name', 'college', 'country', 'draft', 'nationality'])\n", 47 | " df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)\n", 48 | " df['name'] = df['name'].astype('str')\n", 49 | " df['college'] = df['college'].astype('str')\n", 50 | " df['country'] = df['country'].astype('str')\n", 51 | " df['draft'] = df['draft'].astype('str')\n", 52 | " df['nationality'] = df['nationality'].astype('str')\n", 53 | " df['actual_draft_year'] = df['actual_draft_year'].astype('float')\n", 54 | " df['draft_round'] = df['draft_round'].astype('float')\n", 55 | " df['draft_number'] = df['draft_number'].astype('float')\n", 56 | " #df = df.groupby(['name', 'actual_draft_year']).mean().reset_index()\n", 57 | " df = df.drop_duplicates(subset=['name', 'actual_draft_year'], keep='first')\n", 58 | " return df\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 52, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Scrape player bio tables from NBA.com webpages, grabbing only\n", 68 | "# the specified columns (by index) and for the specified seasons\n", 69 | "def FetchStatsTables(urls, years, col_list):\n", 70 | " # Create a headless Firefox browser instance\n", 71 | " opt = FirefoxOptions()\n", 72 | " opt.add_argument(\"--headless\")\n", 73 | " driver = webdriver.Firefox(options=opt)\n", 74 | " \n", 75 | " arr = []\n", 76 | " for i,url in enumerate(urls):\n", 77 | " year = years[i]\n", 78 | " print(\"Fetching player bios from the\", year, \"season...\")\n", 79 | " \n", 80 | " driver.get(url)\n", 81 | " wait = WebDriverWait(driver, 30)\n", 82 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//select[contains(@class, 'stats-table-pagination__select')]\")))\n", 83 | " sel = Select(driver.find_element_by_class_name('stats-table-pagination__select'))\n", 84 | " sel.select_by_visible_text(\"All\")\n", 85 | " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;\")\n", 86 | " \n", 87 | " retries = 1\n", 88 | " while retries <= 3:\n", 89 | " try:\n", 90 | " wait.until(EC.presence_of_element_located((By.XPATH, \"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")))\n", 91 | " break\n", 92 | " except TimeoutException:\n", 93 | " print('\\nRefreshing NBA bios page due to timeout (retry #', retries,')...')\n", 94 | " driver.refresh()\n", 95 | " time.sleep(1)\n", 96 | " retries += 1\n", 97 | "\n", 98 | " root = lxml.html.fromstring(driver.page_source)\n", 99 | " results = root.xpath(\"//*[@class='nba-stat-table__overflow']//table/tbody/tr\")\n", 100 | " \n", 101 | " # Process the table text and break into columns, \n", 102 | " # stripping extraneous newline characters and inserting the season year\n", 103 | " counter = 0\n", 104 | " for result in results:\n", 105 | " item = result.xpath(\"./td//text()\")\n", 106 | " item = [re.sub('\\n +', '', x) for x in item]\n", 107 | " data = [x for x in item if x != '' and x != '\\n']\n", 108 | " data = [s.strip('%') for s in data]\n", 109 | " if len(data) < col_list[-1]+1:\n", 110 | " continue\n", 111 | " elif len(data) > col_list[-1]+1:\n", 112 | " data[6:-13] = [' '.join(data[6:-13])]\n", 113 | " data = list(itemgetter(*col_list)(data))\n", 114 | " #print(data)\n", 115 | " \n", 116 | " if len(data[1].split('-')) == 2 and data[1].split('-')[0] != '':\n", 117 | " ft_in = data[1].split('-')\n", 118 | " data[1] = float(ft_in[0])*12. + float(ft_in[1])\n", 119 | " #print(ft_in, data[1])\n", 120 | " else:\n", 121 | " data[1] = ''\n", 122 | " \n", 123 | " if data[2] == ' ':\n", 124 | " data[2] = ''\n", 125 | "\n", 126 | " if data[5].lower() == 'undrafted':\n", 127 | " data.append('undrafted')\n", 128 | " data[5] = ''\n", 129 | " data[6] = ''\n", 130 | " data[7] = ''\n", 131 | " else:\n", 132 | " data.append('drafted')\n", 133 | " \n", 134 | " if data[4] == 'USA' or data[4] == '':\n", 135 | " data.append('domestic')\n", 136 | " else:\n", 137 | " data.append('foreign')\n", 138 | " \n", 139 | " arr.append(data)\n", 140 | " counter += 1\n", 141 | " #print(data)\n", 142 | " \n", 143 | " print(\"Fetched bios for\", counter, \"NBA players.\")\n", 144 | " \n", 145 | " driver.quit()\n", 146 | " #print(arr)\n", 147 | " return np.array(arr)\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 53, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Establish the years for which we want to fetch player data\n", 157 | "# (for data available for 201-15 onward)\n", 158 | "ya = [str(n).zfill(2) for n in range(0, 20)]\n", 159 | "yb = [str(n).zfill(2) for n in range(1, 21)]\n", 160 | "years = [int(\"20\"+y) for y in yb]\n" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# Create URLs for player bios on NBA.com, \n", 170 | "# fetch the data in 2D array format, and put into a Pandas dataframe\n", 171 | "#https://stats.nba.com/players/bio/?Season=2000-01&SeasonType=Regular%20Season\n", 172 | "urls = [ \"https://stats.nba.com/players/bio/?Season=20{0}-{1}&SeasonType=Regular%20Season\".format(ya[i], yb[i]) for i in range(len(ya)) ]\n", 173 | "np_arr = FetchStatsTables(urls, years, [0, 3, 4, 5, 6, 7, 8, 9])\n", 174 | "df = pd.DataFrame(np_arr, columns=['name', 'height', 'weight', 'college', 'country', 'actual_draft_year', 'draft_round', 'draft_number', 'draft', 'nationality'])\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 57, 180 | "metadata": { 181 | "scrolled": false 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | " name height weight college country \\\n", 189 | "0 A.C. Green 81.0 225.0 Oregon State USA \n", 190 | "1 A.J. Guyton 73.0 180.0 Indiana USA \n", 191 | "2 Aaron McKie 77.0 209.0 Temple USA \n", 192 | "3 Aaron Williams 81.0 225.0 Xavier USA \n", 193 | "4 Adam Keefe 81.0 230.0 Stanford USA \n", 194 | "... ... ... ... ... ... \n", 195 | "9339 Vincent Poirier 84.0 235.0 None France \n", 196 | "9340 Vlatko Cancar 80.0 236.0 None Slovenia \n", 197 | "9343 Wenyen Gabriel 81.0 205.0 None Sudan \n", 198 | "9354 Zach Norvell Jr. 77.0 205.0 None USA \n", 199 | "9355 Zylan Cheatham 77.0 220.0 None USA \n", 200 | "\n", 201 | " actual_draft_year draft_round draft_number draft nationality \n", 202 | "0 1985.0 1.0 23.0 drafted domestic \n", 203 | "1 2000.0 2.0 32.0 drafted domestic \n", 204 | "2 1994.0 1.0 17.0 drafted domestic \n", 205 | "3 NaN NaN NaN undrafted domestic \n", 206 | "4 1992.0 1.0 10.0 drafted domestic \n", 207 | "... ... ... ... ... ... \n", 208 | "9339 NaN NaN NaN undrafted foreign \n", 209 | "9340 2017.0 2.0 49.0 drafted foreign \n", 210 | "9343 NaN NaN NaN undrafted foreign \n", 211 | "9354 NaN NaN NaN undrafted domestic \n", 212 | "9355 NaN NaN NaN undrafted domestic \n", 213 | "\n", 214 | "[1946 rows x 10 columns]\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "df = ConvertDataFrame(df)\n", 220 | "print(df)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 58, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# Write NBA player bios dataframe to a .csv file\n", 230 | "df.to_csv(\"NBAPlayerBios.csv\")\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 3", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.7.3" 258 | }, 259 | "toc": { 260 | "base_numbering": 1, 261 | "nav_menu": {}, 262 | "number_sections": true, 263 | "sideBar": false, 264 | "skip_h1_title": false, 265 | "title_cell": "Table of Contents", 266 | "title_sidebar": "Contents", 267 | "toc_cell": true, 268 | "toc_position": {}, 269 | "toc_section_display": true, 270 | "toc_window_display": false 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nba-data-models 2 | 3 | This repository contains explorations and models of NBA data. 4 | 5 | ## NBA Data Scraping 6 | 7 | In "Data_Scraping" there are many notebooks which were used to amass datasets from a variety of basketball data websites. 8 | 9 | Note that only free data was used in this project; even the Synergy data used (advanced player tracking, etc.) is publicly available and scrapable using the custom tools contained in this repository. 10 | 11 | Various data scraping tools were used: Scrapy (an open source Python web crawler library built using the Twisted framework), Selenium (a web browser automation tool), GeckoDriver/ChromeDriver for Firefox/Chrome browser launching, and the lxml HTML parsing library. 12 | 13 | To scrape HTML data from simple, static webpages, Scrapy is used. This is done for the following sites: 14 | 15 | * https://www.basketball-reference.com (for some basic and advanced stats not obtained from NBA.com) 16 | * http://www.espn.com/nba/statistics/rpm (for ESPN Real Plus-Minus advanced player stats) 17 | * http://insider.espn.com/nba/hollinger/statistics/_/qualified/false (for advanced Hollinger player stats) 18 | 19 | The data obtained from these webpages are stored in JSON formats as player-season dictionaries, containing per-season stats. 20 | 21 | 22 | For interactive, dynamic pages served using Javascript, web drivers are used to launch headless browser instances and Selenium and lxml are used to make table selections and parse the HTML: 23 | 24 | * https://stats.nba.com/draft/combine-anthro (for draft combine data on drafted players) 25 | * https://stats.nba.com/players (for aggregate player data and player bios) 26 | * https://stats.nba.com/lineups (for aggregate lineup data) 27 | * https://stats.nba.com/game (for play-by-play data for individual games) 28 | 29 | The data obtained from these webpages are stored in CSV format. 30 | 31 | 32 | Player data is combined using a cleanup script which merges Pandas dataframes into a comprehensive player-season database (this combines draft combine, bio, basic, and advanced stats). Player, lineup, and play-by-play datasets are stored in separate CSV files. 33 | 34 | 35 | ## Exploration of Player, Lineup, and Play-by-Play Data 36 | 37 | The "Player_Data_Exploration" notebook breaks down and explores a vast number of player stats. 38 | 39 | In particular, a basic clustering is performed for players of each general position (Guard, Forward, Center). 40 | K-means clustering is used to lump players into categories based on a few key traits: 41 | 42 | * Passing/play-making ('AST_PH', 'ASTR', 'ATR') 43 | * Frequency of 3 point shots ('3PR') 44 | * Defensive specialization ('BLK_PH', 'DFGP_3PT_PG', 'DFGP_PG') 45 | * Usage rate ('USG') 46 | 47 | The stats used for clustering are chosen for each position through trial-and-error, using silhouette scores to evaluate the best training features. This silhouette analysis also gives the optimal number of clusters to use at each position. 48 | 49 | This notebook examines offensive and defensive performance metrics (including the relationship between different efficiency stats), shot selection (including distance from basket, number of dribbles, and defender proximity), and offensive and defensive play types used. 50 | 51 | 52 | ## A Bokeh Application for Exploring Player Data 53 | 54 | A primitive Bokeh application has been developed for interactively filtering and plotting player data, which expedites the exploration of player stats. Note that Bokeh software is needed as a prerequisite to run this application (see https://bokeh.org/). 55 | 56 | To run the local Bokeh server (which should automatically open at http://localhost:5006/bokeh_app in your browser), run the following command: 57 | 58 | > bokeh serve --show bokeh_app/ 59 | 60 | Shown below are some examples of how the player, lineup, and play-by-play tabs can be used: 61 | ![alt text](https://github.com/jecutter/nba-data-models/blob/master/img/nba_stats_explorer_curry_player_ex.png?raw=true) 62 | ![alt text](https://github.com/jecutter/nba-data-models/blob/master/img/nba_stats_explorer_curry_lineup_ex.png?raw=true) 63 | ![alt text](https://github.com/jecutter/nba-data-models/blob/master/img/nba_stats_explorer_curry_pbp_ex.png?raw=true) 64 | 65 | NOTE: It is always possible that data has moved/been removed from the repo, however the necessary data may be regenerated using the scraping tools provided in this repo. 66 | 67 | 68 | ## Modeling for Player and Lineup Evaluation 69 | 70 | The "Data_Modeling" directory contains some NBA models in various stages of development. 71 | 72 | ### Play Style Modeling 73 | 74 | The "Player_Comparison_Analysis" notebook contains a simple classification model for finding the best player comp for first-year/rookie players. Veteran players (defined as players to play in all of the last 4 full NBA seasons) are divided into player-seasons, the first 3 seasons being used for model training and the last season being used for testing and validation. 75 | 76 | The classification model is built using key stylistic player stats, specifically: 77 | * 'height' - player height in inches 78 | * 'weight' - player weight in lbs 79 | * 'FG_FREQ_05FT' - percentage of shots from 0-5 ft. from the basket 80 | * 'FG_FREQ_59FT' - percentage of shots from 5-9 ft. from the basket 81 | * 'FG_FREQ_1014FT' - percentage of shots from 10-14 ft. from the basket 82 | * 'FG_FREQ_1519FT' - percentage of shots from 15-19 ft. from the basket 83 | * 'FG_FREQ_2024FT' - percentage of shots from 20-24 ft. from the basket 84 | * 'FG_FREQ_GT24FT' - percentage of shots from > 24 ft. from the basket 85 | * 'FG_FREQ_CANDS' - percentage of shots that are catch-and-shoot (no dribbles) 86 | * 'FTR' - free throw rate 87 | * 'ASTR' - assist rate 88 | * 'TOR' - turnover rate 89 | * 'ORR' - offensive rebounding rate 90 | * 'DRR' - defensive rebounding rate 91 | * 'BLK_PH' - shot blocks (per 100 possessions) 92 | * 'STL_PH' - steals (per 100 possessions) 93 | * 'DFGP_PG' - defensive/opponent field goal percentage 94 | 95 | A variety of permutations were attempted to optimize the testing results, however accuracy of player classification (identifying a veteran player by their 4th season using the model) maxed out at ~75%. This is reasonable, given the variance of player stats from season to season. 96 | 97 | Predictive results are then shown by classifying rookies/first-year players from the 2018-2019 season. This demonstrates the usefulness of this algorithm as a scouting tool for assessing a player's play style. 98 | 99 | ### Player Impact Evaluation 100 | 101 | The "RAPM_Ridge_Regression" model uses lineup matchup data over 3 full seasons of NBA games to calculate a player's lineup-independent impact. This can be done with ridge regression to calculate a known quantity called "RAPM" (Regularized Adjusted Plus-Minus). 102 | 103 | A player's +/- is defined as the team's point differential (relative to the opposing team) while that player is on the floor. A player's offensive and defensive impact both affect their raw +/-, but it is a lineup-dependent quantity since it depends on the player's supporting cast as well as the opposing lineups. 104 | 105 | The way to take into account all players on the floor is to calculate the "APM" (Adjusted Plus-Minus). This is obtained by creating a matchup matrix *M*, where each row is a lineup matchup and each column is a player. A matrix entry is set to "1" if the player is on offensve, "-1" if the player is on defense, and "0" if the player is not involved in the matchup. The point differential per 100 possessions is calculated for each lineup matchup, which forms an array *y*. We may then solve the equation *M* *x* = *y* for player coefficients *x*, which represent the players' adjusted contributions. 106 | 107 | The problem with this method is that there is enormous variance in the coefficients due to multicollinearity between players. We therefore perform a modified/perturbed regression, a Bayesian filtering process which introduces bias but greatly reduces the variance by penalizing (regularizing) outliers. The result is a set of player RAPM coefficients, which give a relative ranking of the player's impact. More mathematical details are given in the notebook. 108 | 109 | Validation of the RAPM model is difficult, since it incorporates a global dataset to produce relative (but biased) player rankings. This means that it is not particularly useful for predicting true lineup matchup results. However, it is useful for scouting players whose impact may be underrated by their environment or have winning qualities that are intangible. 110 | 111 | ### Lineup Optimization 112 | 113 | Work in progress. 114 | -------------------------------------------------------------------------------- /bokeh_app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | from bokeh.io import curdoc 8 | from bokeh.layouts import column, layout 9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel 10 | from bokeh.models.widgets import Tabs 11 | from bokeh.plotting import figure 12 | #from bokeh import mpl 13 | 14 | from tabs.players import player_tab 15 | from tabs.lineups import lineup_tab 16 | from tabs.playbyplay import playbyplay_tab 17 | 18 | 19 | # Grab relative path to datasets 20 | current_file = os.path.abspath(os.path.dirname(__file__)) 21 | player_csv_file = os.path.join(current_file, '../CompleteNBAPlayerStats.csv') 22 | lineup_csv_file = os.path.join(current_file, '../NBALineupStats_preInsight.csv') 23 | pbp_csv_file = os.path.join(current_file, '../NBA_PBP_Data_PlusMinus.csv') 24 | 25 | # Load NBA player data 26 | df_player = pd.read_csv(player_csv_file) 27 | df_lineup = pd.read_csv(lineup_csv_file) 28 | df_pbp = pd.read_csv(pbp_csv_file) 29 | 30 | # Create each of the tabs 31 | tab1 = player_tab(df_player) 32 | tab2 = lineup_tab(df_lineup) 33 | tab3 = playbyplay_tab(df_pbp) 34 | 35 | # Collect created tabs 36 | tabs = Tabs(tabs = [tab1, tab2, tab3]) 37 | 38 | # Add tabs to the Bokeh document 39 | #curdoc().add_root(l) 40 | curdoc().add_root(tabs) 41 | curdoc().title = "NBAStats" 42 | 43 | -------------------------------------------------------------------------------- /bokeh_app/tabs/lineups.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | from bokeh.io import curdoc 8 | from bokeh.layouts import column, layout 9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel 10 | from bokeh.models.widgets import Tabs, DataTable, DateFormatter, TableColumn 11 | from bokeh.plotting import figure 12 | 13 | def ControlUpdate(df, source, controls, plot, table): 14 | mask = LineupMask(df, controls) 15 | x_name = controls[4].value 16 | y_name = controls[5].value 17 | 18 | #df["color"] = np.where(mask, "red", "grey") 19 | #df["alpha"] = np.where(mask, 0.9, 0.25) 20 | df_mask = df[mask] 21 | 22 | # Title 23 | #plot.title.text = "%d lineups selected" % len(df_mask) 24 | #plot.title.text_font_size = '20pt' 25 | #plot.title.text_font = 'serif' 26 | #plot.title.align = 'center' 27 | 28 | # Axis titles 29 | plot.xaxis.axis_label = x_name 30 | plot.yaxis.axis_label = y_name 31 | plot.xaxis.axis_label_text_font_size = '12pt' 32 | plot.xaxis.axis_label_text_font_style = 'bold' 33 | plot.yaxis.axis_label_text_font_size = '12pt' 34 | plot.yaxis.axis_label_text_font_style = 'bold' 35 | 36 | # Tick labels 37 | plot.xaxis.major_label_text_font_size = '12pt' 38 | plot.yaxis.major_label_text_font_size = '12pt' 39 | 40 | table.columns = [ 41 | TableColumn(field="name", title='Lineup'), 42 | TableColumn(field="year", title='Season'), 43 | TableColumn(field="x", title=x_name), 44 | TableColumn(field="y", title=y_name), 45 | ] 46 | 47 | source.data = dict( 48 | x = df_mask[x_name], 49 | y = df_mask[y_name], 50 | name = df_mask['lineup_name'], 51 | year = df_mask['year'], 52 | team = df_mask['team'] 53 | ) 54 | 55 | def LineupMask(df, controls): 56 | player = controls[0].value.lower() 57 | team = controls[1].value 58 | 59 | if player == '': 60 | mask_player = np.ones(len(df.index), dtype=bool) 61 | else: 62 | mask_player = df.lineup_name.str.lower().str.contains(player) 63 | 64 | if team == 'All': 65 | mask_team = np.ones(len(df.index), dtype=bool) 66 | else: 67 | if team == 'Multiple': 68 | mask_team = (df.team == 'TOT') 69 | else: 70 | mask_team = (df.team == team) 71 | 72 | mask_year = ((df.year >= controls[2].value) & (df.year <= controls[3].value)) 73 | 74 | mask = np.logical_and(mask_player, mask_team) 75 | mask = np.logical_and(mask, mask_year) 76 | 77 | return mask 78 | 79 | 80 | def lineup_tab(dfl): 81 | # Grab list of stats/columns from dataframe 82 | stats = list(dfl.columns.values) 83 | 84 | # Grab list of teams 85 | teams = np.unique(dfl.team.values) 86 | teams = np.insert(teams, 0, 'All') 87 | teams = np.insert(teams, 1, 'Multiple') 88 | teams = teams[teams != 'TOT'] 89 | teams = list(teams) 90 | 91 | player_sel = TextInput(title="Lineup Contains Player:") 92 | team_sel = Select(title="Team:", options=teams, value='All') 93 | min_year = Slider(title="Starting Season", start=2016, end=2020, value=2016, step=1) 94 | max_year = Slider(title="Ending Season", start=2016, end=2020, value=2019, step=1) 95 | x_axis = Select(title="X Axis", options=stats, value="GPT") 96 | y_axis = Select(title="Y Axis", options=stats, value="NETRTGT") 97 | 98 | # Create a data source dictionary for storing data with each update 99 | source = ColumnDataSource(data=dict(x=[], y=[], name=[], year=[], team=[])) 100 | 101 | # Create tooltips object for hover variables, 102 | # and create figure for scatterplot 103 | TOOLTIPS=[ 104 | ("Name", "@name"), 105 | ("Year", "@year"), 106 | ("Team", "@team") 107 | ] 108 | 109 | p = figure(plot_height=600, plot_width=700, title="", tooltips=TOOLTIPS, sizing_mode="scale_both") 110 | p.circle(x="x", y="y", source=source, size=7, line_color=None, fill_alpha=0.8) 111 | 112 | columns = [ 113 | TableColumn(field="name", title='Name'), 114 | TableColumn(field="year", title='Season'), 115 | TableColumn(field="x", title=x_axis.value), 116 | TableColumn(field="y", title=y_axis.value), 117 | ] 118 | data_table = DataTable(source=source, columns=columns, width=275, height=550) 119 | 120 | # Create controls for filtering plotted/table data 121 | controls = [ player_sel, team_sel, min_year, max_year, x_axis, y_axis ] 122 | for control in controls: 123 | control.on_change('value', lambda attr, old, new: ControlUpdate(dfl, source, controls, p, data_table)) 124 | 125 | # Do a preliminary update of plot and table 126 | ControlUpdate(dfl, source, controls, p, data_table) 127 | 128 | # Create layout by column 129 | inputs = column(*controls, width=250, height=600) 130 | inputs.sizing_mode = "fixed" 131 | l = layout([ 132 | [inputs, p, data_table], 133 | #], sizing_mode="scale_both") 134 | ]) 135 | 136 | # Make a tab with the layout 137 | tab = Panel(child=l, title = 'Lineup Stats') 138 | 139 | return tab 140 | 141 | -------------------------------------------------------------------------------- /bokeh_app/tabs/playbyplay.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | from bokeh.io import curdoc 8 | from bokeh.layouts import column, layout 9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel, BoxAnnotation 10 | from bokeh.models.widgets import Tabs, DataTable, DateFormatter, TableColumn 11 | from bokeh.plotting import figure 12 | 13 | 14 | global_boxes = [] 15 | 16 | def ControlUpdate(df, source, controls, plot): 17 | global global_boxes 18 | 19 | mask, starts, ends = PBPMask(df, controls) 20 | team = controls[0].value 21 | game = controls[2].value 22 | y_name = controls[4].value 23 | 24 | df_mask = df[mask] 25 | 26 | if len(starts) > 0: 27 | keep_starts = np.ones(len(starts), dtype=bool) 28 | keep_ends = np.ones(len(ends), dtype=bool) 29 | prev_end = -1 30 | for i in np.arange(len(starts)): 31 | #print(starts[i], ends[i]) 32 | if starts[i] == ends[i]: 33 | #print('throwing out equal starts ends:', starts[i], ends[i]) 34 | keep_starts[i] = False 35 | if ends[i] != 2880.: 36 | keep_ends[i] = False 37 | if i > 0: 38 | if starts[i] - prev_end <= 2.: 39 | #print('throwing out overlapping intervals, deleting:', ends[i-1], starts[i]) 40 | keep_starts[i] = False 41 | keep_ends[i-1] = False 42 | #print('setting previous end to', ends[i]) 43 | prev_end = ends[i] 44 | #print(keep_starts, keep_ends) 45 | #print(np.array(starts)[keep_starts]) 46 | #print(np.array(ends)[keep_ends]) 47 | starts = list(np.array(starts)[keep_starts]) 48 | ends = list(np.array(ends)[keep_ends]) 49 | 50 | boxes = [] 51 | for start,end in zip(starts, ends): 52 | #print('made it', start, end) 53 | box = BoxAnnotation(left=start, right=end, 54 | line_width=1, line_color='black', line_dash='dashed', 55 | fill_alpha=0.2, fill_color='orange') 56 | print('box', box) 57 | plot.add_layout(box) 58 | boxes.append(box) 59 | 60 | #if len(boxes) != 0: 61 | # plot.renderers.extend(boxes) 62 | # global_boxes += boxes 63 | #else: 64 | # if len(global_boxes) > 0: 65 | # plot.renderers.remove(global_boxes) 66 | # #for i, r in enumerate(plot.renderers): 67 | # # print(r) 68 | # # if i > 0: 69 | # # plot.renderers.remove(r) 70 | 71 | # Title 72 | if df_mask.home_team.values[0] == team: 73 | title = "Game " + game + " for " + team + ": At Home (vs. " + df_mask.vis_team.values[0] + ")" 74 | else: 75 | title = "Game " + game + " for " + team + ": Away Team (vs. " + df_mask.home_team.values[0] + ")" 76 | 77 | plot.title.text = title 78 | plot.title.text_font_size = '20pt' 79 | plot.title.text_font = 'serif' 80 | plot.title.align = 'center' 81 | 82 | # Axis titles 83 | plot.xaxis.axis_label = 'Time (seconds)' 84 | plot.yaxis.axis_label = y_name 85 | plot.xaxis.axis_label_text_font_size = '12pt' 86 | plot.xaxis.axis_label_text_font_style = 'bold' 87 | plot.yaxis.axis_label_text_font_size = '12pt' 88 | plot.yaxis.axis_label_text_font_style = 'bold' 89 | 90 | # Tick labels 91 | plot.xaxis.major_label_text_font_size = '12pt' 92 | plot.yaxis.major_label_text_font_size = '12pt' 93 | 94 | # Update source data 95 | source.data = dict( 96 | x = df_mask['time_sec'], 97 | y = df_mask[y_name], 98 | home_play = df_mask['ht_play'], 99 | away_play = df_mask['vt_play'] 100 | #color = df_mask["color"], 101 | #alpha = df_mask["alpha"] 102 | ) 103 | 104 | def PBPMask(df, controls): 105 | # Group dataframe by team and establish for which games 106 | # the team is at home or away 107 | team = controls[0].value 108 | mask_team = ((df.home_team == team) | (df.vis_team == team)) 109 | 110 | year = int(controls[1].value) 111 | mask_year = (df.year == year) 112 | 113 | game_idx = int(controls[2].value)-1 114 | team_games = np.unique(df[mask_team & mask_year].groupby(['game'], as_index=False).mean().game) 115 | mask_game = (df.game == team_games[game_idx]) 116 | 117 | mask = np.logical_and(mask_team, mask_year) 118 | mask = np.logical_and(mask, mask_game) 119 | 120 | df_masked = df[mask] 121 | 122 | # Create player mask 123 | player = controls[3].value.lower() 124 | if player == '': 125 | player_starts = [] 126 | player_ends = [] 127 | else: 128 | dfhead = df_masked.groupby((df_masked[['ht_lineup','vt_lineup']] != df_masked[['ht_lineup','vt_lineup']].shift(1)).any(axis=1).cumsum()).head(1).reset_index(drop=True) 129 | dftail = df_masked.groupby((df_masked[['ht_lineup','vt_lineup']] != df_masked[['ht_lineup','vt_lineup']].shift(1)).any(axis=1).cumsum()).tail(1).reset_index(drop=True) 130 | player_start_mask = ((dfhead.ht_lineup.str.lower().str.contains(player) & (dfhead.home_team == team)) | (dfhead.vt_lineup.str.lower().str.contains(player) & (dfhead.vis_team == team))) 131 | player_end_mask = ((dftail.ht_lineup.str.lower().str.contains(player) & (dftail.home_team == team)) | (dftail.vt_lineup.str.lower().str.contains(player) & (dftail.vis_team == team))) 132 | player_starts = list(dfhead[player_start_mask].time_sec.values) 133 | player_ends = list(dftail[player_end_mask].time_sec.values) 134 | 135 | return mask, player_starts, player_ends 136 | 137 | 138 | def playbyplay_tab(dft): 139 | # Grab list of stats/columns from dataframe 140 | stats = list(dft.columns.values) 141 | 142 | # Grab list of teams 143 | teams = np.unique(dft.home_team.values) 144 | teams = teams[teams != 'TOT'] 145 | teams = list(teams) 146 | 147 | # Grab list of years/seasons 148 | years = list(np.unique(dft.year.values).astype(str)) 149 | 150 | # Create a list of games (each team plays games 1-82) 151 | games = list(np.arange(1,83).astype(str)) 152 | 153 | team_sel = Select(title="Team:", options=teams, value='ATL') 154 | year_sel = Select(title="Season:", options=years, value='2017') 155 | game_sel = Select(title="Game:", options=games, value='1') 156 | stint_sel = TextInput(title="Stints Containing Player:") 157 | y_axis = Select(title="Y Axis", options=stats, value="ht_margin") 158 | 159 | # Create a data source dictionary for storing data with each update 160 | #source = ColumnDataSource(data=dict(x=[], y=[], home_play=[], away_play=[], color=[], alpha=[])) 161 | source = ColumnDataSource(data=dict(x=[], y=[], home_play=[], away_play=[])) 162 | 163 | # Create tooltips object for hover variables, 164 | # and create figure for scatterplot 165 | TOOLTIPS=[ 166 | ("H.T. Play:", "@home_play"), 167 | ("A.T. Play:", "@away_play") 168 | ] 169 | 170 | p = figure(plot_height=550, plot_width=1000, title="", tooltips=TOOLTIPS, sizing_mode="scale_both") 171 | #p.line(x="x", y="y", source=source, line_width=2, color='color', line_alpha='alpha') 172 | p.line(x="x", y="y", source=source, line_width=2, color='black') 173 | 174 | # Create controls for filtering plotted data 175 | controls = [ team_sel, year_sel, game_sel, stint_sel, y_axis ] 176 | for control in controls: 177 | control.on_change('value', lambda attr, old, new: ControlUpdate(dft, source, controls, p)) 178 | 179 | # Do a preliminary update of plot 180 | ControlUpdate(dft, source, controls, p) 181 | 182 | # Create layout by column 183 | inputs = column(*controls, width=250, height=600) 184 | inputs.sizing_mode = "fixed" 185 | l = layout([ 186 | [inputs, p], 187 | #], sizing_mode="scale_both") 188 | ]) 189 | 190 | # Make a tab with the layout 191 | tab = Panel(child=l, title = 'Game Play-By-Play') 192 | 193 | return tab 194 | 195 | -------------------------------------------------------------------------------- /bokeh_app/tabs/players.py: -------------------------------------------------------------------------------- 1 | from os.path import dirname, join 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | from bokeh.io import curdoc 8 | from bokeh.layouts import column, layout 9 | from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, Panel 10 | from bokeh.models.widgets import Tabs, DataTable, DateFormatter, TableColumn 11 | from bokeh.plotting import figure 12 | 13 | def ControlUpdate(df, psource, tsource, controls, plot, table): 14 | mask = PlayerMask(df, controls) 15 | x_name = controls[6].value 16 | y_name = controls[7].value 17 | 18 | df["color"] = np.where(mask, "red", "grey") 19 | df["alpha"] = np.where(mask, 0.9, 0.25) 20 | df_mask = df[mask] 21 | 22 | # Title 23 | #plot.title.text = "%d players selected" % len(dfp) 24 | #plot.title.text_font_size = '20pt' 25 | #plot.title.text_font = 'serif' 26 | #plot.title.align = 'center' 27 | 28 | # Axis titles 29 | plot.xaxis.axis_label = x_name 30 | plot.yaxis.axis_label = y_name 31 | plot.xaxis.axis_label_text_font_size = '12pt' 32 | plot.xaxis.axis_label_text_font_style = 'bold' 33 | plot.yaxis.axis_label_text_font_size = '12pt' 34 | plot.yaxis.axis_label_text_font_style = 'bold' 35 | 36 | # Tick labels 37 | plot.xaxis.major_label_text_font_size = '12pt' 38 | plot.yaxis.major_label_text_font_size = '12pt' 39 | 40 | table.columns = [ 41 | TableColumn(field="name", title='Name'), 42 | TableColumn(field="year", title='Season'), 43 | TableColumn(field="x", title=x_name), 44 | TableColumn(field="y", title=y_name), 45 | ] 46 | 47 | psource.data = dict( 48 | x = df[x_name], 49 | y = df[y_name], 50 | name = df['name'], 51 | year = df['year'], 52 | team = df['team'], 53 | color = df['color'], 54 | alpha = df['alpha'] 55 | ) 56 | 57 | tsource.data = dict( 58 | x = df_mask[x_name], 59 | y = df_mask[y_name], 60 | name = df_mask['name'], 61 | year = df_mask['year'], 62 | team = df_mask['team'], 63 | color = df_mask['color'], 64 | alpha = df_mask['alpha'] 65 | ) 66 | 67 | def PlayerMask(df, controls): 68 | player = controls[0].value 69 | team = controls[3].value 70 | 71 | if player == '': 72 | mask_player = np.ones(len(df.index), dtype=bool) 73 | else: 74 | mask_player = df.name.str.lower().str.contains(player) 75 | 76 | if team == 'All': 77 | mask_team = np.ones(len(df.index), dtype=bool) 78 | else: 79 | if team == 'Multiple': 80 | mask_team = (df.team == 'TOT') 81 | else: 82 | mask_team = (df.team == team) 83 | 84 | mask_year = ((df.year >= controls[4].value) & (df.year <= controls[5].value)) 85 | mask_age = ((df.age >= controls[1].value) & (df.age <= controls[2].value)) 86 | 87 | mask = np.logical_and(mask_player, mask_team) 88 | mask = np.logical_and(mask, mask_year) 89 | mask = np.logical_and(mask, mask_age) 90 | 91 | return mask 92 | 93 | 94 | def player_tab(dfp): 95 | # Grab list of stats/columns from dataframe 96 | #stats = sorted(dfp.columns.values) 97 | stats = list(dfp.columns.values) 98 | 99 | # Grab list of teams 100 | teams = np.unique(dfp.team.values) 101 | teams = np.insert(teams, 0, 'All') 102 | teams = np.insert(teams, 1, 'Multiple') 103 | teams = teams[teams != 'TOT'] 104 | teams = list(teams) 105 | 106 | # Grab the minimum and maximum player ages 107 | age_low = min(dfp.age.values) 108 | age_high = max(dfp.age.values) 109 | 110 | #axis_map = { 111 | # "2 PT %": "2PP_PH", 112 | # "3 PT %": "3PP_PH", 113 | #} 114 | 115 | player_sel = TextInput(title="Player Names:") 116 | min_age = Slider(title="Min Age", start=age_low, end=age_high, value=age_low, step=1) 117 | max_age = Slider(title="Max Age", start=age_low, end=age_high, value=age_high, step=1) 118 | team_sel = Select(title="Team:", options=teams, value='All') 119 | min_year = Slider(title="Starting Season", start=2016, end=2020, value=2016, step=1) 120 | max_year = Slider(title="Ending Season", start=2016, end=2020, value=2019, step=1) 121 | #x_axis = Select(title="X Axis", options=sorted(axis_map.keys()), value="2 PT %") 122 | #y_axis = Select(title="Y Axis", options=sorted(axis_map.keys()), value="3 PT %") 123 | x_axis = Select(title="X Axis", options=stats, value="FG_FREQ_05FT") 124 | y_axis = Select(title="Y Axis", options=stats, value="FG_FREQ_GT24FT") 125 | 126 | # Create a data source dictionary for storing data with each update 127 | psource = ColumnDataSource(data=dict(x=[], y=[], name=[], year=[], team=[], color=[], alpha=[])) 128 | tsource = ColumnDataSource(data=dict(x=[], y=[], name=[], year=[], team=[], color=[], alpha=[])) 129 | 130 | # Create tooltips object for hover variables, 131 | # and create figure for scatterplot 132 | TOOLTIPS=[ 133 | ("Name", "@name"), 134 | ("Year", "@year"), 135 | ("Team", "@team") 136 | ] 137 | 138 | p = figure(plot_height=600, plot_width=700, title="", tooltips=TOOLTIPS, sizing_mode="scale_both") 139 | p.circle(x="x", y="y", source=psource, size=7, line_color=None, color='color', fill_alpha='alpha') 140 | 141 | columns = [ 142 | TableColumn(field="name", title='Name'), 143 | TableColumn(field="year", title='Season'), 144 | TableColumn(field="x", title=x_axis.value), 145 | TableColumn(field="y", title=y_axis.value), 146 | ] 147 | data_table = DataTable(source=tsource, columns=columns, width=275, height=550) 148 | 149 | # Create controls for filtering plotted/table data 150 | controls = [ player_sel, min_age, max_age, team_sel, min_year, max_year, x_axis, y_axis ] 151 | for control in controls: 152 | control.on_change('value', lambda attr, old, new: ControlUpdate(dfp, psource, tsource, controls, p, data_table)) 153 | 154 | # Do a preliminary update of plot and table 155 | ControlUpdate(dfp, psource, tsource, controls, p, data_table) 156 | 157 | # Create layout by column 158 | inputs = column(*controls, width=250, height=600) 159 | inputs.sizing_mode = "fixed" 160 | l = layout([ 161 | [inputs, p, data_table], 162 | #], sizing_mode="scale_both") 163 | ]) 164 | 165 | # Make a tab with the layout 166 | tab = Panel(child=l, title = 'Player Stats') 167 | 168 | return tab 169 | 170 | -------------------------------------------------------------------------------- /bokeh_app/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends base %} 2 | 3 | {% block preamble %} 4 | 5 | 6 | {% endblock %} 7 | 8 | {% block postamble %} 9 | 12 | {% endblock %} 13 | 14 | {% block contents %} 15 | 19 | 20 |
21 | 22 | {{ super() }} 23 |
24 | {% endblock %} 25 | -------------------------------------------------------------------------------- /bokeh_app/templates/styles.css: -------------------------------------------------------------------------------- 1 | .page-header { 2 | /* 3 | background-color: #C25100; 4 | */ 5 | background-color: #ffffff; 6 | width: 100%; 7 | margin-top: 0; 8 | padding: 0px 20px 10px 30px; 9 | } 10 | .page-header h1 { 11 | /* 12 | color: #ffffff; 13 | font-family: 'Julius Sans One', serif; 14 | */ 15 | font-family: 'Alegreya Sans SC'; 16 | font-size: 24pt; 17 | text-decoration: none; 18 | } 19 | .page-header p { 20 | /* 21 | color: #ffffff; 22 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 23 | */ 24 | font-family: 'Alegreya Sans SC'; 25 | font-size: 15pt; 26 | font-style: italic; 27 | text-align: justify; 28 | text-justify: inter-word; 29 | } 30 | .content { 31 | width: 100%; 32 | margin-left: 15px; 33 | } 34 | 35 | -------------------------------------------------------------------------------- /img/nba_stats_explorer_curry_lineup_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jecutter/nba-data-models/63a81fa04eb4eb3a1b0caa5da2c105a4442d32f9/img/nba_stats_explorer_curry_lineup_ex.png -------------------------------------------------------------------------------- /img/nba_stats_explorer_curry_pbp_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jecutter/nba-data-models/63a81fa04eb4eb3a1b0caa5da2c105a4442d32f9/img/nba_stats_explorer_curry_pbp_ex.png -------------------------------------------------------------------------------- /img/nba_stats_explorer_curry_player_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jecutter/nba-data-models/63a81fa04eb4eb3a1b0caa5da2c105a4442d32f9/img/nba_stats_explorer_curry_player_ex.png --------------------------------------------------------------------------------