├── images ├── gb cm.png ├── nn cm.png ├── rockets.png ├── cm logreg.png ├── fg by zone.png ├── gb feats.png ├── shot_dist.png ├── shot_zones.png ├── harden recs.png ├── team heatmap.png ├── all_roc_curves.png ├── harden heatmap.png ├── model results.png └── sc_shot_chart.png ├── nba_shots_scraper.py ├── neural_net.py ├── README.md ├── plotly_viz.py ├── shot_chart_viz.py ├── shallow_ML_models.py ├── new_ETL.py ├── presentation.py └── Data-Exploration.ipynb /images/gb cm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/gb cm.png -------------------------------------------------------------------------------- /images/nn cm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/nn cm.png -------------------------------------------------------------------------------- /images/rockets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/rockets.png -------------------------------------------------------------------------------- /images/cm logreg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/cm logreg.png -------------------------------------------------------------------------------- /images/fg by zone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/fg by zone.png -------------------------------------------------------------------------------- /images/gb feats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/gb feats.png 
-------------------------------------------------------------------------------- /images/shot_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/shot_dist.png -------------------------------------------------------------------------------- /images/shot_zones.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/shot_zones.png -------------------------------------------------------------------------------- /images/harden recs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/harden recs.png -------------------------------------------------------------------------------- /images/team heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/team heatmap.png -------------------------------------------------------------------------------- /images/all_roc_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/all_roc_curves.png -------------------------------------------------------------------------------- /images/harden heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/harden heatmap.png -------------------------------------------------------------------------------- /images/model results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/model results.png 
-------------------------------------------------------------------------------- /images/sc_shot_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/sc_shot_chart.png -------------------------------------------------------------------------------- /nba_shots_scraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | import numpy as np 4 | from data.all_players_list import players_list 5 | import time 6 | from court import court_shapes 7 | 8 | vets = [player[0:5] for player in players_list if (player[3] >1990) & (player[4] >2014)] 9 | 10 | vets_df = pd.DataFrame(vets, columns=['ID', 'Name', 'Active', 'RookieYear', 'LastSeasonPlayed']) 11 | vets_df = vets_df.drop(columns=['Active', 'RookieYear', 'LastSeasonPlayed']) 12 | 13 | player_ids = [player[0] for player in vets] 14 | 15 | #MULTIPLE YEARS 16 | sc_url_1 = 'https://stats.nba.com/stats/shotchartdetail?AheadBehind=&CFID=33&CFPARAMS=' 17 | sc_url_2 = '&ClutchTime=&Conference=&ContextFilter=&ContextMeasure=FGA&DateFrom=&DateTo=&Division=&EndPeriod=10&EndRange=28800&GROUP_ID=&GameEventID=&GameID=&GameSegment=&GroupID=&GroupMode=&GroupQuantity=5&LastNGames=0&LeagueID=00&Location=&Month=0&OnOff=&OpponentTeamID=0&Outcome=&PORound=0&Period=0&PlayerID=' 18 | sc_url_3 = '&PlayerID1=&PlayerID2=&PlayerID3=&PlayerID4=&PlayerID5=&PlayerPosition=&PointDiff=&Position=&RangeType=0&RookieYear=&Season=' 19 | sc_url_4 = '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StartPeriod=1&StartRange=0&StarterBench=&TeamID=0&VsConference=&VsDivision=&VsPlayerID1=&VsPlayerID2=&VsPlayerID3=&VsPlayerID4=&VsPlayerID5=&VsTeamID=' 20 | 21 | headers = requests.utils.default_headers() 22 | headers.update({ 23 | "user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', 
24 | }) 25 | 26 | #year in yyyy-yy format (i.e. '2017-18') 27 | def get_all_players_shot_data(player_ids, year): 28 | all_shots = [] 29 | c=0 30 | start=time.time() 31 | for player_id in player_ids: 32 | full_url = sc_url_1 + str(year) + sc_url_2 + str(player_id) + sc_url_3 + str(year) + sc_url_4 33 | shots = requests.get(full_url, headers=headers).json() 34 | all_shots.append(shots) 35 | time.sleep(1) 36 | c+=1 37 | if c%50==0: 38 | print('Runtime: {} seconds. {} players completed'.format(time.time()-start, c)) 39 | return all_shots 40 | 41 | def convert_dict_to_df(all_shot_data): 42 | start=time.time() 43 | 44 | league_avgs = all_shot_data[0]['resultSets'][1]['rowSet'] 45 | league_avg_columns = all_shot_data[0]['resultSets'][1]['headers'] 46 | league_avgs_df = pd.DataFrame.from_records(league_avgs, columns=league_avg_columns) 47 | 48 | columns = all_shot_data[0]['resultSets'][0]['headers'] 49 | 50 | df_list=[] 51 | 52 | for player in all_shot_data: 53 | data = player['resultSets'][0]['rowSet'] 54 | player_df = pd.DataFrame.from_records(data, columns=columns) 55 | df_list.append(player_df) 56 | 57 | df = pd.concat(df_list, ignore_index=True) 58 | print('Total Runtime: {} seconds.'.format(time.time()-start)) 59 | 60 | return df, league_avgs_df 61 | 62 | all_shots_1415 = get_all_players_shot_data(player_ids, '2014-15') 63 | shots_1415_df, lg_avgs_1415 = convert_dict_to_df(all_shots_1415) 64 | 65 | shots_1415_df.to_csv('data/shots_1415.csv') 66 | -------------------------------------------------------------------------------- /neural_net.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import os, itertools 5 | 6 | from sklearn.preprocessing import MinMaxScaler 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import confusion_matrix, classification_report 9 | 10 | import keras 11 | from keras.layers import Dense, 
Dropout, LSTM 12 | from keras.models import Sequential, load_model 13 | from keras.callbacks import EarlyStopping, TensorBoard 14 | 15 | #####LOAD DATA##### 16 | if False: 17 | df = pd.read_csv('data/final_df.csv', index_col=0) 18 | 19 | X = df.drop(columns=['name', 'pos', 'age', 'player_id', 'team_name', 'team_id', 'game_date', 'game_id', 'game_event_id', 'season', 'minutes_remaining', 'seconds_remaining', 'action_type', 'shot_type', 'opponent','opp_id', 20 | 'defender_name', 'htm', 'vtm', 'defender_id', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'Heave', 'dribbles','shot_distance', 'shot_made_flag']) 21 | y = np.array(df.shot_made_flag) 22 | 23 | minmax_scale = MinMaxScaler() 24 | X = minmax_scale.fit_transform(X) 25 | 26 | np.save('X_y_arrays/X_', X) 27 | np.save('X_y_arrays/y_', y) 28 | #####SPLIT DATA INTO TRAIN/TEST SETS##### 29 | if True: 30 | X = np.load('X_y_arrays/X_.npy') 31 | y = np.load('X_y_arrays/y_.npy') 32 | 33 | X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=23,test_size=.2) 34 | 35 | #####HELPER FUNCTION TO PLOT CM##### 36 | def plot_confusion_matrix(cm, name, cmap=plt.cm.Blues): 37 | #Create the basic matrix. 
38 | fig = plt.figure(figsize=(6, 6)) 39 | plt.imshow(cm, cmap) 40 | 41 | #Add title and Axis Labels 42 | plt.title(name + ' - ' 'Confusion Matrix') 43 | plt.xlabel('Predicted') 44 | plt.ylabel('Actual') 45 | 46 | #Add appropriate Axis Scales 47 | tick_marks = np.arange(0,2) 48 | plt.xticks(tick_marks, ['Miss', 'Make']) 49 | plt.yticks(tick_marks, ['Miss', 'Make']) 50 | 51 | #Add Labels to Each Cell 52 | thresh = 0.75 * cm.max() 53 | 54 | #Add a Side Bar Legend Showing Colors 55 | plt.colorbar() 56 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 57 | plt.text(j, i, cm[i, j], 58 | horizontalalignment="center", 59 | color="black" if cm[i, j] <= thresh else "white") 60 | 61 | plt.tight_layout() 62 | fig.savefig('./models/nn/cm/' + name + '.png', bbox_inches='tight', dpi=480) 63 | plt.show() 64 | 65 | def plot_val_loss_acc(model, name): 66 | model_val_dict = model.history.history 67 | loss_values = model_val_dict['loss'] 68 | val_loss_values = model_val_dict['val_loss'] 69 | acc_values = model_val_dict['acc'] 70 | val_acc_values = model_val_dict['val_acc'] 71 | 72 | epochs_ = range(1, len(loss_values) + 1) 73 | plt.plot(epochs_, loss_values, 'g', label='Training loss') 74 | plt.plot(epochs_, val_loss_values, 'g.', label='Validation loss') 75 | plt.plot(epochs_, acc_values, 'r', label='Training acc') 76 | plt.plot(epochs_, val_acc_values, 'r.', label='Validation acc') 77 | 78 | plt.title(name + ' - Training & validation loss / accuracy') 79 | plt.xlabel('Epochs') 80 | plt.ylabel('Loss') 81 | plt.legend() 82 | plt.savefig('models/nn/val_loss_acc/' + name + '.png', bbox_inches='tight') 83 | plt.show() 84 | 85 | #####NEURAL NETWORK GENERATOR##### 86 | def build_nn__(X_train, X_test, y_train, y_test, activation, epochs, batch_size, name, nodes, dropout): 87 | 88 | adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) 89 | 90 | nn_ = Sequential() 91 | 92 | #First layer 93 | 
nn_.add(Dense(X_train.shape[1], input_shape=(X_train.shape[1],), activation=activation)) 94 | #Iterate through number of nodes and add hidden layers 95 | for i, node in enumerate(nodes): 96 | nn_.add(Dense(node, activation=activation)) 97 | if dropout[i]==True: 98 | nn_.add(Dropout(0.2)) 99 | #Output layer, use 'sigmoid' activation for binary classfication 100 | nn_.add(Dense(1, activation='sigmoid')) 101 | 102 | #Show NN summary 103 | nn_.summary() 104 | #Compile model 105 | nn_.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc']) 106 | 107 | #Add early stopping and tensorboard callbacks 108 | early_stopping = EarlyStopping(monitor='val_loss', min_delta = 0.001, patience = 15, verbose=1, mode='auto', baseline=None) 109 | tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, embeddings_data=None) 110 | 111 | #Fit model 112 | nn_.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, verbose = 1, validation_split=0.1, callbacks = [early_stopping, tensorboard]) 113 | 114 | plot_val_loss_acc(nn_, activation + '_' + name) 115 | 116 | nn_.save('./models/nn/' + name + '_' + activation +'_' + str(epochs) + '_' + str(batch_size) + '_' + str(len(nodes)) + '_' + '_'.join([str(i) for i in nodes]) + '.h5') 117 | 118 | print(nn_.evaluate(X_test, y_test)) 119 | 120 | cm = confusion_matrix(nn_.predict_classes(X_test), y_test) 121 | print(cm) 122 | plot_confusion_matrix(cm, activation + '_' + name) 123 | 124 | print('Test Set Classification Report') 125 | print(classification_report(nn_.predict_classes(X_test), y_test, target_names=['Miss','Make'])) 126 | return nn_ 127 | 128 | nn = build_nn__(X_train, X_test, y_train, y_test, activation='relu', epochs=50, batch_size=32, name='16th_run_101', nodes=[128,128,64,64,32,32,16,8], dropout=[False, False, False, False, False, False, False, False]) 129 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NBA Shot Analysis 2 | 3 | ## Goal 4 | Build a classification model to predict whether an NBA shot will go in or not, and create visualizations to help general managers, coaches, and players identify shooting patterns, eliminate bad shots, and optimize their strategy to increase shooting efficiency. 5 | 6 | ## ETL 7 | I gathered my data from three sources: 8 | - Shot location data scraped from stats.nba.com (see my blog post for more detail) 9 | - Player tracking data from nbasavant.com 10 | - Defensive stats from basketball-reference 11 | 12 | Since the NBA stopped providing tracking data such as the number of dribbles and defender distance in the middle of the 2016 season, I focused my project on the 2014-15 season. I gathered data on over 200,000 shots, with features including, but not limited to: 13 | - Shot distance, (x,y) coordinates, and shot zone 14 | - Touch time and number of dribbles 15 | - Name and distance of the closest defender 16 | - Game context stats such as shot clock remaining, period, game clock 17 | - Shot type (jump shot, dunk, etc.) 18 | 19 | I wanted to add more context to each shot, so I added advanced defensive stats for each defender (Block %, Defensive Win Shares/48, Defensive Box Score Plus Minus) and team (Defensive Rating). 20 | 21 | The data I gathered had two different zone breakdowns, one which detailed the directional area of the court (left, right or center) and the other which detailed a more precise location (paint, corner 3, etc.). I combined these into 15 zones, as seen below, and for every player I calculated their Field Goal % (FG%) in each zone so that my model would have a better understanding of the quality of the shot. 
22 | 23 | 24 | 25 | I have never been a fan of the argument that momentum impacts basketball games, and have often argued against the concept of a "hot hand" which posits that a player is more likely to hit a shot if they have hit consecutive prior shots. In an attempt to disprove this hypothesis, I engineered new features that detailed whether the shooter has scored their previous 1, 2, and 3 shots. My models found that hitting prior shots did not have a significant impact on whether a player will score their next shot. 26 | 27 | ## Visualizations 28 | I wanted to create a wide range of visualizations that would show the frequency and efficiency of players' and teams' shots. 29 | 30 | #### Binned Shot Chart 31 | The first visualization I made is a binned shot chart that breaks the court down into equally sized hexes and groups nearby shots into bubbles, with the size determined by frequency and color by FG%. The color scale differed for twos and threes to account for the point value of each shot. I also added the player's image and some additional stats to the chart. In my dashboard, there is a dropdown where you can select any player, and there is also an option to change the bubble size depending on whether you want to see a more precise or broad shot chart. 32 | 33 | 34 | 35 | I made similar charts for each team, where you can get a strong sense of their shooting efficiency and frequency distribution. 36 | 37 | 38 | 39 | #### Frequency Shot Heatmap 40 | In order to get a better sense of where players and teams are shooting from, disregarding efficiency, I designed a heatmap to show the locations where they most frequently shoot from, complete with a dropdown that allows you to select any player or team. 41 | 42 | 43 | 44 | #### FG Frequency Bar Plot 45 | To visualize how the league distributes its shots, I added an interactive bar plot to my dashboard that shows FG% and the number of shots for a given feature that can be selected from a dropdown. 
46 | 47 | 48 | 49 | #### FG Percentage Bar Plot 50 | To visualize FG% without focusing on frequency, I built an interactive bar plot that shows leaguewide FG% and the number of shots for a range of features that can be selected from a dropdown. 51 | 52 | 53 | 54 | #### Team Points Per Shot Heatmap Matrix 55 | I wanted to compare how teams perform in different contexts, so I created a heatmap matrix that helps visualize which teams under- and overperform in certain aspects. The color of each box is determined by the team's points per shot (PPS) given the selected feature/context. This gives teams a better sense of where they need to improve and how they stack up against the rest of the league. 56 | 57 | 58 | 59 | ## Machine Learning Models 60 | I trained 6 different machine learning classification models to predict whether a given shot would go in. The models I used were the following: 61 | - Logistic Regression 62 | - Random Forest 63 | - Gradient Boosting 64 | - AdaBoost 65 | - XGBoost 66 | - Neural Network 67 | 68 | For each model, I went through a cross-validation process to help narrow down my feature set to only the most important ones that did not show signs of multicollinearity with other included features. I ultimately narrowed down my initial set of over 20 features to the following 6: 69 | - Shot Distance 70 | - Zone FG% 71 | - Defensive Win Shares per 48 Minutes 72 | - Defender Distance 73 | - Touch Time 74 | - Shot Clock Remaining 75 | 76 | ###### Feature Importances (Gradient Boosting Classifier) 77 | 78 | 79 | Due to the inconsistency in scale of my numeric features (FG% is a decimal but shot distance is measured in feet), I used Scikit-Learn's MinMaxScaler to normalize and vectorize my data. My cross-validation process included hyperparameter tuning for each of my models by running a grid search with Stratified K-Fold splits to ensure that the class balance remained consistent across all splits. 
80 | For the Neural Network, I used one hidden layer that contained 50 nodes, 'relu' activation due to the lack of negative values, and the 'adam' optimizer to obtain my best results. 81 | 82 | ###### ROC curves 83 |

84 | 85 |

86 | 87 | ###### Confusion Matrix Comparisons (left: Logistic Regression, center: Gradient Boosting, right: Neural Network) 88 | 89 | 90 | My best performing model depends on how a team values the bias/variance tradeoff and whether they would prefer to minimize false negatives (predicting a miss when it's actually a make) or false positives (predicting a make when it's in fact a miss). A more aggressive team would prefer the Neural Network, which only recommended not to shoot when it was extremely confident the shot would miss, but often recommended the player should shoot, albeit with less than 40% accuracy. An aggressive team would be fine with this model because it limited false negatives and gave the team more chances to score. 91 | 92 | On the other hand, a more conservative team might prefer the Gradient Boosting model, which correctly classified makes with a much higher accuracy, yet only recommended a shot ~30% of the time. It would likely lead to a higher FG%, but limits the potential scoring opportunities by recommending a team take fewer shots. The Logistic Regression model is far more balanced, sacrificing some overall accuracy for better precision and recall. 93 | 94 | ###### Model Results 95 | 96 | 97 | In addition to my individual models, I built a stacked ensemble model that trained the XGBoost, Random Forest, and AdaBoost classifiers, and then trained a Gradient Boosting model on their outputs. This would, in theory, give less biased predictions by weighing multiple models; however, its results were unfortunately worse than my single-layer models, so I discarded it. 98 | 99 | ## Shot Recommender 100 | For each player, I built a recommender system that outputs certain zones where the player should shoot more or less frequently from. The concept is based on the player's PPS relative to the league average in each zone. A player who has a high expected PPS relative to the league average in a zone would be recommended to shoot there more frequently. 
Conversely, a player who shoots poorly in a zone would be recommended to shoot less. In the future, I want to tune this recommender by accounting for the player's frequency of shots in each zone, so that it does not recommend a player shoot more in a zone that already contains a high percentage of their total shots. 101 | ###### Recommender Output 102 | 103 | 104 | ## Next Steps 105 | - Adjust the color scale of binned plots to display efficiency relative to the league average, either in terms of FG% or PPS 106 | - Tune the shot recommender to provide ideal shot distributions 107 | - Classify 2s and 3s differently in my models to see if certain models predict one shot type with higher accuracy than others 108 | - Cluster similarly skilled shooters and recommend an optimal shooting lineup that covers each shot zone 109 | - Host the project online using Dash and Flask instead of the Jupyter Notebook dashboard 110 | 111 | ## Credits 112 | * Kirk Goldsberry for inspiring me to work on this project 113 | * Savvas Tjortjoglou for his court dimensions 114 | -------------------------------------------------------------------------------- /plotly_viz.py: -------------------------------------------------------------------------------- 1 | ############################### IMPORTS ############################### 2 | if True: 3 | import plotly 4 | import plotly.plotly as py 5 | import plotly.graph_objs as go 6 | plotly.offline.init_notebook_mode(connected=True) 7 | 8 | import matplotlib 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import numpy as np 12 | import pandas as pd 13 | pd.set_option('display.max_columns',100) 14 | 15 | from court import court_shapes 16 | 17 | import warnings 18 | warnings.filterwarnings('ignore') 19 | 20 | import itertools, math, time, re, pickle 21 | 22 | ############################## LOAD DATA ############################## 23 | df = pd.read_csv('data/clean_df_1415.csv',index_col=0) 24 | zone_ids = 
pd.read_csv('data/zone_ids.csv',index_col=0) 25 | zone_fg_pct = pd.read_csv('data/zone_fg_pct.csv',index_col=0) 26 | 27 | ############################## CLEANING DATA ############################ 28 | def basic_cleaning(df): 29 | df.period[df.period>5]=5 30 | df.touch_time[df.touch_time<0]=0 31 | df.touch_time[df.touch_time>24]=24 32 | #df.touch_time=round(df.touch_time*4)/4 33 | df.defender_distance[df.defender_distance>10]=10 34 | #df.shot_clock[df.shot_clock>3] = round(df.shot_clock[df.shot_clock>3]*4)/4 35 | df.shot_distance[df.shot_distance>40]=40 36 | df.blk_pct[df.blk_pct>10]=10 37 | df.dbpm[df.dbpm>5.5]=5.5 38 | df['pps'] = df.shot_type*df.shot_made_flag 39 | #basic_cleaning(df) 40 | 41 | ###################################################################### 42 | ###################################################################### 43 | ###########################--SHOT CHARTS--############################ 44 | ###################################################################### 45 | ###################################################################### 46 | 47 | ######################--DRAW PLAYER SHOT CHART--###################### 48 | def draw_shot_chart(name): 49 | player = df[df.name==name] 50 | 51 | missed_shot_trace = go.Scattergl( 52 | x = player[player.shot_made_flag == 0]['x'], 53 | y = player[player.shot_made_flag == 0]['y'], 54 | mode = 'markers', 55 | name = 'Make', 56 | marker= dict(color='blue', symbol='x', size=8, line={'width':1}, opacity=0.7), 57 | text = [str(sd) for sd in player[player.shot_made_flag == 0]['action_type']], 58 | hoverinfo = 'text' 59 | ) 60 | made_shot_trace = go.Scattergl( 61 | x = player[player.shot_made_flag == 1]['x'], 62 | y = player[player.shot_made_flag == 1]['y'], 63 | mode = 'markers', 64 | name='Make', 65 | marker= dict(color='red', symbol='circle', size=8, line={'width':1}, opacity=0.7), 66 | text = [str(sd) for sd in player[player.shot_made_flag == 1]['action_type']], 67 | hoverinfo = 'text' 68 | ) 69 | 70 
| data = [missed_shot_trace, made_shot_trace] 71 | layout = go.Layout( 72 | title= name + ' Shot Chart 2014-2015', 73 | showlegend =True, 74 | xaxis={'showgrid':False, 'range':[-250,250]}, 75 | yaxis={'showgrid':False, 'range':[-47.5,500]}, 76 | height = 600, 77 | width = 650, 78 | shapes=court_shapes) 79 | 80 | fig = go.Figure(data=data, layout=layout) 81 | plotly.offline.iplot(fig, filename = name + ' Shot Chart') 82 | 83 | ########################--GROUPED SHOT CHART--######################## 84 | def grouped_plot(feature): 85 | groups = df.groupby(feature) 86 | colors = np.linspace(0,1,len(groups)) 87 | 88 | color_list = ['aliceblue', 'aqua', 'steelblue','violet', 'blue', 89 | 'blueviolet', 'brown', 'cadetblue', 90 | 'chartreuse', 'darkgreen', 'darkmagenta', 'tomato', 91 | 'gold', 'red', 'slategray'] 92 | counter=0 93 | data = [] 94 | for g, c in zip(groups, colors): 95 | data.append(go.Scattergl( 96 | x = g[1].x, 97 | y = g[1].y, 98 | mode = 'markers', 99 | name = g[0], 100 | marker= dict(symbol='circle', size=7, 101 | line={'width':1}, opacity=0.7, color=color_list[counter]), 102 | text = g[0], 103 | hoverinfo = 'text') 104 | ) 105 | counter+=1 106 | 107 | layout = go.Layout( 108 | title='Shot Distribution by ' + feature.title(), 109 | showlegend =True, 110 | xaxis={'showgrid':False, 'range':[-250,250]}, 111 | yaxis={'showgrid':False, 'range':[-47.5,500]}, 112 | height = 600, 113 | width = 750, 114 | shapes=court_shapes) 115 | 116 | fig = go.Figure(data=data, layout=layout) 117 | plotly.offline.iplot(fig, filename = 'Shot Zone Breakdown') 118 | 119 | ########################--FREQUENCY BAR PLOT--######################## 120 | def freq_bar_plots(df, feature, round_=False): 121 | df_ = df.copy() 122 | if round_==True: 123 | df_[feature] = round(df_[feature]) 124 | 125 | feat_tab = pd.crosstab(df_[feature], df_.shot_made_flag, margins=True) 126 | feat_tab['fg_pct'] = round(feat_tab[1]/feat_tab['All'],3) 127 | 128 | tab=feat_tab.drop(columns='All')[:-1] 129 | 
make_text= [str(round(t*100,1)) + '%' for t in tab.fg_pct] 130 | miss_text= [str(round((1-t)*100,1)) + '%' for t in tab.fg_pct] 131 | 132 | trace1 = go.Bar( 133 | x=tab.index, 134 | y=tab[1], 135 | name='Makes', 136 | text= make_text , 137 | textposition = 'inside', 138 | textfont=dict( 139 | family='sans serif', size=12, color='white'), 140 | marker=dict( 141 | color='red'), 142 | opacity=0.75 143 | ) 144 | trace2 = go.Bar( 145 | x=tab.index, 146 | y=tab[0], 147 | name='Misses', 148 | text= miss_text, 149 | textposition = 'inside', 150 | textfont=dict( 151 | family='sans serif', size=10, color='white'), 152 | marker=dict( 153 | color='blue'), 154 | opacity=0.75 155 | ) 156 | 157 | line = go.Scatter( 158 | x=tab.index, 159 | y=tab[1], 160 | mode='markers+lines', 161 | name='# Makes', 162 | hoverinfo='skip', 163 | line=dict( 164 | color='black', width=.75) 165 | ) 166 | 167 | data = [trace1, trace2, line] 168 | layout = go.Layout( 169 | barmode='stack', 170 | title='FG% by ' + feature.title().replace('_',' '), 171 | showlegend =True, 172 | xaxis=dict( 173 | automargin=True, 174 | autorange=True, 175 | ticks='', 176 | showticklabels=True, 177 | #tickangle=25, 178 | title=feature.replace('_',' ').title() 179 | ), 180 | yaxis=dict( 181 | automargin=True, 182 | ticks='', 183 | showticklabels=True, 184 | title='# of Shots' 185 | ) 186 | ) 187 | 188 | fig = go.Figure(data=data, layout=layout) 189 | plotly.offline.iplot(fig, filename='stacked-bar') 190 | 191 | ########################--PERCENTAGE BAR CHART--######################## 192 | def pct_bar_plots(feature, dataframe, round_=False, player=None, team=None): 193 | if round_==True: 194 | df_ = dataframe.copy() 195 | df_[feature] = round(df_[feature]) 196 | else: 197 | df_ = dataframe 198 | 199 | if player: 200 | df = df_[df_.name==player.title()] 201 | title= player.title() + ' - FG% by ' + feature.title().replace('_',' ') 202 | elif team: 203 | df = df_[df_.team_name==team.title()] 204 | title= team.title() + ' - FG% 
by ' + feature.title().replace('_',' ') 205 | else: 206 | df = df_ 207 | title= 'FG% by ' + feature.title().replace('_',' ') 208 | 209 | 210 | test=pd.crosstab(df[feature], df.shot_made_flag, margins=True) 211 | test['pct_made'] = test[1]/test.All 212 | test['pct_missed'] = 1-test.pct_made 213 | 214 | made_text= [str(round(t*100,1)) + '%' for t in test.pct_made] 215 | missed_text= [str(round(t*100,1)) + '%' for t in test.pct_missed] 216 | 217 | trace1 = go.Bar( 218 | x=test.index, 219 | y=test.pct_made, 220 | name='Makes', 221 | text= made_text, 222 | textposition = 'auto', 223 | textfont=dict( 224 | family='sans serif', 225 | size=12, color='white'), 226 | marker=dict( 227 | color='red'), 228 | opacity=0.75 229 | ) 230 | trace2 = go.Bar( 231 | x=test.index, 232 | y=test.pct_missed, 233 | name='Misses', 234 | text= missed_text, 235 | textposition = 'auto', 236 | textfont=dict( 237 | family='sans serif', 238 | size=12, color='white'), 239 | marker=dict( 240 | color='blue'), 241 | opacity=0.75, 242 | ) 243 | 244 | data = [trace1, trace2] 245 | layout = go.Layout( 246 | barmode='stack', 247 | title= title, 248 | showlegend =True, 249 | ) 250 | 251 | fig = go.Figure(data=data, layout=layout) 252 | plotly.offline.iplot(fig, filename='stacked-bar') 253 | 254 | ############################--PPS HEATMAP--############################# 255 | #FIX FUNCTION - CHANGE ZONE TO FEATURE 256 | def pps_heatmap(df, feature): 257 | pps_tab=pd.crosstab(df.team_name, df[feature], values=df.pps, aggfunc='mean',margins=False).fillna(0) 258 | 259 | team_heatmap = go.Heatmap(z=[np.array((pps_tab[pps_tab.index==pps_tab.index[i]])) for i in range(len(pps_tab.index))], 260 | x=pps_tab.columns, 261 | y= [team.split(' ')[-1] for team in pps_tab.index] 262 | ) 263 | 264 | layout = go.Layout( 265 | title='Points Per Shot Heatmap', 266 | xaxis = dict(ticks='', nticks=len(pps_tab.columns)), 267 | yaxis = dict(ticks='', nticks=len(pps_tab.index)), 268 | ) 269 | 270 | fig = 
go.Figure(data=[team_heatmap], layout=layout) 271 | plotly.offline.iplot(fig, filename='labelled-heatmap') 272 | 273 | #############################--PIE CHART--############################# 274 | def feature_pie_charts(feature): 275 | labels = df[feature].unique() 276 | values = df[feature].value_counts() 277 | colors = ['#FEBFB3', '#E1396C', '#005eff', '#D0F9B1'] 278 | 279 | trace = go.Pie(labels=labels, values=values, 280 | hoverinfo='label+percent', textinfo='value+percent', 281 | textfont=dict(size=20), 282 | marker=dict(colors=colors, 283 | line=dict(color='#000000', width=1))) 284 | 285 | plotly.offline.iplot([trace], filename='styled_pie_chart') 286 | 287 | ##########################--SHOT FREQ HEATMAP--######################### 288 | def shot_freq_heatmap(name): 289 | player = df[df.name==name] 290 | 291 | x_make = player[player.shot_made_flag == 1]['x'] 292 | y_make = player[player.shot_made_flag == 1]['y'] 293 | x_miss = player[player.shot_made_flag == 0]['x'] 294 | y_miss = player[player.shot_made_flag == 0]['y'] 295 | 296 | x = np.concatenate([x_make, x_miss]) 297 | y = np.concatenate([y_make, y_miss]) 298 | 299 | makes = go.Scatter( 300 | x=x_make, 301 | y=y_make, 302 | mode='markers', 303 | name='Make', 304 | showlegend=True, 305 | marker=dict( 306 | symbol='circle', 307 | opacity=0.7, 308 | color='green', 309 | size=4, 310 | line=dict(width=1), 311 | ) 312 | ) 313 | misses = go.Scatter( 314 | x=x_miss, 315 | y=y_miss, 316 | mode='markers', 317 | name='Miss', 318 | showlegend=True, 319 | marker=dict( 320 | symbol='x', 321 | opacity=0.7, 322 | color='yellow', 323 | size=4, 324 | line=dict(width=1), 325 | ) 326 | ) 327 | trace3 = go.Histogram2d( 328 | x=x, 329 | y=y, 330 | zmax=40, 331 | zmin=0, 332 | # nbinsx=20, 333 | # nbinsy=20, 334 | zsmooth='best', 335 | autobinx=True, 336 | autobiny=True, 337 | reversescale=False, 338 | opacity=.75, 339 | #zauto=True, 340 | #autocolorscale=True, 341 | ) 342 | 343 | layout = go.Layout( 344 | xaxis=dict( ticks='', 
showgrid=False, zeroline=False, nticks=20, range=[-250,250]), 345 | yaxis=dict( ticks='', showgrid=False, zeroline=False, nticks=20, range=[-47.5,450]), 346 | autosize=False, 347 | height=600, 348 | width=750, 349 | hovermode='closest', 350 | shapes= court_shapes, 351 | title= name + ' - Shot Frequency', 352 | showlegend=True, 353 | legend=dict(x=1.2, y=1), 354 | ) 355 | 356 | data = [trace3, makes, misses] 357 | fig = go.Figure(data=data, layout=layout) 358 | 359 | plotly.offline.iplot(fig) 360 | -------------------------------------------------------------------------------- /shot_chart_viz.py: -------------------------------------------------------------------------------- 1 | ############################### IMPORTS ############################### 2 | if True: 3 | import requests, time, itertools, math, shutil, matplotlib 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | # %matplotlib inline 7 | import seaborn as sns 8 | import numpy as np 9 | 10 | from court import court_shapes 11 | 12 | pd.set_option('display.max_columns',40) 13 | import warnings 14 | warnings.filterwarnings('ignore') 15 | 16 | import ipywidgets as widgets 17 | from ipywidgets import interact 18 | 19 | import plotly 20 | import plotly.plotly as py 21 | import plotly.graph_objs as go 22 | plotly.offline.init_notebook_mode(connected=True) 23 | 24 | #####READ DATAFRAME##### 25 | df = pd.read_csv('final_df_1415.csv',index_col=0) 26 | 27 | #####DRAW PLAYER SHOT CHART (PLOTLY)##### 28 | def draw_shot_chart(name): 29 | player = df[df.name==name] 30 | 31 | missed_shot_trace = go.Scattergl( 32 | x = player[player.shot_made_flag == 0]['x'], 33 | y = player[player.shot_made_flag == 0]['y'], 34 | mode = 'markers', 35 | name = 'Miss', 36 | marker={'color':'blue', 'size':5} 37 | ) 38 | made_shot_trace = go.Scattergl( 39 | x = player[player.shot_made_flag == 1]['x'], 40 | y = player[player.shot_made_flag == 1]['y'], 41 | mode = 'markers', 42 | name='Make', 43 | marker={'color':'red', 'size':5} 
def draw_team_sc(team):
    """Plot every shot of ``team`` on the court as an interactive scatter.

    Misses are drawn in blue and makes in red, on top of the shared
    ``court_shapes`` outline.  Data comes from the module-level ``df``.

    Args:
        team: Value of ``df.team_name`` selecting the team to draw.
    """
    team_df = df[df.team_name==team]

    def shot_trace(flag, label, colour):
        # One WebGL scatter trace for the shots whose made-flag equals `flag`.
        subset = team_df[team_df['shot_made_flag'] == flag]
        return go.Scattergl(
            x = subset['x'],
            y = subset['y'],
            mode = 'markers',
            name = label,
            marker={'color':colour, 'size':5}
        )

    # Misses first, then makes, so makes are painted on top.
    traces = [shot_trace(0, 'Miss', 'blue'), shot_trace(1, 'Make', 'red')]

    chart_layout = go.Layout(
        title= team + ' Shot Chart 2014-2015',
        showlegend =True,
        xaxis={'showgrid':False, 'range':[-300,300]},
        yaxis={'showgrid':False, 'range':[-100,500]},
        height = 600,
        width = 650,
        shapes=court_shapes)

    figure = go.Figure(data=traces, layout=chart_layout)
    plotly.offline.iplot(figure, filename = team + ' Shot Chart')
max=60, 112 | step=5, 113 | description='Bubble Size:', 114 | disabled=False, 115 | ) 116 | 117 | interact(freq_shooting_plot, player_name=player_dropdown, gridNum=grid_slider); 118 | 119 | #####DRAW COURT MATPLOTLIB##### 120 | def draw_court(ax=None, color='black', lw=2, outer_lines=False): 121 | from matplotlib.patches import Circle, Rectangle, Arc 122 | if ax is None: 123 | ax = plt.gca() 124 | hoop = Circle((0, 0), radius=7.5, linewidth=lw, color=color, fill=False) 125 | backboard = Rectangle((-30, -7.5), 60, -1, linewidth=lw, color=color) 126 | outer_box = Rectangle((-80, -47.5), 160, 190, linewidth=lw, color=color, 127 | fill=False) 128 | inner_box = Rectangle((-60, -47.5), 120, 190, linewidth=lw, color=color, 129 | fill=False) 130 | top_free_throw = Arc((0, 142.5), 120, 120, theta1=0, theta2=180, 131 | linewidth=lw, color=color, fill=False) 132 | bottom_free_throw = Arc((0, 142.5), 120, 120, theta1=180, theta2=0, 133 | linewidth=lw, color=color, linestyle='dashed') 134 | restricted = Arc((0, 0), 80, 80, theta1=0, theta2=180, linewidth=lw, 135 | color=color) 136 | corner_three_a = Rectangle((-220, -47.5), 0, 140, linewidth=lw, 137 | color=color) 138 | corner_three_b = Rectangle((220, -47.5), 0, 140, linewidth=lw, color=color) 139 | three_arc = Arc((0, 0), 475, 475, theta1=22, theta2=158, linewidth=lw, 140 | color=color) 141 | center_outer_arc = Arc((0, 422.5), 120, 120, theta1=180, theta2=0, 142 | linewidth=lw, color=color) 143 | center_inner_arc = Arc((0, 422.5), 40, 40, theta1=180, theta2=0, 144 | linewidth=lw, color=color) 145 | court_elements = [hoop, backboard, outer_box, inner_box, top_free_throw, 146 | bottom_free_throw, restricted, corner_three_a, 147 | corner_three_b, three_arc, center_outer_arc, 148 | center_inner_arc] 149 | if outer_lines: 150 | outer_lines = Rectangle((-250, -47.5), 500, 470, linewidth=lw, 151 | color=color, fill=False) 152 | court_elements.append(outer_lines) 153 | 154 | for element in court_elements: 155 | ax.add_patch(element) 
def acquire_playerPic(player_id, zoom, offset=(-165,400)):
    """Download a player's headshot and wrap it as a matplotlib annotation.

    The image is fetched from stats.nba.com, cached under
    ``scraped_images/player_images/<id>.png`` and returned as an
    ``AnnotationBbox`` ready for ``ax.add_artist``.

    Args:
        player_id: Series of player ids; only the first unique value is used.
        zoom: Scale factor passed to ``OffsetImage``.
        offset: Data coordinates at which the image is anchored.

    Returns:
        matplotlib.offsetbox.AnnotationBbox containing the player image.

    Raises:
        requests.HTTPError: If the image request returns an error status.
    """
    import os
    from matplotlib import offsetbox as osb

    ID = str(player_id.unique()[0])

    url = "http://stats.nba.com/media/players/230x185/"+ ID +".png"
    image_dir = 'scraped_images/player_images'
    image_path = image_dir + '/' + ID + '.png'

    # The cache directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(image_dir, exist_ok=True)

    # Close the connection when done and never hang forever on a stalled
    # server.
    with requests.get(url, stream=True, timeout=30) as pic:
        # Fail here with a clear HTTP error instead of writing an error page
        # to disk and letting plt.imread fail on it later.
        pic.raise_for_status()
        # .raw bypasses content decoding unless asked; without this a
        # gzip/deflate-encoded response would be written still compressed.
        pic.raw.decode_content = True
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(pic.raw, out_file)

    player_pic = plt.imread(image_path)
    img = osb.OffsetImage(player_pic, zoom)
    img = osb.AnnotationBbox(img, offset,xycoords='data',pad=0.0, box_alignment=(1,0), frameon=False)

    return img
def get_season_stats(player_name):
    """Compute headline shooting stats for one player from the global ``df``.

    Args:
        player_name: Value of ``df.name`` selecting the player.

    Returns:
        A ``(stats, printout)`` tuple.  ``stats`` is a dict with the keys
        NUM_GAMES, FG_PCT, THREE_PT_PCT, EFFECTIVE_FG_PCT, POINTS_PER_SHOT
        and AVG_SHOT_DISTANCE (in that order); ``printout`` is the same
        information formatted as a multi-line string for the chart.
    """
    player = df[df.name==player_name]
    made = player.shot_made_flag

    stats = {}

    stats['NUM_GAMES'] = len(player.game_date.unique())

    # Select the column BEFORE aggregating: averaging the whole frame raises
    # on the non-numeric columns (names, dates, ...) in pandas >= 2.0.
    # As before, the per-season percentages are summed.
    stats['FG_PCT'] = player.groupby(by=['season']).shot_made_flag.mean().sum()
    stats['THREE_PT_PCT'] = player[player.shot_type==3].groupby(by=['season']).shot_made_flag.mean().sum()

    # Pick makes by shot value explicitly instead of by group position, so a
    # player who only attempted twos (or only threes) neither raises
    # IndexError nor has threes counted as twos.
    twos = made[player.shot_type==2].sum()
    # A made three is worth 1.5 made twos in effective FG%.
    threes = made[player.shot_type==3].sum() * 1.5
    stats['EFFECTIVE_FG_PCT'] = (twos+threes)/player.shape[0]

    stats['POINTS_PER_SHOT'] = round(player.pps.mean(),3)
    stats['AVG_SHOT_DISTANCE'] = round(player.shot_distance.mean())

    # The placeholders are filled in the insertion order of `stats`.
    printout = """Games: {}\nFG: {:4.1%}\n3PT: {:4.1%}\nEFG: {:4.1%}\nPoints per Shot: {}\nAvg Shot Dist.: {} ft.""".format(*stats.values())

    return stats, printout
def team_freq_plot(team, gridNum=25):
    """Draw a matplotlib shot-frequency chart for one team.

    Filters the module-level ``df`` to ``team``, bins the shots with
    ``find_shootingPcts`` and draws one circle per bin on a court: the
    circle's radius comes from the bin's shot count and its colour from the
    bin's field-goal percentage.  The team logo, a colour bar and the text
    from ``get_team_stats`` are added before the figure is shown.

    Args:
        team: Value of ``df.team_name`` selecting the team.
        gridNum: Grid size forwarded to ``find_shootingPcts``; also sets the
            maximum circle radius (``240/gridNum``).
    """
    plot_size=(12,8)
    team_df = df[df.team_name==team]

    from matplotlib.patches import Circle

    #compute shooting percentage and # of shots
    (ShootingPctLocs, shotNumber) = find_shootingPcts(team_df, gridNum)

    #draw figure and court
    fig = plt.figure(figsize=plot_size)
    cmap = mymap #my modified colormap
    ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
    # draw_court is called without ax, so it draws on the current axes (the
    # one created on the line above).
    draw_court(outer_lines=False)
    plt.xlim(-250,250)
    # y-limits are given high-to-low, which flips the y axis.
    plt.ylim(400, -25)

    #draw team image
    # Team acronym = the `htm` value on the first row where is_home == 1.
    team_ac = team_df.htm[team_df.is_home==1].unique()[0]
    zoom = 1 #np.float(plot_size[0])/(8.0)
    img = get_team_logo(team_ac, zoom)
    ax.add_artist(img)

    #draw circles
    for i, shots in enumerate(ShootingPctLocs):
        # Centre = bin position, radius = shots in the bin, colour = FG% of the bin.
        restricted = Circle(shotNumber.get_offsets()[i], radius=shotNumber.get_array()[i],
                            color=cmap(shots),alpha=.95, fill=True)
        # Cap the radius at 240/gridNum so busy bins cannot grow without bound.
        if restricted.radius > 240/gridNum: restricted.radius=240/gridNum
        ax.add_patch(restricted)

    #draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2,cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %')
    cb.set_ticks([0.0, 0.25, 0.5, 0.75, 1.0])
    cb.set_ticklabels(['0%','25%', '50%','75%', '100%'])

    ax.set_title(team_df.team_name.unique()[0] +' - Shot Chart 2014-15')
    #plot season stats
    # get_team_stats returns (stats, printout); only the printout text is drawn.
    ax.text(150,395,get_team_stats(team)[1])
    plt.show()
GridSearchCVProgressBar 27 | 28 | ############################## LOAD DATA ############################## 29 | if False: 30 | df = pd.read_csv('data/final_df.csv', index_col=0) 31 | 32 | X = df.drop(columns=['name', 'age', 'pos','player_id','team_id', 'opp_id', 'team_name', 'game_date', 'opponent', 'defender_name', 'game_id', 'action_type', 'season', 'htm', 'vtm', 'game_event_id', 'minutes_remaining', 'seconds_remaining', 33 | 'defender_id', 'shot_type', 'Heave', 'heave_pct', 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'Above Break 3', 'Corner 3', 'Mid Range', 'Paint', 'Restricted Area', 'C', 'L', 'R', 'dribbles', 'shot_distance', 'shot_made_flag']) 34 | y = np.array(df.shot_made_flag) 35 | 36 | X_col_names = X.columns 37 | with open('./X_y_arrays/X_column_names', 'wb') as x_col: 38 | pickle.dump(X_col_names, x_col) 39 | 40 | minmax_scale = MinMaxScaler() 41 | X = minmax_scale.fit_transform(X) 42 | 43 | np.save('./X_y_arrays/X_shallow', X) 44 | np.save('./X_y_arrays/y_shallow', y) 45 | 46 | #new data 47 | if True: 48 | df = pd.read_csv('final_df_1415.csv', index_col=0) 49 | df[['zone_id', 'period']] = df[['zone_id', 'period']].astype('category') 50 | 51 | X = df.drop(columns=['name', 'team_name', 'game_date', 'season', 'team_id','minutes_remaining', 'seconds_remaining', 'shot_made_flag', 'shot_type', 'opponent', 'x', 'y', 'defender_name', 'opp_id', 'game_id', 'game_event_id', 52 | 'player_id', 'shot_zone_basic', 'shot_zone_area', 'shot_zone_range', 'htm', 'vtm', 'pos', 'age', 'defender_id', 'zone', 'pps', 'zone_id', 'zone_minus_lg_avg', 'lg_zone_avg', 53 | 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'dribbles', 'period', 'action_type', 'ts%', 'dbpm', '3par', 'usg%', 'blk_pct', 'def_rating']) 54 | y = np.array(df.shot_made_flag) 55 | 56 | X_col_names = X.columns 57 | with open('./X_y_arrays/X_column_names', 'wb') as x_col: 58 | pickle.dump(X_col_names, x_col) 59 | 60 | minmax_scale = MinMaxScaler() 61 | X = minmax_scale.fit_transform(X) 
def build_model(model, path, X_train, X_test, y_train, y_test, decision_function=True):
    """Fit ``model``, score it on the test split and pickle it to disk.

    Args:
        model: Unfitted estimator exposing ``fit``/``predict`` and either
            ``decision_function`` or ``predict_proba``.
        path: Sub-directory of ``./models/`` (also used as the file prefix)
            where the fitted model is pickled.
        X_train, X_test, y_train, y_test: Train/test split.
        decision_function: When true, scores come from
            ``decision_function``; otherwise from the second column of
            ``predict_proba``.

    Returns:
        ``(fitted model, test predictions, test scores, fpr, tpr)``.
    """
    started_at = time.time()

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    if decision_function == True:
        scores = model.decision_function(X_test)
    else:
        scores = model.predict_proba(X_test)[:, 1]

    # The thresholds returned by roc_curve are not needed here.
    fpr, tpr, _ = roc_curve(y_test, scores)

    # Persist the fitted model under a timestamped name.
    timestamp = time.asctime().replace(' ', '_')
    model_file = './models/' + path + '/' + str(path) + '_' + timestamp
    with open(model_file, 'wb') as handle:
        pickle.dump(model, handle)

    print('Total Runtime: {} seconds'.format(time.time() - started_at))
    return model, predictions, scores, fpr, tpr
def print_model_metrics(y_pred, y_score, path):
    """Plot the confusion matrix, then print and save test-set metrics.

    Compares against the module-level ``y_test``.

    Args:
        y_pred: Predicted class labels for the test set.
        y_score: Continuous scores for the test set (used for ROC/AUC).
        path: Sub-directory of ``./models/`` receiving the outputs.
    """
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), path,
                          title='Confusion matrix', cmap=plt.cm.Blues)

    # Compute every metric before anything is printed.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    false_pos, true_pos, _ = roc_curve(y_test, y_score)
    auc_ = auc(false_pos, true_pos)

    report = [
        ('Accuracy: {}', accuracy),
        ('Precision: {}', precision),
        ('Recall: {}', recall),
        ('F1 {}', f1),
        ('AUC: {}', auc_),
    ]
    for template, value in report:
        print(template.format(round(value, 4)))

    # Save the unrounded values in the same order as printed.
    out_file = './models/' + path + '/metrics/' + time.asctime().replace(' ', '_')
    np.save(out_file, np.array([value for _, value in report]))
def run_grid_search(model, path, param_grid, X, y, cv=3):
    """Run a cross-validated grid search scored by ROC AUC and save the results.

    Args:
        model: Estimator to tune.
        path: Prefix for the results file written under
            ``./grid_search_results/``.
        param_grid: Parameter grid passed to the search.
        X, y: Features and labels to search over.
        cv: Cross-validation setting forwarded to the search.

    Returns:
        ``(search_results, best_score, best_params)`` where
        ``search_results`` is ``cv_results_`` as a DataFrame and
        ``best_score`` is the best mean cross-validated ROC AUC.
    """
    start = time.time()

    search = GridSearchCVProgressBar(model, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
    search.fit(X,y)

    # Fixed-point formatting: the old '{:.4}' spec means 4 significant digits
    # and printed long runs in scientific notation (e.g. '4.095e+03 seconds').
    print("Total Runtime for Grid Search: {:.2f} seconds".format(time.time() - start))

    best_score = search.best_score_
    best_params = search.best_params_

    # scoring='roc_auc', so this is a cross-validated AUC, not a test accuracy.
    print("Best Cross-Validated ROC AUC: {:.4}%".format(best_score * 100))
    print("\nOptimal Parameters: {}".format(best_params))

    search_results = pd.DataFrame.from_dict(search.cv_results_)

    search_results.to_csv('./grid_search_results/'+ path + '_' + str(round(best_score,4)).replace('.','') + '_' + time.asctime().replace(' ', '_'))

    return search_results, best_score, best_params
{'n_estimators':[50, 100, 250], 215 | 'learning_rate':[.01, .05, .1, 1], 216 | 'min_samples_leaf':[2, 5, 10], 217 | 'min_samples_split':[2, 5, 10], 218 | 'max_depth':[2, 5, 10] 219 | } 220 | 221 | xgb_param_grid = {'learning_rate':[.01, .05, .1, 1], 222 | 'n_estimators':[100, 250], 223 | 'max_depth':[2, 5, 10], 224 | 'min_child_weight': [1, 5, 10], 225 | 'gamma': [0.5, 1, 2], 226 | } 227 | 228 | ###################################################################### 229 | 230 | 231 | ######################## LOGISTIC REGRESSION ######################### 232 | if True: 233 | # log_reg, log_y_preds, log_y_score, log_fpr, log_tpr = build_model(LogisticRegression(C=1, class_weight='balanced'), 234 | # 'logreg', X_train, X_test, y_train, y_test) 235 | # 236 | # print_model_metrics(log_y_preds, log_y_score, 'logreg') 237 | # plot_roc_curve(log_fpr, log_tpr, 'logreg') 238 | 239 | log_reg_search_results, log_reg_best_score, log_reg_best_params = run_grid_search(LogisticRegression(random_state=23),'logreg', log_reg_param_grid, X, y, cv=10) 240 | ###################################################################### 241 | 242 | 243 | ###################### RANDOM FOREST CLASSIFIER ###################### 244 | if False: 245 | rf, rf_y_preds, rf_y_score, rf_fpr, rf_tpr = build_model(RandomForestClassifier(n_estimators=500, criterion='gini', min_samples_leaf=10, min_samples_split=10, verbose=.5, class_weight='balanced', n_jobs=-1, random_state=23), 246 | 'rf', X_train, X_test, y_train, y_test, decision_function=False) 247 | 248 | print_model_metrics(rf_y_preds, rf_y_score, 'rf') 249 | plot_roc_curve(rf_fpr, rf_tpr, 'rf') 250 | plot_feature_importances(rf, 'rf') 251 | 252 | # rf_search_results, rf_best_score, rf_best_params = run_grid_search(RandomForestClassifier(random_state=23),'rf', rf_param_grid, X, y, cv=3) 253 | 254 | # [ParallelProgressBar(n_jobs=-1)]: Done 108 out of 108 | elapsed: 67.1min finished 255 | # Total Runtime for Grid Search: 4.095e+03 seconds 256 | # Testing 
Accuracy: 70.82% 257 | # 258 | # Optimal Parameters: {'criterion': 'gini', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 250, 'n_jobs': -1} 259 | ###################################################################### 260 | 261 | 262 | #################### GRADIENT BOOSTING CLASSIFIER #################### 263 | if False: 264 | gb, gb_y_preds, gb_y_score, gb_fpr, gb_tpr = build_model(GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=5, min_samples_leaf=7, min_samples_split=7, verbose=1, random_state=23), 265 | 'gb', X_train, X_test, y_train, y_test) 266 | 267 | print_model_metrics(gb_y_preds, gb_y_score, 'gb') 268 | plot_roc_curve(gb_fpr, gb_tpr, 'gb') 269 | plot_feature_importances(gb, 'gb') 270 | ###################################################################### 271 | 272 | 273 | ######################### ADABOOST CLASSIFIER ######################### 274 | if False: 275 | ada, ada_y_preds, ada_y_score, ada_fpr, ada_tpr = build_model(AdaBoostClassifier(learning_rate=.01, n_estimators=500, algorithm='SAMME.R', random_state=23), 276 | 'ada', X_train, X_test, y_train, y_test) 277 | 278 | print_model_metrics(ada_y_preds, ada_y_score, 'ada') 279 | plot_roc_curve(ada_fpr, ada_tpr, 'ada') 280 | plot_feature_importances(ada, 'ada') 281 | ###################################################################### 282 | 283 | 284 | ######################### XGBOOST CLASSIFIER ######################### 285 | if False: 286 | xgb, xgb_y_preds, xgb_y_score, xgb_fpr, xgb_tpr = build_model(XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=5, min_child_weight=1, gamma=1, algorithm='SAMME.R', objective='binary:logistic', reg_alpha=0, reg_lambda=0, n_jobs=-1, random_state=23), 287 | 'xgb', X_train, X_test, y_train, y_test, decision_function=False) 288 | 289 | print_model_metrics(xgb_y_preds, xgb_y_score, 'xgb') 290 | plot_roc_curve(xgb_fpr, xgb_tpr, 'xgb') 291 | plot_feature_importances(xgb, 'xgb') 292 | 293 | # xgb_search_results, 
xgb_best_score, xgb_best_params = run_grid_search(XGBClassifier(random_state=23),'xgb', xgb_param_grid, X_train, y_train) 294 | 295 | # Testing Accuracy: 72.23% 296 | # 297 | # Optimal Parameters: {'gamma': 1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 250} 298 | ###################################################################### 299 | 300 | 301 | ######################## STACKED ENSEMBLE MODEL ###################### 302 | def create_ensemble_model(X,y): 303 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23, test_size=.2) 304 | 305 | rf = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='sqrt', min_samples_leaf=10, min_samples_split=2, verbose=1, class_weight='balanced', n_jobs=-1, random_state=23) 306 | 307 | xgb = XGBClassifier(learning_rate=0.1, n_estimators=250, max_depth=5, min_child_weight=1, gamma=1, algorithm='SAMME.R', objective='binary:logistic', n_jobs=-1, random_state=23) 308 | 309 | ada = AdaBoostClassifier(learning_rate=.75, n_estimators=500, algorithm='SAMME.R', random_state=23) 310 | 311 | rf.fit(X_train, y_train) 312 | rf_train_preds = pd.DataFrame(rf.predict_proba(X_train)) 313 | rf_test_preds = pd.DataFrame(rf.predict_proba(X_test)) 314 | 315 | xgb.fit(X_train, y_train) 316 | xgb_train_preds = pd.DataFrame(xgb.predict_proba(X_train)) 317 | xgb_test_preds = pd.DataFrame(xgb.predict_proba(X_test)) 318 | 319 | ada.fit(X_train, y_train) 320 | ada_train_preds = pd.DataFrame(ada.predict_proba(X_train)) 321 | ada_test_preds = pd.DataFrame(ada.predict_proba(X_test)) 322 | 323 | train_df = pd.concat([rf_train_preds, xgb_train_preds, ada_train_preds], names=['rf','xgb','ada'], axis=1) 324 | test_df = pd.concat([rf_test_preds, xgb_test_preds, ada_test_preds], names=['rf','xgb','ada'], axis=1) 325 | 326 | model = LogisticRegression(random_state=1) 327 | model.fit(train_df,y_train) 328 | y_preds = model.predict(test_df) 329 | # y_score = model.score(y_preds, y_test) 330 | 
def create_team_ids(df):
    """Add integer ``team_id`` and ``opp_id`` columns to ``df`` in place.

    Teams are numbered from 1 in alphabetical order of ``team_name``, so the
    ids are identical on every run (iterating a ``set`` of strings, as
    before, gives an order that can change between interpreter runs).

    Args:
        df: DataFrame with ``team_name`` and ``opponent`` columns.  It is
            modified in place; nothing is returned.

    Rows whose team or opponent is not one of the names in ``team_name``
    (including missing values) get id 0.
    """
    team_names = sorted(df.team_name.dropna().unique())
    team_id_dict = {team: id_ for id_, team in enumerate(team_names, start=1)}

    # One vectorised lookup per column replaces a full-column np.where pass
    # per team, and no longer needs a pre-existing 'team_id' column.
    df['team_id'] = df.team_name.map(team_id_dict).fillna(0).astype(int)
    df['opp_id'] = df.opponent.map(team_id_dict).fillna(0).astype(int)
#Function to determine if the shooter is playing at home
def get_home_team():
    """Flag each row of the module-level ``merged_df`` as a home or away shot.

    A shot counts as a home shot when the acronym mapped from the row's
    ``team_name`` (via ``create_home_acronym_dict``) equals the row's ``HTM``
    value.

    Returns
    -------
    list of int
        One entry per row of ``merged_df``, in row order: 1 for home, 0 for
        away.

    Raises
    ------
    KeyError
        If a ``team_name`` has no entry in the acronym dictionary (the
        row-by-row dictionary lookup this replaces failed the same way).
    """
    start = time.time()

    team_name_ac_dict = create_home_acronym_dict()

    # Vectorised lookup and comparison replaces a Python-level iterrows loop.
    shooter_acronyms = merged_df.team_name.map(team_name_ac_dict)
    unmapped = merged_df.team_name[shooter_acronyms.isnull()].unique()
    if len(unmapped) > 0:
        raise KeyError('No home acronym for team(s): {}'.format(list(unmapped)))

    is_home_arr = (shooter_acronyms == merged_df.HTM).astype(int).tolist()
    print('Runtime: {} seconds.'.format(round(time.time()-start, 2)))
    return is_home_arr
#Function to calculate whether player is hot, i.e. whether they have hit 1, 2, or 3 previous shots
def is_player_hot(df):
    """Compute made-shot streak flags for every row of ``df``.

    Column ``k-1`` of the result is 1.0 when each of the ``k`` rows directly
    above the current row has the same ``name`` and the same ``GAME_ID`` as
    the current row and has ``shot_made_flag == 1``; otherwise it is 0.0.
    Rows are compared by position, so ``df`` is expected to be ordered so
    that one player's shots within a game are adjacent and chronological.

    This fixes three defects of the previous row-by-row version:
    the third-previous row's player was never checked, the game was only
    checked for the immediately previous row, and a two-shot streak was
    reported as a one-shot streak whenever the row three positions back was a
    made shot (even by someone else).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``name``, ``GAME_ID`` and ``shot_made_flag`` columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), 3)``: previous 1, 2 and 3 shots made.
    """
    start = time.time()

    names = df.name.to_numpy()
    games = df.GAME_ID.to_numpy()
    made = df.shot_made_flag.to_numpy() == 1

    n_rows = len(df)
    #array that stores whether previous 1, 2, or 3 shots were made, respectively
    heat_check_array = np.zeros((n_rows, 3))

    # streak stays True only while every look-back so far has qualified
    streak = np.ones(n_rows, dtype=bool)
    for k in (1, 2, 3):
        prev_hit = np.zeros(n_rows, dtype=bool)
        # the first k rows have no k-th previous row and stay False
        prev_hit[k:] = (names[k:] == names[:-k]) & (games[k:] == games[:-k]) & made[:-k]
        streak &= prev_hit
        heat_check_array[:, k-1] = streak

    print('Runtime: {} seconds.'.format(round(time.time()-start, 2)))

    return heat_check_array
# Clean up name discrepancies between two dfs
def clean_defender_names(df):
    """Rewrite known defender-name variants in ``df.defender_name`` in place.

    Only exact, whole-value matches are replaced; every other value
    (including missing values) is left untouched. The column is reassigned
    as a whole rather than through chained indexing
    (``df.col[mask] = value``), which pandas does not guarantee to write
    back to ``df`` and which is a no-op under copy-on-write.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``defender_name`` column.

    Returns
    -------
    None
    """
    name_fixes = {
        'Jose Juan Barea': 'JJ Barea',
        'Tim Hardaway Jr': 'Tim Hardaway',
        'Charles Hayes': 'Chuck Hayes',
        'Glen Rice Jr': 'Glen Rice',
        'Louis Williams': 'Lou Williams',
    }
    df['defender_name'] = df['defender_name'].replace(name_fixes)
def reduce_action_types(df):
    """Collapse detailed shot descriptions into a small set of categories.

    ``df.action_type`` is lower-cased in place, then each value is mapped to
    the first category whose keyword it contains, checked in this order:
    dunk, layup, driving/running, pullup, fadeaway/turnaround/step back,
    hook, jump. Values matching no keyword are returned lower-cased but
    otherwise unchanged; non-string values (e.g. NaN) are passed through.

    Iterates the column directly instead of using ``Series.iteritems()``,
    which no longer exists in pandas 2.x.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string ``action_type`` column.

    Returns
    -------
    list
        One reduced action type per row, in row order.
    """
    # Order matters: the first matching rule wins.
    rules = (
        (('dunk',), 'dunk'),
        (('layup',), 'layup'),
        (('driving', 'running'), 'driving_running'),
        (('pullup',), 'pullup'),
        (('fadeaway', 'turnaround', 'step back'), 'fade_turn_step'),
        (('hook',), 'hook_shot'),
        (('jump',), 'jump_shot'),
    )

    df['action_type'] = df.action_type.str.lower()
    new_action_types = []
    for action in df.action_type:
        reduced = action
        if isinstance(action, str):
            for keywords, category in rules:
                if any(keyword in action for keyword in keywords):
                    reduced = category
                    break
        new_action_types.append(reduced)
    return new_action_types
def add_zone_to_zone_ids(zone_ids):
    """Return ``zone_ids`` with an extra ``zone`` label column.

    The label is built from ``"<shot_zone_area>_<shot_zone_basic>"``: only
    the text after the opening parenthesis is kept, spaces become
    underscores and closing parentheses are dropped, e.g. ``Center(C)`` +
    ``Restricted Area`` -> ``C_Restricted_Area``. Labels are joined back onto
    the input through ``zone_id``.
    """
    def _zone_label(area, basic):
        # keep what follows '(' (the area abbreviation plus the basic zone)
        after_paren = '_'.join([area, basic]).split('(')[1]
        return after_paren.replace(' ', '_').replace(')', '')

    labelled = [
        (_zone_label(area, basic), zone_id)
        for area, basic, zone_id in zip(zone_ids.shot_zone_area,
                                        zone_ids.shot_zone_basic,
                                        zone_ids.zone_id)
    ]
    label_df = pd.DataFrame(labelled, columns=['zone', 'zone_id'])
    return zone_ids.merge(label_df, on='zone_id')
Iterations remaining: {}.'.format(round(time.time()-start,2), len(df_slice)-index)) 359 | return zone_avg 360 | zone_avgs = get_zone_avg(sorted_df) 361 | sorted_df['zone_avg']=zone_avgs 362 | 363 | #add league avg for each zone 364 | sorted_df = sorted_df.merge(sorted_df.groupby('zone').mean().zone_avg.reset_index().rename(columns={'zone_avg': 'lg_zone_avg'}), on='zone') 365 | #add fg% relative to lg avg for each zone 366 | sorted_df['zone_minus_lg_avg'] = sorted_df.zone_avg-sorted_df.lg_zone_avg 367 | 368 | sorted_df.to_csv('final_df_1415.csv') 369 | zone_fg_pct.to_csv('data/zone_fg_pct.csv') 370 | zone_ids.to_csv('data/zone_ids.csv') 371 | 372 | ###################################################################### 373 | #rearrange columns for better visability 374 | # clean = sorted_df[['name','pos','age','player_id', 'team_name', 'team_id', 'game_date', 375 | # 'game_id', 'game_event_id','season', 'period', 376 | # 'minutes_remaining', 'seconds_remaining', 'shot_made_flag', 377 | # 'action_type', 'shot_zone_basic', 'shot_zone_area', 'shot_zone_range', 378 | # 'shot_type', 'shot_distance', 'x', 'y', 'dribbles', 'touch_time', 379 | # 'opponent', 'opp_id', 'defender_name', 'defender_distance', 'shot_clock', 'htm', 'vtm', 380 | # 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'ts%', '3par', 'usg%']] 381 | -------------------------------------------------------------------------------- /presentation.py: -------------------------------------------------------------------------------- 1 | ############################### IMPORTS ############################### 2 | if True: 3 | import itertools, math, time, re, pickle 4 | 5 | import plotly 6 | import plotly.plotly as py 7 | import plotly.graph_objs as go 8 | plotly.offline.init_notebook_mode(connected=True) 9 | 10 | import matplotlib 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | import numpy as np 14 | import pandas as pd 15 | pd.set_option('display.max_columns',100) 16 | 17 | import 
ipywidgets as widgets 18 | from ipywidgets import interact 19 | 20 | import warnings 21 | warnings.filterwarnings('ignore') 22 | 23 | from court import court_shapes 24 | 25 | from shot_chart_viz import acquire_playerPic, get_team_logo, get_season_stats, get_team_stats, draw_court 26 | 27 | cdict = { 28 | 'blue': [(0.0, 0.6313725709915161, 0.6313725709915161), (0.25, 0.4470588266849518, 0.4470588266849518), (0.5, 0.29019609093666077, 0.29019609093666077), (0.75, 0.11372549086809158, 0.11372549086809158), (1.0, 0.05098039284348488, 0.05098039284348488)], 29 | 'green': [(0.0, 0.7333333492279053, 0.7333333492279053), (0.25, 0.572549045085907, 0.572549045085907), (0.5, 0.4156862795352936, 0.4156862795352936), (0.75, 0.0941176488995552, 0.0941176488995552), (1.0, 0.0, 0.0)], 30 | 'red': [(0.0, 0.9882352948188782, 0.9882352948188782), (0.25, 0.9882352948188782, 0.9882352948188782), (0.5, 0.9843137264251709, 0.9843137264251709), (0.75, 0.7960784435272217, 0.7960784435272217), (1.0, 0.40392157435417175, 0.40392157435417175)]} 31 | mymap = matplotlib.colors.LinearSegmentedColormap('my_colormap', cdict, 1024) 32 | ############################## LOAD DATA ############################## 33 | df = pd.read_csv('final_df_1415.csv', index_col=0) 34 | 35 | ###################################################################### 36 | ###########################--SHOT CHARTS--############################ 37 | ###################################################################### 38 | 39 | ########################--BUBBLE SHOT CHARTS--######################## 40 | def find_shootingPcts(shot_df, gridNum): 41 | x2 = shot_df.x[(shot_df['y']<425.1) & (shot_df.shot_type==2)] 42 | y2 = shot_df.y[(shot_df['y']<425.1) & (shot_df.shot_type==2)] 43 | 44 | x2_made = shot_df.x[(shot_df['shot_made_flag']==1) & (shot_df['y']<425.1) & (shot_df.shot_type==2)] 45 | y2_made = shot_df.y[(shot_df['shot_made_flag']==1) & (shot_df['y']<425.1) & (shot_df.shot_type==2)] 46 | 47 | #compute number of shots made 
def freq_shooting_plot(player_name, gridNum=25):
    """Draw a bubble shot chart for one player and print shot recommendations.

    Each hexbin location gets a circle whose radius is the number of shots
    taken there (capped at ``240/gridNum``) and whose colour encodes the
    shooting percentage there. Two-point and three-point shots use separate
    colour scales, each with its own colour bar.

    Parameters
    ----------
    player_name : str
        Compared for equality against the ``name`` column of the module-level
        ``df``.
    gridNum : int, optional
        Hexbin grid size passed to ``find_shootingPcts``.

    Returns
    -------
    None
        Shows the matplotlib figure, then calls ``shot_recommender``.
    """
    from matplotlib.patches import Circle

    plot_size = (10, 8)
    shot_df = df[df.name == player_name]

    #compute shooting percentage and # of shots (binned once for both shot types)
    (ShootingPctLocs2, shotNumber2,
     ShootingPctLocs3, shotNumber3) = find_shootingPcts(shot_df, gridNum)

    #draw figure and court
    fig = plt.figure(figsize=plot_size)
    ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
    draw_court(outer_lines=False)
    plt.xlim(-250, 250)
    plt.ylim(400, -25)

    #draw player image
    # builtin float replaces np.float, which no longer exists in NumPy >= 1.24
    zoom = float(plot_size[0])/(12.0*2) #how much to zoom the player's pic; tied to figure size
    # NOTE(review): this passes the whole player_id column, not a scalar id -
    # confirm that is what acquire_playerPic expects.
    img = acquire_playerPic(shot_df.player_id, zoom)
    ax.add_artist(img)

    max_radius = 240/gridNum

    def _draw_bubbles(pct_locs, shot_counts, colormap):
        """Add one circle per hexbin: size = shot count, colour = shooting pct."""
        centers = shot_counts.get_offsets()
        counts = shot_counts.get_array()
        for i, pct in enumerate(pct_locs):
            bubble = Circle(centers[i], radius=min(counts[i], max_radius),
                            color=colormap(pct), alpha=1, fill=True)
            ax.add_patch(bubble)

    ############################################ TWO POINTERS #################################################
    cmap = mymap.from_list('Color Map',[(0,'#ff0000'),(.45,'#ffff00'),(.6,'#00ff00'), (1,'#004d00')])
    _draw_bubbles(ShootingPctLocs2, shotNumber2, cmap)

    #draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2,cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %', labelpad=20)
    cb.set_ticks([0.0, 0.25, .485, 0.75, 1.0])
    cb.set_ticklabels(['0%','25%','48.5%\nLg Avg', '75%', '100%'])

    ########################################### THREE POINTERS ################################################
    #plotting 3 pointers separately to account for expected lower fg% from deep
    cmap3 = mymap.from_list('Color Map',[(0,'#ff0000'),(.35,'#ffff00'),(.6,'#00ff00'),(1,'#004d00')])
    _draw_bubbles(ShootingPctLocs3, shotNumber3, cmap3)

    #draw color bar
    ax3 = fig.add_axes([1.1, 0.1, 0.02, 0.8])
    cb3 = matplotlib.colorbar.ColorbarBase(ax3,cmap=cmap3, orientation='vertical')
    cb3.set_label('Three Point %',labelpad=-8)
    cb3.set_ticks([0.0, 0.25,.35, 0.5, 0.75, 1.0])
    cb3.set_ticklabels(['0%','25%','35% - Lg Avg', '50%','75%', '100%'])

    ax.set_title(shot_df.name.unique()[0] +' - Shot Chart 2014-15')
    #plot season stats
    ax.text(135,395,get_season_stats(player_name)[1])

    plt.show()
    shot_recommender(player_name)
########################--GROUPED SHOT CHART--########################
def grouped_plot(feature):
    """Plot every shot in the module-level ``df`` on the court, one trace per group.

    Shots are grouped by the ``feature`` column and each group is drawn as
    its own plotly ``Scattergl`` trace, labelled with the group key. The
    palette is cycled, so features with more groups than palette entries
    reuse colours instead of raising ``IndexError``.

    Parameters
    ----------
    feature : str
        Name of the ``df`` column to group by; also used in the chart title.

    Returns
    -------
    None
        The figure is rendered with ``plotly.offline.iplot``.
    """
    color_list = ['aliceblue', 'aqua', 'steelblue','violet', 'blue',
                  'blueviolet', 'brown', 'cadetblue',
                  'chartreuse', 'darkgreen', 'darkmagenta', 'tomato',
                  'gold', 'red', 'slategray']

    data = [
        go.Scattergl(
            x = group.x,
            y = group.y,
            mode = 'markers',
            name = label,
            marker= dict(symbol='circle', size=7,
                         line={'width':1}, opacity=0.7, color=color),
            text = label,
            hoverinfo = 'text')
        for (label, group), color in zip(df.groupby(feature), itertools.cycle(color_list))
    ]

    layout = go.Layout(
        title='Shot Distribution by ' + feature.title(),
        showlegend =True,
        xaxis={'showgrid':False, 'range':[-250,250]},
        yaxis={'showgrid':False, 'range':[-47.5,500]},
        height = 600,
        width = 750,
        hovermode='closest',
        shapes=court_shapes)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename = 'Shot Zone Breakdown')
def pps_heatmap_sns(feature):
    """Show a seaborn heatmap of mean points per shot, ``feature`` value by team.

    Rows are the values of ``df[feature]``, columns are team names, and each
    cell is the mean of ``df.pps`` for that pair; empty cells are filled
    with 0. Reads the module-level ``df``.
    """
    mean_pps = pd.crosstab(
        index=df[feature],
        columns=df.team_name,
        values=df.pps,
        aggfunc='mean',
        margins=False,
    )
    mean_pps = mean_pps.fillna(0)

    plt.figure(figsize=(15,6))
    sns.heatmap(mean_pps, annot=False, robust=True)
    plt.show()
#########################--PERCENTAGE BAR CHART--##########################
def pct_bar_plots(feature, round_=False, player=None, team=None):
    """Plot stacked make/miss percentage bars for each value of ``feature``.

    Reads the module-level ``df``. The crosstab is built with margins, so the
    chart also contains an ``All`` bar for the whole selection.

    Parameters
    ----------
    feature : str
        Column of ``df`` to break field-goal percentage down by.
    round_ : bool, optional
        Round the feature values before tabulating. Rounding is applied
        after the player/team selection; previously the rounded frame was
        overwritten by that selection and the flag had no effect.
    player : str, optional
        Restrict to one shooter. The name is used as given when it occurs in
        ``df.name``; otherwise its ``.title()`` form is tried, so lower-case
        input still works and names such as "LeBron James" are not mangled.
    team : str, optional
        Restrict to one team (only used when ``player`` is not given); matched
        against ``df.team_name`` the same way as ``player``.

    Returns
    -------
    None
        The figure is rendered with ``plotly.offline.iplot``.
    """
    feature_label = feature.title().replace('_',' ')

    if player:
        player_key = player if (df.name==player).any() else player.title()
        df_ = df[df.name==player_key]
        title= player_key + ' - FG% by ' + feature_label
    elif team:
        team_key = team if (df.team_name==team).any() else team.title()
        df_ = df[df.team_name==team_key]
        title= team_key + ' - FG% by ' + feature_label
    else:
        df_ = df
        title= 'FG% by ' + feature_label

    if round_:
        # copy so the shared module-level frame is never modified
        df_ = df_.copy()
        df_[feature] = round(df_[feature])

    c_tab=pd.crosstab(df_[feature], df_.shot_made_flag, margins=True)
    c_tab['pct_made'] = c_tab[1]/c_tab.All
    c_tab['pct_missed'] = 1-c_tab.pct_made

    made_text= [str(round(t*100,1)) + '%' for t in c_tab.pct_made]
    missed_text= [str(round(t*100,1)) + '%' for t in c_tab.pct_missed]

    trace1 = go.Bar(
        x=c_tab.index,
        y=c_tab.pct_made,
        name='Makes',
        text= made_text,
        textposition = 'auto',
        textfont=dict(
            family='sans serif',
            size=12, color='white'),
        marker=dict(
            color='red'),
        opacity=0.75
    )
    trace2 = go.Bar(
        x=c_tab.index,
        y=c_tab.pct_missed,
        name='Misses',
        text= missed_text,
        textposition = 'auto',
        textfont=dict(
            family='sans serif',
            size=12, color='white'),
        marker=dict(
            color='blue'),
        opacity=0.75,
    )

    data = [trace1, trace2]
    layout = go.Layout(
        barmode='stack',
        title= title,
        showlegend =True,
        xaxis=dict(
            automargin=True,
            autorange=True,
            ticks='',
            showticklabels=True,
            title=feature.replace('_',' ').title()
        ),
        yaxis=dict(
            automargin=True,
            ticks='',
            showticklabels=True,
            title='FG %'
        )
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
###########################--SHOT RECOMMENDER--###########################
def player_pps(name, data=None):
    """Summarise one player's points per shot and shot share by zone.

    Parameters
    ----------
    name : str
        Compared for equality against the ``name`` column.
    data : pandas.DataFrame, optional
        Shot table with ``name``, ``zone`` and ``pps`` columns. Defaults to
        the module-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by zone, with columns ``pps`` (mean points per shot, 0 where
        undefined), ``count_`` (shots taken in the zone) and ``freq_pct``
        (share of the player's shots). Only zones holding at least 5% of the
        player's shots are kept, ordered by ``freq_pct`` then ``pps``,
        descending. Empty when the player has no shots.
    """
    shots = df if data is None else data
    player = shots[shots.name==name]

    # Name the counts explicitly: renaming the column produced by
    # value_counts() depends on the pandas version ('zone' before 2.0,
    # 'count' from 2.0 on).
    pps_by_zone = player.groupby('zone').pps
    pps_freq = pd.concat([pps_by_zone.mean().fillna(0).rename('pps'),
                          pps_by_zone.size().rename('count_')],
                         axis=1)

    pps_freq['freq_pct'] = pps_freq.count_/pps_freq.count_.sum()

    pps_freq = pps_freq.sort_values(['freq_pct', 'pps'], ascending=False)

    return pps_freq[pps_freq.freq_pct>=.05]
5 | "execution_count": 583, 6 | "metadata": { 7 | "extensions": { 8 | "jupyter_dashboards": { 9 | "version": 1, 10 | "views": { 11 | "grid_default": { 12 | "hidden": true 13 | }, 14 | "report_default": { 15 | "hidden": true 16 | } 17 | } 18 | } 19 | }, 20 | "scrolled": true 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/html": [ 26 | "" 27 | ], 28 | "text/vnd.plotly.v1+html": [ 29 | "" 30 | ] 31 | }, 32 | "metadata": {}, 33 | "output_type": "display_data" 34 | } 35 | ], 36 | "source": [ 37 | "import matplotlib\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "import seaborn as sns\n", 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "pd.set_option('display.max_columns',100)\n", 43 | "\n", 44 | "import plotly\n", 45 | "import plotly.plotly as py\n", 46 | "import plotly.graph_objs as go\n", 47 | "plotly.offline.init_notebook_mode(connected=True)\n", 48 | "\n", 49 | "import warnings\n", 50 | "warnings.filterwarnings('ignore')\n", 51 | "from court import court_shapes\n", 52 | "\n", 53 | "import ipywidgets as widgets\n", 54 | "from ipywidgets import interact\n", 55 | "\n", 56 | "import itertools, math, time" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 589, 62 | "metadata": { 63 | "extensions": { 64 | "jupyter_dashboards": { 65 | "version": 1, 66 | "views": { 67 | "grid_default": { 68 | "hidden": true 69 | }, 70 | "report_default": { 71 | "hidden": true 72 | } 73 | } 74 | } 75 | }, 76 | "scrolled": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "oct_nov_ = pd.read_csv('./data/nba_savant/oct-nov-14-15.csv')\n", 81 | "dec_ = pd.read_csv('./data/nba_savant/dec-14-15.csv')\n", 82 | "jan_ = pd.read_csv('./data/nba_savant/jan-14-15.csv')\n", 83 | "feb_ = pd.read_csv('./data/nba_savant/feb-14-15.csv')\n", 84 | "mar_ = pd.read_csv('./data/nba_savant/mar-14-15.csv')\n", 85 | "apr_ = pd.read_csv('./data/nba_savant/apr-14-15.csv')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 590, 91 | 
# Stack the monthly shot logs into one frame with a fresh 0..n-1 index.
df = pd.concat([oct_nov_, dec_, jan_, feb_, mar_, apr_], ignore_index=True)
# reverse x values to plot correctly
df['x'] = -df['x']
df['game_date'] = pd.to_datetime(df['game_date'])

stats = pd.read_excel('./data/adv-stats-14-15.xlsx', index_col=0)

# Clean up name discrepancies between two dfs
import re

# Strip every character that is not whitespace or alphanumeric (underscores
# are removed too) from the player names in the stats table.
_NON_NAME_CHARS = re.compile(r'([^\s\w]|_)+')
stats['Player'] = stats['Player'].apply(lambda x: _NON_NAME_CHARS.sub('', x))

# Series.replace returns a new column that is assigned back in one step.
# The previous `df.name[mask] = value` form is chained assignment, which
# pandas does not guarantee to write through to the parent frame.
df['name'] = df['name'].replace({
    'Jose Juan Barea': 'JJ Barea',
    'Tim Hardaway Jr': 'Tim Hardaway',
    'Charles Hayes': 'Chuck Hayes',
    'Glen Rice Jr': 'Glen Rice',
    'Louis Williams': 'Lou Williams',
})

stats['Player'] = stats['Player'].replace({
    'Nene Hilario': 'Nene',
    'Jeffery Taylor': 'Jeff Taylor',
    'Luigi Datome': 'Gigi Datome',
})

# --- Disabled: defender-name normalisation (kept for reference) ---
# #convert defender name to first name last name format
# df.defender_name[df.defender_name.isnull()] = 'None'

# def convert_defender_names(player):
#     if player =='None':
#         return 'None'
#     elif player=='Nene':
#         return 'Nene'
#     else:
#         name = player.split(', ')
#         full_name = ' '.join((name[1],name[0]))
#         return re.sub(r'([^\s\w]|_)+', '', full_name)

# df.defender_name = df.defender_name.apply(convert_defender_names)

# # Clean up name discrepancies between two dfs
# df.defender_name[df.defender_name=='Jose Juan Barea'] = 'JJ Barea'
# df.defender_name[df.defender_name=='Tim Hardaway Jr'] = 'Tim Hardaway'
# df.defender_name[df.defender_name=='Charles Hayes'] = 'Chuck Hayes'
# df.defender_name[df.defender_name=='Glen Rice Jr'] = 'Glen Rice'
# df.defender_name[df.defender_name=='Louis Williams'] = 'Lou Williams'

# --- Disabled: defender id lookup (kept for reference) ---
# #map player ids to new df column matching to defender name
# player_ids_df = df[['name','player_id']].rename(columns={'name':'defender_name','player_id':'defender_id'})
# player_ids_df = player_ids_df.groupby('defender_name').max()

# df = df.merge(player_ids_df, on='defender_name')
227 | }, 228 | "report_default": { 229 | "hidden": true 230 | } 231 | } 232 | } 233 | }, 234 | "scrolled": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "df.shot_type = np.where(df.shot_type=='2PT Field Goal', 2, 3)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 596, 244 | "metadata": { 245 | "extensions": { 246 | "jupyter_dashboards": { 247 | "version": 1, 248 | "views": { 249 | "grid_default": { 250 | "hidden": true 251 | }, 252 | "report_default": {} 253 | } 254 | } 255 | }, 256 | "scrolled": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "def get_shot_distance(x,y):\n", 261 | " x_squared=x**2\n", 262 | " y_squared=y**2\n", 263 | " shot_distance = math.sqrt(x_squared + y_squared) / 10 # unit for distance is off by factor of 10, divide by 10 to convert to feet\n", 264 | " return round(shot_distance, 1)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 598, 270 | "metadata": { 271 | "extensions": { 272 | "jupyter_dashboards": { 273 | "version": 1, 274 | "views": { 275 | "grid_default": { 276 | "hidden": true 277 | }, 278 | "report_default": {} 279 | } 280 | } 281 | }, 282 | "scrolled": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "def get_shot_zone(row):\n", 287 | " x = row.x\n", 288 | " y = row.y\n", 289 | " \n", 290 | " shot_zone = ''\n", 291 | " shot_area = ''\n", 292 | " \n", 293 | " #restricted area, shots within 4ft of hoop\n", 294 | " if get_shot_distance(x,y)<=4:\n", 295 | " shot_zone = 'Restricted Area'\n", 296 | " \n", 297 | " #abov break 3 pointers\n", 298 | " elif (get_shot_distance(x,y)>=23.9) & (y>=92.5):\n", 299 | " shot_zone = 'Above Break 3'\n", 300 | " #corner 3s \n", 301 | " elif (y<92.5) & ((x<=-220) | (x>=220)):\n", 302 | " shot_zone = 'Corner 3'\n", 303 | " #in the paint shots excluding restricted area \n", 304 | " elif (-80<=x<=80) & (-47.5<=y<=143.5) & (get_shot_distance(x,y)>4):\n", 305 | " shot_zone = 'Paint'\n", 306 | " #mid range shots, 
left and right side\n", 307 | " elif (get_shot_distance(x,y)<23.9) & ((-22035:\n", 315 | " shot_zone = 'Heave'\n", 316 | " \n", 317 | " #Get area of court (left, right, or center)\n", 318 | " if shot_zone !='Paint':\n", 319 | " if (x <= 80) & (x>=-80):\n", 320 | " shot_area = 'C'\n", 321 | " elif (x>80):\n", 322 | " shot_area = 'L'\n", 323 | " else:\n", 324 | " shot_area = 'R' \n", 325 | " #for shots in paint, they have special designation for left, right, and center\n", 326 | " else:\n", 327 | " if x>40:\n", 328 | " shot_area = 'L'\n", 329 | " elif x<-40:\n", 330 | " shot_area = 'R'\n", 331 | " else:\n", 332 | " shot_area = 'C'\n", 333 | " return shot_zone, shot_area" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 599, 339 | "metadata": { 340 | "extensions": { 341 | "jupyter_dashboards": { 342 | "version": 1, 343 | "views": { 344 | "grid_default": { 345 | "hidden": true 346 | }, 347 | "report_default": {} 348 | } 349 | } 350 | }, 351 | "scrolled": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "def add_shot_zones_area_to_df(df):\n", 356 | " shot_zones = []\n", 357 | " shot_areas = []\n", 358 | "\n", 359 | " for index, row in df.iterrows():\n", 360 | " shot_zones.append(get_shot_zone(row)[0])\n", 361 | " shot_areas.append(get_shot_zone(row)[1])\n", 362 | "\n", 363 | " df['shot_zone'] = shot_zones\n", 364 | " df['shot_area'] = shot_areas\n", 365 | "\n", 366 | "add_shot_zones_area_to_df(df) " 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 600, 372 | "metadata": { 373 | "extensions": { 374 | "jupyter_dashboards": { 375 | "version": 1, 376 | "views": { 377 | "grid_default": { 378 | "col": 0, 379 | "height": 10, 380 | "hidden": false, 381 | "row": 61, 382 | "width": 12 383 | }, 384 | "report_default": {} 385 | } 386 | } 387 | }, 388 | "scrolled": true 389 | }, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/html": [ 394 | "
\n", 395 | "\n", 408 | "\n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " 
\n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | "
nameteam_namegame_dateseasonespn_player_idteam_idespn_game_idperiodminutes_remainingseconds_remainingshot_made_flagaction_typeshot_typeshot_distanceopponentxydribblestouch_timedefender_namedefender_distanceshot_clockshot_zoneshot_area
205545Evan TurnerBoston Celtics2015-04-0820144239.01610612738400579456.0110291Turnaround Jump Shot213Detroit Pistons1146454.3Monroe, Greg4.98.0Mid RangeL
205546PJ TuckerPhoenix Suns2015-04-0820143033.01610612756400579463.019230Turnaround Jump Shot27Dallas Mavericks-732611.6Rondo, Rajon2.917.7PaintR
205547Dion WaitersOklahoma City Thunder2015-04-0120146628.01610612760NaN110370Turnaround Jump Shot26Dallas Mavericks-67-235.1Nowitzki, Dirk2.511.2PaintR
205548Dante ExumUtah Jazz2015-04-0820143102528.01610612762400579462.012580Turnaround Jump Shot28Sacramento Kings714845.5Landry, Carl4.414.3PaintL
205549Jason SmithNew York Knicks2015-04-0820143232.01610612752400579457.023321Turnaround Jump Shot27Indiana Pacers73-2445.1Allen, Lavoy4.73.7PaintL
\n", 576 | "
" 577 | ], 578 | "text/plain": [ 579 | " name team_name game_date season \\\n", 580 | "205545 Evan Turner Boston Celtics 2015-04-08 2014 \n", 581 | "205546 PJ Tucker Phoenix Suns 2015-04-08 2014 \n", 582 | "205547 Dion Waiters Oklahoma City Thunder 2015-04-01 2014 \n", 583 | "205548 Dante Exum Utah Jazz 2015-04-08 2014 \n", 584 | "205549 Jason Smith New York Knicks 2015-04-08 2014 \n", 585 | "\n", 586 | " espn_player_id team_id espn_game_id period minutes_remaining \\\n", 587 | "205545 4239.0 1610612738 400579456.0 1 10 \n", 588 | "205546 3033.0 1610612756 400579463.0 1 9 \n", 589 | "205547 6628.0 1610612760 NaN 1 10 \n", 590 | "205548 3102528.0 1610612762 400579462.0 1 2 \n", 591 | "205549 3232.0 1610612752 400579457.0 2 3 \n", 592 | "\n", 593 | " seconds_remaining shot_made_flag action_type shot_type \\\n", 594 | "205545 29 1 Turnaround Jump Shot 2 \n", 595 | "205546 23 0 Turnaround Jump Shot 2 \n", 596 | "205547 37 0 Turnaround Jump Shot 2 \n", 597 | "205548 58 0 Turnaround Jump Shot 2 \n", 598 | "205549 32 1 Turnaround Jump Shot 2 \n", 599 | "\n", 600 | " shot_distance opponent x y dribbles touch_time \\\n", 601 | "205545 13 Detroit Pistons 114 64 5 4.3 \n", 602 | "205546 7 Dallas Mavericks -73 26 1 1.6 \n", 603 | "205547 6 Dallas Mavericks -67 -2 3 5.1 \n", 604 | "205548 8 Sacramento Kings 71 48 4 5.5 \n", 605 | "205549 7 Indiana Pacers 73 -24 4 5.1 \n", 606 | "\n", 607 | " defender_name defender_distance shot_clock shot_zone shot_area \n", 608 | "205545 Monroe, Greg 4.9 8.0 Mid Range L \n", 609 | "205546 Rondo, Rajon 2.9 17.7 Paint R \n", 610 | "205547 Nowitzki, Dirk 2.5 11.2 Paint R \n", 611 | "205548 Landry, Carl 4.4 14.3 Paint L \n", 612 | "205549 Allen, Lavoy 4.7 3.7 Paint L " 613 | ] 614 | }, 615 | "execution_count": 600, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "df.tail()" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 601, 627 | "metadata": { 628 | "extensions": { 
def get_lg_avgs(shot_zone_area_tup, df):
    """Return the league make rate for one (shot_zone, shot_area) pair.

    Parameters
    ----------
    shot_zone_area_tup : sequence
        ``(shot_zone, shot_area)``; only the first two items are read.
    df : pandas.DataFrame
        Shot log with ``shot_zone``, ``shot_area`` and ``shot_made_flag``.

    Returns
    -------
    float or int
        Made shots divided by attempted shots, rounded to 4 decimals, or
        ``0`` when the pair has no attempts.
    """
    zone = shot_zone_area_tup[0]
    area = shot_zone_area_tup[1]
    # Build the zone/area mask once and reuse it for attempts and makes.
    in_cell = (df.shot_zone == zone) & (df.shot_area == area)
    total_shots = int(in_cell.sum())
    if total_shots == 0:
        return 0
    shots_made = int((in_cell & (df.shot_made_flag == 1)).sum())
    return round(shots_made / total_shots, 4)


# Distinct zones/areas are read from the columns written by
# add_shot_zones_area_to_df(); the lists that function builds are local to
# it and are not visible at this level.
sz = set(df.shot_zone)
sa = set(df.shot_area)
sza_tups = list(itertools.product(sz, sa))

# League make rate for every zone/area combination (0 for unused pairs).
sza_dict = {sza: get_lg_avgs(sza, df) for sza in sza_tups}


def add_lg_avg_to_df(df):
    """Add an ``lg_avg`` column holding each shot's league make rate.

    The value is looked up in the module-level ``sza_dict`` by the row's
    ``(shot_zone, shot_area)``; rows whose pair is missing from the
    dictionary get 0. The frame is modified in place and nothing is
    returned.
    """
    pairs = zip(df.shot_zone, df.shot_area)
    # Single pass over the rows instead of one full-column rewrite per key.
    df['lg_avg'] = np.fromiter((sza_dict.get(pair, 0) for pair in pairs),
                               dtype=float, count=len(df))


add_lg_avg_to_df(df)
def create_team_ids(df):
    """Replace ``team_id`` and add ``opp_id`` with small sequential ids.

    Ids start at 1 and follow the alphabetical order of ``team_name``, so
    the same data always yields the same numbering (iterating a ``set`` of
    strings does not guarantee that across interpreter sessions).

    Parameters
    ----------
    df : pandas.DataFrame
        Shot log with ``team_name``, ``opponent`` and ``team_id`` columns.
        Modified in place; opponents that match no team name keep
        ``opp_id`` 0.

    Returns
    -------
    dict
        Mapping of team name to the id that was assigned.
    """
    team_names = sorted(df.team_name.dropna().unique())
    team_id_dict = {team: id_ for id_, team in enumerate(team_names, start=1)}

    df['opp_id'] = 0
    # get team ids from 1 to the number of teams
    for team, id_ in team_id_dict.items():
        df['team_id'] = np.where(df.team_name == team, id_, df['team_id'])
        df['opp_id'] = np.where(df.opponent == team, id_, df['opp_id'])
    return team_id_dict


create_team_ids(df)

#df.groupby(by=['game_date','team_id','opp_id']).mean()

nba_shots = pd.read_csv('./data/shots_1415.csv', index_col=0)
nba_shots.GAME_DATE = nba_shots.GAME_DATE.astype('str')


# Adds dashes to date string so it can be converted to datetime format
def add_dashes(string):
    """Turn a ``YYYYMMDD`` string into ``YYYY-MM-DD``.

    The day is taken from the last two characters of ``string``.
    """
    return '-'.join((string[:4], string[4:6], string[-2:]))


nba_shots.GAME_DATE = nba_shots.GAME_DATE.apply(add_dashes)
nba_shots.GAME_DATE = pd.to_datetime(nba_shots.GAME_DATE)
# Mirror x, as was done for the tracking data, so coordinates line up.
nba_shots.LOC_X = -nba_shots.LOC_X

### Merge Dataframes

# Inner join: a shot is kept only when team, date, game clock and court
# coordinates all agree between the two sources.
merged_df = df.merge(nba_shots, left_on=['team_name','game_date','period','minutes_remaining','seconds_remaining','x','y'],
                     right_on=['TEAM_NAME','GAME_DATE','PERIOD','MINUTES_REMAINING','SECONDS_REMAINING','LOC_X','LOC_Y'])

merged_df = merged_df.drop(columns=['GRID_TYPE','PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING',
                                    'SECONDS_REMAINING','SHOT_DISTANCE','LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE',
                                    'espn_player_id', 'espn_game_id', 'EVENT_TYPE','ACTION_TYPE', 'SHOT_TYPE','SHOT_ZONE_BASIC',
                                    'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE'])


# get dictionary matching team names to home and away team acronyms
def create_home_acronym_dict():
    """Map each full team name in ``merged_df`` to its ``HTM`` acronym.

    Names and acronyms are paired by sorting both alphabetically and
    zipping them, then the two entries that sort differently (Boston and
    Brooklyn) are set explicitly.

    NOTE(review): the pairing is positional, so it presumably relies on
    every team appearing in both ``team_name`` and ``HTM`` -- confirm before
    using this on a partial season.
    """
    team_acronyms = sorted(merged_df.HTM.unique())
    team_names = sorted(merged_df.team_name.unique())

    team_name_ac_dict = dict(zip(team_names, team_acronyms))
    team_name_ac_dict['Boston Celtics'] = 'BOS'
    team_name_ac_dict['Brooklyn Nets'] = 'BKN'
    return team_name_ac_dict
def get_home_team():
    """Return a 0/1 list marking shots taken by the home team.

    A row of ``merged_df`` is flagged 1 when the acronym of its
    ``team_name`` (from ``create_home_acronym_dict``) equals ``HTM``.

    Raises
    ------
    KeyError
        If a ``team_name`` has no acronym in the dictionary.
    """
    team_name_ac_dict = create_home_acronym_dict()
    shooting_team = merged_df.team_name.map(team_name_ac_dict)

    # A per-row dictionary lookup would raise on an unknown team; keep that
    # failure instead of silently flagging those rows as away.
    unknown = merged_df.team_name[shooting_team.isnull()].unique()
    if len(unknown) > 0:
        raise KeyError(unknown[0])

    return (shooting_team == merged_df.HTM).astype(int).tolist()


merged_df['is_home'] = get_home_team()

# sort the dataframe by date, game_id, player_name, and game_event_id
sorted_df = merged_df.sort_values(
    by=['game_date', 'GAME_ID', 'name', 'GAME_EVENT_ID']).reset_index(drop=True)


# adds to dataframe whether player has hit previous 1, 2, or 3 shots
def is_player_hot(dataframe):
    """Flag whether the shooter made his previous 1, 2 and 3 shots.

    Rows are compared with the rows directly above them, so ``dataframe``
    must already be ordered so that one player's shots within one game are
    consecutive and chronological (as ``sorted_df`` is).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs ``name``, ``GAME_ID`` and ``shot_made_flag`` columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(dataframe), 3)``. Column ``k`` is 1.0
        when each of the ``k + 1`` rows immediately above has the same
        ``name`` and ``GAME_ID`` as the current row and
        ``shot_made_flag == 1``, otherwise 0.0.
    """
    n_rows = len(dataframe)
    made = dataframe['shot_made_flag'].values == 1
    names = dataframe['name'].values
    games = dataframe['GAME_ID'].values

    heat_check_array = np.zeros((n_rows, 3))
    # streak[i] stays True only while every lag examined so far was a make
    # by the same player in the same game.
    streak = np.ones(n_rows, dtype=bool)
    for lag in (1, 2, 3):
        made_by_same_player = np.zeros(n_rows, dtype=bool)
        if n_rows > lag:
            made_by_same_player[lag:] = (
                made[:-lag]
                & (names[lag:] == names[:-lag])
                & (games[lag:] == games[:-lag])
            )
        streak &= made_by_same_player
        heat_check_array[:, lag - 1] = streak

    return heat_check_array
5539 iterations remaining.\n" 917 | ] 918 | } 919 | ], 920 | "source": [ 921 | "heat_check_array = is_player_hot(sorted_df)" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": 632, 927 | "metadata": {}, 928 | "outputs": [ 929 | { 930 | "data": { 931 | "text/html": [ 932 | "
\n", 933 | "\n", 946 | "\n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | "
nameshot_made_flagprev_shot_madeprev_2_madeprev_3_madegame_dateGAME_EVENT_ID
210Cory Joseph10.00.00.02014-10-28380
211Cory Joseph11.00.00.02014-10-28387
212Danny Green00.00.00.02014-10-289
213Danny Green10.00.00.02014-10-2815
214Danny Green11.00.00.02014-10-28102
215Danny Green11.01.00.02014-10-28132
216Danny Green01.01.01.02014-10-28150
217Danny Green00.00.00.02014-10-28175
218Danny Green10.00.00.02014-10-28259
219Danny Green01.00.00.02014-10-28284
\n", 1062 | "
" 1063 | ], 1064 | "text/plain": [ 1065 | " name shot_made_flag prev_shot_made prev_2_made prev_3_made \\\n", 1066 | "210 Cory Joseph 1 0.0 0.0 0.0 \n", 1067 | "211 Cory Joseph 1 1.0 0.0 0.0 \n", 1068 | "212 Danny Green 0 0.0 0.0 0.0 \n", 1069 | "213 Danny Green 1 0.0 0.0 0.0 \n", 1070 | "214 Danny Green 1 1.0 0.0 0.0 \n", 1071 | "215 Danny Green 1 1.0 1.0 0.0 \n", 1072 | "216 Danny Green 0 1.0 1.0 1.0 \n", 1073 | "217 Danny Green 0 0.0 0.0 0.0 \n", 1074 | "218 Danny Green 1 0.0 0.0 0.0 \n", 1075 | "219 Danny Green 0 1.0 0.0 0.0 \n", 1076 | "\n", 1077 | " game_date GAME_EVENT_ID \n", 1078 | "210 2014-10-28 380 \n", 1079 | "211 2014-10-28 387 \n", 1080 | "212 2014-10-28 9 \n", 1081 | "213 2014-10-28 15 \n", 1082 | "214 2014-10-28 102 \n", 1083 | "215 2014-10-28 132 \n", 1084 | "216 2014-10-28 150 \n", 1085 | "217 2014-10-28 175 \n", 1086 | "218 2014-10-28 259 \n", 1087 | "219 2014-10-28 284 " 1088 | ] 1089 | }, 1090 | "execution_count": 632, 1091 | "metadata": {}, 1092 | "output_type": "execute_result" 1093 | } 1094 | ], 1095 | "source": [ 1096 | "#add heat check stats to dataframe\n", 1097 | "sorted_df['prev_shot_made'] = heat_check_array[:,0]\n", 1098 | "sorted_df['prev_2_made'] = heat_check_array[:,1]\n", 1099 | "sorted_df['prev_3_made'] = heat_check_array[:,2]\n", 1100 | "sorted_df[210:220][['name','shot_made_flag','prev_shot_made','prev_2_made','prev_3_made','game_date','GAME_EVENT_ID']]" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": 37, 1106 | "metadata": { 1107 | "scrolled": true 1108 | }, 1109 | "outputs": [ 1110 | { 1111 | "data": { 1112 | "text/html": [ 1113 | "
\n", 1114 | "\n", 1127 | "\n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " 
\n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | "
nameteam_namegame_dateseasonteam_idperiodminutes_remainingseconds_remainingshot_made_flagaction_typeshot_typeshot_distanceopponentxydribblestouch_timedefender_namedefender_distanceshot_clockshot_zoneshot_arealg_avgopp_idGAME_IDGAME_EVENT_IDPLAYER_IDHTMVTMis_homeprev_shot_madeprev_2_madeprev_3_made
0Aaron GordonOrlando Magic2014-10-28201410211341Jump Shot24New Orleans Pelicans-104445.1Anderson, Ryan3.90.6PaintC0.40111321400001164203932NOPORL0000
1Aaron GordonOrlando Magic2014-10-2820141029131Jump Shot323New Orleans Pelicans-2332000.7Evans, Tyreke4.37.4Corner 3R0.39151321400001198203932NOPORL0100
2Aaron GordonOrlando Magic2014-10-2820141022550Jump Shot323New Orleans Pelicans-234000.9Gordon, Eric12.514.8Corner 3R0.39151321400001275203932NOPORL0110
3Aaron GordonOrlando Magic2014-10-282014103511Jump Shot25New Orleans Pelicans-95822.6Asik, Omer3.58.3PaintC0.40111321400001381203932NOPORL0000
4Aaron GordonOrlando Magic2014-10-2820141045580Jump Shot211New Orleans Pelicans4610576.2Davis, Anthony4.81.5PaintL0.38411321400001524203932NOPORL0100
\n", 1349 | "
" 1350 | ], 1351 | "text/plain": [ 1352 | " name team_name game_date season team_id period \\\n", 1353 | "0 Aaron Gordon Orlando Magic 2014-10-28 2014 10 2 \n", 1354 | "1 Aaron Gordon Orlando Magic 2014-10-28 2014 10 2 \n", 1355 | "2 Aaron Gordon Orlando Magic 2014-10-28 2014 10 2 \n", 1356 | "3 Aaron Gordon Orlando Magic 2014-10-28 2014 10 3 \n", 1357 | "4 Aaron Gordon Orlando Magic 2014-10-28 2014 10 4 \n", 1358 | "\n", 1359 | " minutes_remaining seconds_remaining shot_made_flag action_type \\\n", 1360 | "0 11 34 1 Jump Shot \n", 1361 | "1 9 13 1 Jump Shot \n", 1362 | "2 2 55 0 Jump Shot \n", 1363 | "3 5 1 1 Jump Shot \n", 1364 | "4 5 58 0 Jump Shot \n", 1365 | "\n", 1366 | " shot_type shot_distance opponent x y dribbles \\\n", 1367 | "0 2 4 New Orleans Pelicans -10 44 4 \n", 1368 | "1 3 23 New Orleans Pelicans -233 20 0 \n", 1369 | "2 3 23 New Orleans Pelicans -234 0 0 \n", 1370 | "3 2 5 New Orleans Pelicans -9 58 2 \n", 1371 | "4 2 11 New Orleans Pelicans 46 105 7 \n", 1372 | "\n", 1373 | " touch_time defender_name defender_distance shot_clock shot_zone \\\n", 1374 | "0 5.1 Anderson, Ryan 3.9 0.6 Paint \n", 1375 | "1 0.7 Evans, Tyreke 4.3 7.4 Corner 3 \n", 1376 | "2 0.9 Gordon, Eric 12.5 14.8 Corner 3 \n", 1377 | "3 2.6 Asik, Omer 3.5 8.3 Paint \n", 1378 | "4 6.2 Davis, Anthony 4.8 1.5 Paint \n", 1379 | "\n", 1380 | " shot_area lg_avg opp_id GAME_ID GAME_EVENT_ID PLAYER_ID HTM VTM \\\n", 1381 | "0 C 0.4011 13 21400001 164 203932 NOP ORL \n", 1382 | "1 R 0.3915 13 21400001 198 203932 NOP ORL \n", 1383 | "2 R 0.3915 13 21400001 275 203932 NOP ORL \n", 1384 | "3 C 0.4011 13 21400001 381 203932 NOP ORL \n", 1385 | "4 L 0.3841 13 21400001 524 203932 NOP ORL \n", 1386 | "\n", 1387 | " is_home prev_shot_made prev_2_made prev_3_made \n", 1388 | "0 0 0 0 0 \n", 1389 | "1 0 1 0 0 \n", 1390 | "2 0 1 1 0 \n", 1391 | "3 0 0 0 0 \n", 1392 | "4 0 1 0 0 " 1393 | ] 1394 | }, 1395 | "execution_count": 37, 1396 | "metadata": {}, 1397 | "output_type": "execute_result" 1398 | } 
1399 | ], 1400 | "source": [ 1401 | "sorted_df.head()" 1402 | ] 1403 | }, 1404 | { 1405 | "cell_type": "code", 1406 | "execution_count": 39, 1407 | "metadata": {}, 1408 | "outputs": [], 1409 | "source": [ 1410 | "positions = stats[['Player','Pos','Age']]" 1411 | ] 1412 | }, 1413 | { 1414 | "cell_type": "code", 1415 | "execution_count": 46, 1416 | "metadata": {}, 1417 | "outputs": [], 1418 | "source": [ 1419 | "sorted_df = sorted_df.merge(positions, left_on='name', right_on='Player').drop(columns=['Player'])\n", 1420 | "sorted_df.columns = map(str.lower, sorted_df.columns)" 1421 | ] 1422 | }, 1423 | { 1424 | "cell_type": "code", 1425 | "execution_count": 55, 1426 | "metadata": {}, 1427 | "outputs": [], 1428 | "source": [ 1429 | "#rearrange columns for better visability\n", 1430 | "sorted_df = sorted_df[['name','pos','age','player_id', 'team_name', 'team_id', 'game_date',\n", 1431 | " 'game_id', 'game_event_id','season', 'period',\n", 1432 | " 'minutes_remaining', 'seconds_remaining', 'shot_made_flag',\n", 1433 | " 'action_type', 'shot_type', 'shot_distance', 'x', 'y',\n", 1434 | " 'dribbles', 'touch_time', 'opponent', 'opp_id', 'defender_name', 'defender_distance',\n", 1435 | " 'shot_clock', 'shot_zone', 'shot_area', 'lg_avg','htm', 'vtm',\n", 1436 | " 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made']]" 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": 58, 1442 | "metadata": { 1443 | "scrolled": true 1444 | }, 1445 | "outputs": [ 1446 | { 1447 | "data": { 1448 | "text/html": [ 1449 | "
\n", 1450 | "\n", 1463 | "\n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " 
\n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | "
nameposageplayer_idteam_nameteam_idgame_dategame_idgame_event_idseasonperiodminutes_remainingseconds_remainingshot_made_flagaction_typeshot_typeshot_distancexydribblestouch_timeopponentopp_iddefender_namedefender_distanceshot_clockshot_zoneshot_arealg_avghtmvtmis_homeprev_shot_madeprev_2_madeprev_3_made
205534Vander BlueSG22203505Los Angeles Lakers222015-04-1521401230508201445251Turnaround Jump Shot22012516501.1Sacramento Kings11Stockton, David9.68.7Mid RangeL0.3925LALSAC1000
205535Vander BlueSG22203505Los Angeles Lakers222015-04-152140123052120144440Jump Shot216109126109.3Sacramento Kings11Stockton, David3.112.7Mid RangeL0.3925LALSAC1100
205536Vander BlueSG22203505Los Angeles Lakers222015-04-152140123056520144180Running Jump Shot2165115477.9Sacramento Kings11Stockton, David1.414.2Mid RangeC0.3994LALSAC1000
205537Jamaal FranklinSG23203479Denver Nuggets192015-04-1521401229500201445331Pullup Jump shot3265925712.7Golden State Warriors15Livingston, Shaun3.514.0Above Break 3C0.3415GSWDEN0000
205538Jamaal FranklinSG23203479Denver Nuggets192015-04-152140122956320144280Pullup Jump shot326-7225211.9Golden State Warriors15Rush, Brandon4.211.8Above Break 3C0.3415GSWDEN0100
\n", 1697 | "
" 1698 | ], 1699 | "text/plain": [ 1700 | " name pos age player_id team_name team_id \\\n", 1701 | "205534 Vander Blue SG 22 203505 Los Angeles Lakers 22 \n", 1702 | "205535 Vander Blue SG 22 203505 Los Angeles Lakers 22 \n", 1703 | "205536 Vander Blue SG 22 203505 Los Angeles Lakers 22 \n", 1704 | "205537 Jamaal Franklin SG 23 203479 Denver Nuggets 19 \n", 1705 | "205538 Jamaal Franklin SG 23 203479 Denver Nuggets 19 \n", 1706 | "\n", 1707 | " game_date game_id game_event_id season period minutes_remaining \\\n", 1708 | "205534 2015-04-15 21401230 508 2014 4 5 \n", 1709 | "205535 2015-04-15 21401230 521 2014 4 4 \n", 1710 | "205536 2015-04-15 21401230 565 2014 4 1 \n", 1711 | "205537 2015-04-15 21401229 500 2014 4 5 \n", 1712 | "205538 2015-04-15 21401229 563 2014 4 2 \n", 1713 | "\n", 1714 | " seconds_remaining shot_made_flag action_type shot_type \\\n", 1715 | "205534 25 1 Turnaround Jump Shot 2 \n", 1716 | "205535 4 0 Jump Shot 2 \n", 1717 | "205536 8 0 Running Jump Shot 2 \n", 1718 | "205537 33 1 Pullup Jump shot 3 \n", 1719 | "205538 8 0 Pullup Jump shot 3 \n", 1720 | "\n", 1721 | " shot_distance x y dribbles touch_time opponent \\\n", 1722 | "205534 20 125 165 0 1.1 Sacramento Kings \n", 1723 | "205535 16 109 126 10 9.3 Sacramento Kings \n", 1724 | "205536 16 51 154 7 7.9 Sacramento Kings \n", 1725 | "205537 26 59 257 1 2.7 Golden State Warriors \n", 1726 | "205538 26 -72 252 1 1.9 Golden State Warriors \n", 1727 | "\n", 1728 | " opp_id defender_name defender_distance shot_clock \\\n", 1729 | "205534 11 Stockton, David 9.6 8.7 \n", 1730 | "205535 11 Stockton, David 3.1 12.7 \n", 1731 | "205536 11 Stockton, David 1.4 14.2 \n", 1732 | "205537 15 Livingston, Shaun 3.5 14.0 \n", 1733 | "205538 15 Rush, Brandon 4.2 11.8 \n", 1734 | "\n", 1735 | " shot_zone shot_area lg_avg htm vtm is_home prev_shot_made \\\n", 1736 | "205534 Mid Range L 0.3925 LAL SAC 1 0 \n", 1737 | "205535 Mid Range L 0.3925 LAL SAC 1 1 \n", 1738 | "205536 Mid Range C 0.3994 LAL SAC 1 0 \n", 
sorted_df.tail()

# ## Final cleaning and export

# Clean positions down to the 5 standard positions (no combos).
#
# Each assignment goes through a single indexing operation on the frame
# itself. The previous form, `sorted_df.pos[mask] = value`, is chained
# indexing: pandas may apply the write to a temporary copy, which raises
# SettingWithCopyWarning and silently does nothing when Copy-on-Write is on.
sorted_df.loc[sorted_df['name'] == 'Giannis Antetokounmpo', 'pos'] = 'SF'

# Combo listings collapse to one position. The mapping is unchanged from the
# original cell; no target value is itself a combo, so applying the whole
# mapping at once gives the same result as the one-at-a-time assignments.
sorted_df['pos'] = sorted_df['pos'].replace({
    'PG-SG': 'SG',
    'SF-SG': 'SF',
    'SG-PG': 'PG',
    'PF-SF': 'SF',
    'SF-PF': 'PF',
    'SG-SF': 'SF',
})

# players
    # name | team |
# shots
    # |player_id| zone name| area| made?
# Planning notes kept from the original cell:
# def player_shots()
    # shots[shots[player_id] == id]

# def shots_by_zone(shots):
#     """ first zone"""
#     returns {'2' = [[], , {}]}

# shots = player_shots('bob koozie')
# shots_by_zone(shots)

# iterate through every player
#     retrieve each player's shots
#         for each zone
#             retrieve shots taken
#             retrieve shots scored

# Every zone/area combination owns one "slot"; slot k is stored in output
# columns 2k (shots made) and 2k + 1 (shots attempted).
# Zones split by court side: the value is the slot of the 'R' side, with
# 'C' at +1 and any other area at +2.
_SIDE_SPLIT_ZONE_SLOTS = {'Mid Range': 0, 'Above Break 3': 5, 'Paint': 8}
_SIDE_OFFSETS = {'R': 0, 'C': 1}
# Zones counted as a whole, regardless of area.
_WHOLE_ZONE_SLOTS = {'Restricted Area': 3, 'Heave': 4}
_N_ZONE_SLOTS = 13


def _zone_slot(shot_zone, shot_area):
    """Return the slot (0-12) for a zone/area pair, or -1 for an unknown zone."""
    if shot_zone in _SIDE_SPLIT_ZONE_SLOTS:
        return _SIDE_SPLIT_ZONE_SLOTS[shot_zone] + _SIDE_OFFSETS.get(shot_area, 2)
    if shot_zone in _WHOLE_ZONE_SLOTS:
        return _WHOLE_ZONE_SLOTS[shot_zone]
    if shot_zone == 'Corner 3':
        # Corner threes only distinguish 'R' from everything else.
        return 11 if shot_area == 'R' else 12
    return -1


def get_fg_pct_by_player_for_each_zone(df):
    """Build running made/attempted counts per shot zone for every player.

    Args:
        df: DataFrame with `name`, `shot_zone`, `shot_area` and
            `shot_made_flag` columns, one row per shot.

    Returns:
        A list with one DataFrame per player, in order of first appearance
        in `df`. Each frame has one row per shot by that player (in the
        order the shots appear in `df`), is indexed by the player's name
        and has 26 float columns labelled 0-25. Columns 2k and 2k + 1 hold
        the number of shots made and attempted in slot k up to and
        including that row. Slots are ordered: mid range R/C/other,
        restricted area, heave, above-break 3 R/C/other, paint R/C/other,
        corner 3 R/other. Shots from any other zone leave the counts
        unchanged. An empty `df` yields an empty list.

    NOTE(review): each row's counts include that row's own shot. If the
    derived percentages are used to predict `shot_made_flag`, this leaks the
    outcome into the feature -- confirm this is intended.
    """
    start = time.time()
    player_names = list(df.name.unique())
    df_list = []

    for c, player in enumerate(player_names):
        df_ = df[df.name == player].reset_index(drop=True)

        if (c + 1) % 100 == 0:
            print('Runtime: {} seconds. {} of {} players completed.'.format(
                round(time.time() - start, 2), c + 1, len(player_names)))

        slots = np.array(
            [_zone_slot(zone, area)
             for zone, area in zip(df_.shot_zone, df_.shot_area)],
            dtype=int)
        counted = np.flatnonzero(slots >= 0)
        made = df_.shot_made_flag.values == 1

        # One row per shot holding only that shot's contribution; the
        # cumulative sum down the rows turns these into running totals.
        per_shot = np.zeros((len(df_), 2 * _N_ZONE_SLOTS))
        per_shot[counted, 2 * slots[counted]] = made[counted]
        per_shot[counted, 2 * slots[counted] + 1] = 1
        shot_arr = per_shot.cumsum(axis=0)

        df_list.append(pd.DataFrame(shot_arr, index=df_.name))

    print('Total Runtime: {} seconds.'.format(round(time.time() - start, 2)))
    return df_list
def add_zone_fg_pct_to_df(df):
    """Return a copy of `df` with one running FG% column per shot zone.

    The running made/attempted counts come from
    `get_fg_pct_by_player_for_each_zone`, which returns them grouped by
    player (players in order of first appearance, each player's shots in
    their original relative order). Each percentage is mapped back to the
    row of `df` it was computed from, so `df` may be in any row order.

    Args:
        df: shot-level DataFrame with the columns required by
            `get_fg_pct_by_player_for_each_zone`, including `name`.

    Returns:
        A new DataFrame with the rows, index and columns of `df` plus 13
        float columns (`mid_R_pct` ... `corner_3_L_pct`), rounded to 4
        decimals. A value is NaN while the player has no attempt yet in
        that zone (0/0).

    Raises:
        ValueError: if the per-player counts do not cover exactly the rows
            of `df` that have a player name.
    """
    column_names = ['mid_R_pct', 'mid_C_pct', 'mid_L_pct', 'restricted_pct',
                    'heave_pct', 'ab_3_R_pct', 'ab_3_C_pct', 'ab_3_L_pct',
                    'paint_R_pct', 'paint_C_pct', 'paint_L_pct',
                    'corner_3_R_pct', 'corner_3_L_pct']

    df_list = get_fg_pct_by_player_for_each_zone(df)
    if df_list:
        zone_counts = pd.concat(df_list).values
    else:
        # No players at all: keep the shapes consistent instead of letting
        # pd.concat raise on an empty list.
        zone_counts = np.zeros((0, 2 * len(column_names)))

    # Even columns are makes, odd columns are attempts. 0/0 becomes NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        zone_pct = np.round(zone_counts[:, 0::2] / zone_counts[:, 1::2], 4)

    # Positions of df's rows in the player-grouped order used above.
    # factorize numbers players by first appearance, and the stable sort
    # keeps each player's shots in their original order. Rows with a missing
    # name get code -1 and have no counts, so they are skipped.
    player_codes, _ = pd.factorize(df['name'])
    grouped_order = np.argsort(player_codes, kind='mergesort')
    grouped_order = grouped_order[player_codes[grouped_order] >= 0]
    if len(grouped_order) != len(zone_pct):
        raise ValueError(
            'Expected zone counts for {} shots but got {}.'.format(
                len(grouped_order), len(zone_pct)))

    aligned_pct = np.full((len(df), len(column_names)), np.nan)
    aligned_pct[grouped_order] = zone_pct

    # Build on the frame that was passed in, not on a module-level global.
    zone_fg_df = df.copy()
    for position, col in enumerate(column_names):
        zone_fg_df[col] = aligned_pct[:, position]

    return zone_fg_df
400 of 490 players completed.\n", 1950 | "Total Runtime: 51.96 seconds.\n" 1951 | ] 1952 | } 1953 | ], 1954 | "source": [ 1955 | "zone_fg_df = add_zone_fg_pct_to_df(sorted_df)" 1956 | ] 1957 | }, 1958 | { 1959 | "cell_type": "code", 1960 | "execution_count": 581, 1961 | "metadata": {}, 1962 | "outputs": [ 1963 | { 1964 | "data": { 1965 | "text/plain": [ 1966 | "name 0\n", 1967 | "pos 0\n", 1968 | "age 0\n", 1969 | "player_id 0\n", 1970 | "team_name 0\n", 1971 | "team_id 0\n", 1972 | "game_date 0\n", 1973 | "game_id 0\n", 1974 | "game_event_id 0\n", 1975 | "season 0\n", 1976 | "period 0\n", 1977 | "minutes_remaining 0\n", 1978 | "seconds_remaining 0\n", 1979 | "shot_made_flag 0\n", 1980 | "action_type 0\n", 1981 | "shot_type 0\n", 1982 | "shot_distance 0\n", 1983 | "x 0\n", 1984 | "y 0\n", 1985 | "dribbles 0\n", 1986 | "touch_time 0\n", 1987 | "opponent 0\n", 1988 | "opp_id 0\n", 1989 | "defender_name 0\n", 1990 | "defender_distance 0\n", 1991 | "shot_clock 0\n", 1992 | "shot_zone 0\n", 1993 | "shot_area 0\n", 1994 | "lg_avg 0\n", 1995 | "htm 0\n", 1996 | "vtm 0\n", 1997 | "is_home 0\n", 1998 | "prev_shot_made 0\n", 1999 | "prev_2_made 0\n", 2000 | "prev_3_made 0\n", 2001 | "mid_R_pct 0\n", 2002 | "mid_C_pct 0\n", 2003 | "mid_L_pct 0\n", 2004 | "restricted_pct 0\n", 2005 | "heave_pct 0\n", 2006 | "ab_3_R_pct 0\n", 2007 | "ab_3_C_pct 0\n", 2008 | "ab_3_L_pct 0\n", 2009 | "paint_R_pct 0\n", 2010 | "paint_C_pct 0\n", 2011 | "paint_L_pct 0\n", 2012 | "corner_3_R_pct 0\n", 2013 | "corner_3_L_pct 0\n", 2014 | "dtype: int64" 2015 | ] 2016 | }, 2017 | "execution_count": 581, 2018 | "metadata": {}, 2019 | "output_type": "execute_result" 2020 | } 2021 | ], 2022 | "source": [ 2023 | "#fill null values with 0\n", 2024 | "zone_fg_df = zone_fg_df.fillna(value=0)" 2025 | ] 2026 | }, 2027 | { 2028 | "cell_type": "code", 2029 | "execution_count": 582, 2030 | "metadata": {}, 2031 | "outputs": [], 2032 | "source": [ 2033 | "#export as csv\n", 2034 | 
"zone_fg_df.to_csv('./data/sorted_df_14_15.csv')" 2035 | ] 2036 | }, 2037 | { 2038 | "cell_type": "code", 2039 | "execution_count": null, 2040 | "metadata": {}, 2041 | "outputs": [], 2042 | "source": [] 2043 | } 2044 | ], 2045 | "metadata": { 2046 | "extensions": { 2047 | "jupyter_dashboards": { 2048 | "activeView": "grid_default", 2049 | "version": 1, 2050 | "views": { 2051 | "grid_default": { 2052 | "cellMargin": 10, 2053 | "defaultCellHeight": 20, 2054 | "maxColumns": 12, 2055 | "name": "grid", 2056 | "type": "grid" 2057 | }, 2058 | "report_default": { 2059 | "name": "report", 2060 | "type": "report" 2061 | } 2062 | } 2063 | } 2064 | }, 2065 | "kernelspec": { 2066 | "display_name": "Python 3", 2067 | "language": "python", 2068 | "name": "python3" 2069 | }, 2070 | "language_info": { 2071 | "codemirror_mode": { 2072 | "name": "ipython", 2073 | "version": 3 2074 | }, 2075 | "file_extension": ".py", 2076 | "mimetype": "text/x-python", 2077 | "name": "python", 2078 | "nbconvert_exporter": "python", 2079 | "pygments_lexer": "ipython3", 2080 | "version": "3.6.5" 2081 | } 2082 | }, 2083 | "nbformat": 4, 2084 | "nbformat_minor": 2 2085 | } 2086 | --------------------------------------------------------------------------------