├── images
├── gb cm.png
├── nn cm.png
├── rockets.png
├── cm logreg.png
├── fg by zone.png
├── gb feats.png
├── shot_dist.png
├── shot_zones.png
├── harden recs.png
├── team heatmap.png
├── all_roc_curves.png
├── harden heatmap.png
├── model results.png
└── sc_shot_chart.png
├── nba_shots_scraper.py
├── neural_net.py
├── README.md
├── plotly_viz.py
├── shot_chart_viz.py
├── shallow_ML_models.py
├── new_ETL.py
├── presentation.py
└── Data-Exploration.ipynb
/images/gb cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/gb cm.png
--------------------------------------------------------------------------------
/images/nn cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/nn cm.png
--------------------------------------------------------------------------------
/images/rockets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/rockets.png
--------------------------------------------------------------------------------
/images/cm logreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/cm logreg.png
--------------------------------------------------------------------------------
/images/fg by zone.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/fg by zone.png
--------------------------------------------------------------------------------
/images/gb feats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/gb feats.png
--------------------------------------------------------------------------------
/images/shot_dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/shot_dist.png
--------------------------------------------------------------------------------
/images/shot_zones.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/shot_zones.png
--------------------------------------------------------------------------------
/images/harden recs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/harden recs.png
--------------------------------------------------------------------------------
/images/team heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/team heatmap.png
--------------------------------------------------------------------------------
/images/all_roc_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/all_roc_curves.png
--------------------------------------------------------------------------------
/images/harden heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/harden heatmap.png
--------------------------------------------------------------------------------
/images/model results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/model results.png
--------------------------------------------------------------------------------
/images/sc_shot_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/sc_shot_chart.png
--------------------------------------------------------------------------------
/nba_shots_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pandas as pd
3 | import numpy as np
4 | from data.all_players_list import players_list
5 | import time
6 | from court import court_shapes
7 |
# Keep the first five fields of every player record whose 4th field is > 1990
# and whose 5th field is > 2014 (these fields are labelled 'RookieYear' and
# 'LastSeasonPlayed' in the DataFrame built below).
vets = [player[0:5] for player in players_list if (player[3] >1990) & (player[4] >2014)]

# Lookup table of the selected players, reduced to the 'ID' and 'Name' columns.
vets_df = pd.DataFrame(vets, columns=['ID', 'Name', 'Active', 'RookieYear', 'LastSeasonPlayed'])
vets_df = vets_df.drop(columns=['Active', 'RookieYear', 'LastSeasonPlayed'])

# First field of each record; used as the PlayerID query parameter when scraping.
player_ids = [player[0] for player in vets]

#MULTIPLE YEARS
# The shotchartdetail URL is assembled as
#   sc_url_1 + <season> + sc_url_2 + <player id> + sc_url_3 + <season> + sc_url_4
# i.e. the season string fills both CFPARAMS= and Season=.
sc_url_1 = 'https://stats.nba.com/stats/shotchartdetail?AheadBehind=&CFID=33&CFPARAMS='
sc_url_2 = '&ClutchTime=&Conference=&ContextFilter=&ContextMeasure=FGA&DateFrom=&DateTo=&Division=&EndPeriod=10&EndRange=28800&GROUP_ID=&GameEventID=&GameID=&GameSegment=&GroupID=&GroupMode=&GroupQuantity=5&LastNGames=0&LeagueID=00&Location=&Month=0&OnOff=&OpponentTeamID=0&Outcome=&PORound=0&Period=0&PlayerID='
sc_url_3 = '&PlayerID1=&PlayerID2=&PlayerID3=&PlayerID4=&PlayerID5=&PlayerPosition=&PointDiff=&Position=&RangeType=0&RookieYear=&Season='
sc_url_4 = '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StartPeriod=1&StartRange=0&StarterBench=&TeamID=0&VsConference=&VsDivision=&VsPlayerID1=&VsPlayerID2=&VsPlayerID3=&VsPlayerID4=&VsPlayerID5=&VsTeamID='

# Start from the default requests headers and override the user-agent with a
# desktop-browser string.
headers = requests.utils.default_headers()
headers.update({
    "user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
})
25 |
26 | #year in yyyy-yy format (i.e. '2017-18')
def get_all_players_shot_data(player_ids, year, timeout=30, delay=1):
    """Download the shot-chart JSON payload for each player in one season.

    Args:
        player_ids: iterable of player ids; each is interpolated into the
            PlayerID query parameter.
        year: season string in yyyy-yy format (e.g. '2017-18').
        timeout: seconds to wait for each HTTP response before giving up.
        delay: seconds to sleep after each request to throttle the scrape.

    Returns:
        A list with one decoded JSON object per player, in input order.

    Raises:
        requests.exceptions.RequestException: on a timeout, a connection
            failure, or a non-2xx HTTP status.
    """
    all_shots = []
    start = time.time()
    for completed, player_id in enumerate(player_ids, start=1):
        full_url = sc_url_1 + str(year) + sc_url_2 + str(player_id) + sc_url_3 + str(year) + sc_url_4
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(full_url, headers=headers, timeout=timeout)
        # Fail here on an HTTP error instead of later on an unexpected payload.
        response.raise_for_status()
        all_shots.append(response.json())
        time.sleep(delay)
        if completed % 50 == 0:
            print('Runtime: {} seconds. {} players completed'.format(time.time()-start, completed))
    return all_shots
40 |
def convert_dict_to_df(all_shot_data):
    """Flatten the per-player JSON payloads into DataFrames.

    Args:
        all_shot_data: list of decoded responses; each holds a 'resultSets'
            list whose entries expose 'headers' and 'rowSet'.

    Returns:
        (df, league_avgs_df): all players' rows from result set 0 stacked
        with a fresh integer index, and result set 1 of the FIRST payload.
    """
    t0 = time.time()

    first_sets = all_shot_data[0]['resultSets']

    # Result set 1 is taken from the first payload only.
    averages = first_sets[1]
    league_avgs_df = pd.DataFrame.from_records(averages['rowSet'], columns=averages['headers'])

    # Column names of the first payload are reused for every player.
    shot_columns = first_sets[0]['headers']

    frames = [
        pd.DataFrame.from_records(entry['resultSets'][0]['rowSet'], columns=shot_columns)
        for entry in all_shot_data
    ]

    df = pd.concat(frames, ignore_index=True)
    print('Total Runtime: {} seconds.'.format(time.time()-t0))

    return df, league_avgs_df
61 |
# Scrape the 2014-15 season for every selected player, then flatten the raw
# responses into a shot-level DataFrame and a league-averages DataFrame.
all_shots_1415 = get_all_players_shot_data(player_ids, '2014-15')
shots_1415_df, lg_avgs_1415 = convert_dict_to_df(all_shots_1415)

# Only the shot-level table is written to disk here.
shots_1415_df.to_csv('data/shots_1415.csv')
66 |
--------------------------------------------------------------------------------
/neural_net.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import os, itertools
5 |
6 | from sklearn.preprocessing import MinMaxScaler
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import confusion_matrix, classification_report
9 |
10 | import keras
11 | from keras.layers import Dense, Dropout, LSTM
12 | from keras.models import Sequential, load_model
13 | from keras.callbacks import EarlyStopping, TensorBoard
14 |
15 | #####LOAD DATA#####
# Manual toggle: flip to True to rebuild the scaled feature matrix from the CSV
# and cache it (with the target vector) as .npy files.
if False:
    df = pd.read_csv('data/final_df.csv', index_col=0)

    # Every column NOT listed here becomes a model input; the target
    # ('shot_made_flag') is dropped from X and kept separately as y.
    X = df.drop(columns=['name', 'pos', 'age', 'player_id', 'team_name', 'team_id', 'game_date', 'game_id', 'game_event_id', 'season', 'minutes_remaining', 'seconds_remaining', 'action_type', 'shot_type', 'opponent','opp_id',
                         'defender_name', 'htm', 'vtm', 'defender_id', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'Heave', 'dribbles','shot_distance', 'shot_made_flag'])
    y = np.array(df.shot_made_flag)

    # NOTE(review): the scaler is fit on the full dataset, i.e. before the
    # train/test split below, so test-set ranges influence the scaling.
    minmax_scale = MinMaxScaler()
    X = minmax_scale.fit_transform(X)

    np.save('X_y_arrays/X_', X)
    np.save('X_y_arrays/y_', y)
#####SPLIT DATA INTO TRAIN/TEST SETS#####
# Manual toggle: load the cached arrays and make a fixed-seed 80/20 split.
if True:
    X = np.load('X_y_arrays/X_.npy')
    y = np.load('X_y_arrays/y_.npy')

    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=23,test_size=.2)
34 |
35 | #####HELPER FUNCTION TO PLOT CM#####
def plot_confusion_matrix(cm, name, cmap=plt.cm.Blues):
    """Render a 2x2 confusion matrix, save it as a PNG and show it.

    Args:
        cm: 2-D array of counts; rows are drawn on the 'Actual' axis and
            columns on the 'Predicted' axis.
        name: used in the plot title and as the output file stem.
        cmap: matplotlib colormap passed to imshow.
    """
    class_labels = ['Miss', 'Make']

    # Base image
    figure = plt.figure(figsize=(6, 6))
    plt.imshow(cm, cmap)

    # Title and axis labels
    plt.title(name + ' - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # One tick per class on both axes
    positions = np.arange(0,2)
    plt.xticks(positions, class_labels)
    plt.yticks(positions, class_labels)

    # Cells above this value get white text so they stay readable on dark fill
    cutoff = 0.75 * cm.max()

    # Colour legend
    plt.colorbar()

    # Write the count into every cell
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            if value <= cutoff:
                text_color = "black"
            else:
                text_color = "white"
            plt.text(col, row, value,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    figure.savefig('./models/nn/cm/' + name + '.png', bbox_inches='tight', dpi=480)
    plt.show()
64 |
def plot_val_loss_acc(model, name):
    """Plot training/validation loss and accuracy per epoch, save and show.

    Args:
        model: fitted model whose `history.history` dict contains the keys
            'loss', 'val_loss', 'acc' and 'val_acc'.
        name: used in the plot title and as the output file stem.
    """
    history = model.history.history

    # (history key, matplotlib format string, legend label)
    curve_specs = [
        ('loss', 'g', 'Training loss'),
        ('val_loss', 'g.', 'Validation loss'),
        ('acc', 'r', 'Training acc'),
        ('val_acc', 'r.', 'Validation acc'),
    ]
    # Look up every series before drawing anything.
    curves = [(history[key], fmt, label) for key, fmt, label in curve_specs]

    epochs_ = range(1, len(curves[0][0]) + 1)
    for values, fmt, label in curves:
        plt.plot(epochs_, values, fmt, label=label)

    plt.title(name + ' - Training & validation loss / accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('models/nn/val_loss_acc/' + name + '.png', bbox_inches='tight')
    plt.show()
84 |
85 | #####NEURAL NETWORK GENERATOR#####
def build_nn__(X_train, X_test, y_train, y_test, activation, epochs, batch_size, name, nodes, dropout):
    """Build, train, save and evaluate a dense binary classifier.

    Args:
        X_train, X_test: 2-D feature arrays; X_train.shape[1] sets the width
            of the first layer.
        y_train, y_test: binary target arrays.
        activation: activation used for every layer except the output.
        epochs: maximum number of epochs (early stopping may end sooner).
        batch_size: mini-batch size for fitting.
        name: run label used in plot titles and saved file names.
        nodes: sequence of hidden-layer widths, one Dense layer per entry.
        dropout: sequence parallel to `nodes`; where an entry equals True a
            Dropout(0.2) layer follows the corresponding Dense layer.

    Returns:
        The fitted Sequential model.
    """
    adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

    nn_ = Sequential()

    #First layer
    nn_.add(Dense(X_train.shape[1], input_shape=(X_train.shape[1],), activation=activation))
    #Iterate through number of nodes and add hidden layers
    for i, node in enumerate(nodes):
        nn_.add(Dense(node, activation=activation))
        if dropout[i]==True:
            nn_.add(Dropout(0.2))
    #Output layer, use 'sigmoid' activation for binary classfication
    nn_.add(Dense(1, activation='sigmoid'))

    #Show NN summary
    nn_.summary()
    #Compile model
    nn_.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])

    #Add early stopping and tensorboard callbacks
    early_stopping = EarlyStopping(monitor='val_loss', min_delta = 0.001, patience = 15, verbose=1, mode='auto', baseline=None)
    tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, embeddings_data=None)

    #Fit model
    nn_.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, verbose = 1, validation_split=0.1, callbacks = [early_stopping, tensorboard])

    plot_val_loss_acc(nn_, activation + '_' + name)

    nn_.save('./models/nn/' + name + '_' + activation +'_' + str(epochs) + '_' + str(batch_size) + '_' + str(len(nodes)) + '_' + '_'.join([str(i) for i in nodes]) + '.h5')

    print(nn_.evaluate(X_test, y_test))

    # Predict once and reuse; flatten to 1-D for the metric functions.
    y_pred = nn_.predict_classes(X_test).ravel()

    # scikit-learn expects (y_true, y_pred): rows are then the actual class and
    # columns the predicted class, matching the axes of plot_confusion_matrix.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    plot_confusion_matrix(cm, activation + '_' + name)

    print('Test Set Classification Report')
    # Same argument order here, otherwise precision and recall are swapped.
    print(classification_report(y_test, y_pred, target_names=['Miss','Make']))
    return nn_
127 |
# Train and evaluate one configuration: ReLU activations, eight entries in
# `nodes` (one Dense layer each) and dropout disabled for all of them.
nn = build_nn__(X_train, X_test, y_train, y_test, activation='relu', epochs=50, batch_size=32, name='16th_run_101', nodes=[128,128,64,64,32,32,16,8], dropout=[False, False, False, False, False, False, False, False])
129 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NBA Shot Analysis
2 |
3 | ## Goal
4 | Build a classification model to predict whether an NBA shot will go in or not, and create visualizations to help general managers, coaches, and players identify shooting patterns, eliminate bad shots, and optimize their strategy to increase shooting efficiency.
5 |
6 | ## ETL
7 | I gathered my data from three sources:
8 | - Shot location data scraped from stats.nba.com (see my blog post for more detail)
9 | - Player tracking data from nbasavant.com
10 | - Defensive stats from basketball-reference
11 |
12 | Since the NBA stopped providing tracking data such as the number of dribbles, and defender distance in the middle of the 2016 season, I focused my project on the 2014-15 season. I gathered data on over 200,000 shots, with features including, but not limited to:
13 | - Shot distance, (x,y) coordinates, and shot zone
14 | - Touch time and number of dribbles
15 | - Name and distance of the closest defender
16 | - Game context stats such as shot clock remaining, period, game clock
17 | - Shot type (jump shot, dunk, etc.)
18 |
19 | I wanted to add more context to each shot, so I added advanced defensive stats for each defender (Block %, Defensive Win Shares/48, Defensive Box Score Plus Minus) and team (Defensive Rating).
20 |
21 | The data I gathered had two different zone breakdowns, one which detailed the directional area of the court (left, right or center) and the other which detailed a more precise location (paint, corner 3, etc.). I combined these into 15 zones, as seen below, and for every player I calculated their Field Goal % (FG%) in each zone so that my model would have a better understanding of the quality of the shot.
22 |
23 |
24 |
25 | I have never been a fan of the argument that momentum impacts basketball games, and have often argued against the concept of a "hot hand" which posits that a player is more likely to hit a shot if they have hit consecutive prior shots. In an attempt to disprove this hypothesis, I engineered new features that detailed whether the shooter has scored their previous 1, 2, and 3 shots. My models found that hitting prior shots did not have a significant impact on whether a player will score their next shot.
26 |
27 | ## Visualizations
28 | I wanted to create a wide range of visualizations that would show the frequency and efficiency of players' and teams' shots.
29 |
30 | #### Binned Shot Chart
31 | The first visualization I made is a binned shot chart that breaks the court down into equally sized hexes and groups nearby shots into bubbles, with the size determined by frequency and color by FG%. The color scale differed for two's and three's to account for the point value of each shot. I also added the player's image and some additional stats to the chart. In my dashboard, there is a dropdown where you can select any player, and there is also an option to change the bubble size depending on if you want to see a more precise or broad shot chart.
32 |
33 |
34 |
35 | I made similar charts for each team, where you can get a strong sense of their shooting efficiency and frequency distribution.
36 |
37 |
38 |
39 | #### Frequency Shot Heatmap
40 | In order to get a better sense of where players and teams are shooting from, disregarding efficiency, I designed a heatmap to show the locations where they most frequently shoot from, complete with a dropdown that allows you to select any player or team.
41 |
42 |
43 |
44 | #### FG Frequency Bar Plot
45 | To visualize how the league distributes its shots, I added an interactive bar plot to my dashboard that shows FG% and the number of shots for a given feature that can be selected from a dropdown.
46 |
47 |
48 |
49 | #### FG Percentage Bar Plot
50 | To visualize FG% without focusing on frequency, I built an interactive bar plot that shows leaguewide FG% and the number of shots for a range of features that can be selected from a dropdown.
51 |
52 |
53 |
54 | #### Team Points Per Shot Heatmap Matrix
55 | I wanted to compare how teams perform in different contexts, so I created a heatmap matrix that helps visualize which teams under- and overperform in certain aspects. The color of each box is determined by the team's points per shot (PPS) given the selected feature/context. This gives teams a better sense of where they need to improve and how they stack up among the rest of the league.
56 |
57 |
58 |
59 | ## Machine Learning Models
60 | I trained 6 different machine learning classification models to predict whether a given shot would go in. The models I used were the following:
61 | - Logistic Regression
62 | - Random Forest
63 | - Gradient Boosting
64 | - AdaBoost
65 | - XGBoost
66 | - Neural Network
67 |
68 | For each model, I went through a cross-validation process to help narrow down my feature set into only the most important ones that did not show signs of multicollinearity with other included features. I ultimately narrowed down my initial set of over 20 features to the following 6:
69 | - Shot Distance
70 | - Zone FG%
71 | - Defensive Win Shares per 48 Minutes
72 | - Defender Distance
73 | - Touch Time
74 | - Shot Clock Remaining
75 |
76 | ###### Feature Importances (Gradient Boosting Classifier)
77 |
78 |
79 | Due to the inconsistency in scale of my numeric features (FG% is a decimal but shot distance is measured in feet), I used Scikit-Learn's MinMaxScaler to normalize and vectorize my data. My cross-validation process included hyperparameter tuning for each of my models by running a grid search with Stratified Kfold splits to ensure that the class balance remained consistent across all splits.
80 | For the Neural Network, I used one hidden layer that contained 50 nodes, 'relu' activation due to the lack of negative values, and the 'adam' optimizer to obtain my best results.
81 |
82 | ###### ROC curves
83 |
84 |
85 |
86 |
87 | ###### Confusion Matrix Comparisons (left: Logistic Regression, center: Gradient Boosting, right: Neural Network)
88 |
89 |
90 | My best performing model depends on how a team values the bias/variance tradeoff and whether they would prefer to minimize false negatives (predicting a miss when it's actually a make) or false positives (predicting a make when it's in fact a miss). A more aggressive team would prefer the Neural Network, which only recommended not to shoot when it was extremely confident the shot would miss, but often recommended the player should shoot, albeit with less than a 40% accuracy. An aggressive team would be fine with this model because it limited false negatives and gave the team more chances to score.
91 |
92 | On the other hand, a more conservative team might prefer the Gradient Boosting model, which correctly classified makes with a much higher accuracy, yet only recommended a shot ~30% of the time. It would likely lead to a higher FG%, but limits the potential scoring opportunities by recommending a team take fewer shots. The Logistic Regression model is far more balanced, sacrificing a lower overall accuracy for better precision and recall.
93 |
94 | ###### Model Results
95 |
96 |
97 | In addition to my individual models, I built a stacked ensemble model that trained the XGBoost, Random Forest, and AdaBoost classifiers, and then trained a Gradient Boosting model on their outputs. This would, in theory, give less biased predictions by weighing multiple models; however, its results were unfortunately worse than my single layer models, so I discarded it.
98 |
99 | ## Shot Recommender
100 | For each player, I built a recommender system that outputs certain zones where the player should shoot more or less frequently from. The concept is based on the player's PPS relative to the league average in each zone. A player who has a high expected PPS relative to the league average in a zone would be recommended to shoot there more frequently. Conversely, a player who shoots poorly in a zone would be recommended to shoot less. In the future, I want to tune this recommender by accounting for the player's frequency of shots in each zone, so that it does not recommend a player shoot more in a zone that already contains a high percentage of their total shots.
101 | ###### Recommender Output
102 |
103 |
104 | ## Next Steps
105 | - Adjust the color scale of binned plots to display efficiency relative to the league average, either in terms of FG% or PPS
106 | - Tune the shot recommender to provide ideal shot distributions
107 | - Classify 2s and 3s differently in my models to see if certain models predict one shot type with higher accuracy than others
108 | - Cluster similarly skilled shooters and recommend an optimal shooting lineup that covers each shot zone
109 | - Host the project online using Dash and Flask instead of the Jupyter Notebook dashboard
110 |
111 | ## Credits
112 | * Kirk Goldsberry for inspiring me to work on this project
113 | * Savvas Tjortjoglou for his court dimensions
114 |
--------------------------------------------------------------------------------
/plotly_viz.py:
--------------------------------------------------------------------------------
1 | ############################### IMPORTS ###############################
2 | if True:
3 | import plotly
4 | import plotly.plotly as py
5 | import plotly.graph_objs as go
6 | plotly.offline.init_notebook_mode(connected=True)
7 |
8 | import matplotlib
9 | import matplotlib.pyplot as plt
10 | import seaborn as sns
11 | import numpy as np
12 | import pandas as pd
13 | pd.set_option('display.max_columns',100)
14 |
15 | from court import court_shapes
16 |
17 | import warnings
18 | warnings.filterwarnings('ignore')
19 |
20 | import itertools, math, time, re, pickle
21 |
22 | ############################## LOAD DATA ##############################
# Load the shot table and the two zone tables; in each file the first CSV
# column is used as the DataFrame index.
df = pd.read_csv('data/clean_df_1415.csv',index_col=0)
zone_ids = pd.read_csv('data/zone_ids.csv',index_col=0)
zone_fg_pct = pd.read_csv('data/zone_fg_pct.csv',index_col=0)
26 |
27 | ############################## CLEANING DATA ############################
def basic_cleaning(df):
    """Cap outliers in place and add a points-per-shot column.

    Values beyond each bound are replaced by the bound itself:
    period <= 5, 0 <= touch_time <= 24, defender_distance <= 10,
    shot_distance <= 40, blk_pct <= 10, dbpm <= 5.5.
    A new 'pps' column is set to shot_type * shot_made_flag.

    Args:
        df: shot DataFrame containing the columns named above; modified in
            place.

    Returns:
        None.
    """
    # Single-step .loc assignment: chained `df.col[mask] = value` may write to
    # a temporary copy and is a no-op under pandas copy-on-write.
    df.loc[df['period'] > 5, 'period'] = 5
    df.loc[df['touch_time'] < 0, 'touch_time'] = 0
    df.loc[df['touch_time'] > 24, 'touch_time'] = 24
    #df.touch_time=round(df.touch_time*4)/4
    df.loc[df['defender_distance'] > 10, 'defender_distance'] = 10
    #df.shot_clock[df.shot_clock>3] = round(df.shot_clock[df.shot_clock>3]*4)/4
    df.loc[df['shot_distance'] > 40, 'shot_distance'] = 40
    df.loc[df['blk_pct'] > 10, 'blk_pct'] = 10
    df.loc[df['dbpm'] > 5.5, 'dbpm'] = 5.5
    # presumably shot_type is the numeric point value (2 or 3) - TODO confirm
    df['pps'] = df['shot_type'] * df['shot_made_flag']
39 | #basic_cleaning(df)
40 |
41 | ######################################################################
42 | ######################################################################
43 | ###########################--SHOT CHARTS--############################
44 | ######################################################################
45 | ######################################################################
46 |
47 | ######################--DRAW PLAYER SHOT CHART--######################
def draw_shot_chart(name):
    """Draw a make/miss scatter shot chart for one player on the court outline.

    Args:
        name: value matched exactly against the `name` column of the
            module-level `df`.
    """
    player = df[df.name==name]
    # Filter once per outcome instead of re-filtering for x, y and hover text.
    missed = player[player.shot_made_flag == 0]
    made = player[player.shot_made_flag == 1]

    missed_shot_trace = go.Scattergl(
        x = missed['x'],
        y = missed['y'],
        mode = 'markers',
        # Was labelled 'Make', which gave both legend entries the same text.
        name = 'Miss',
        marker= dict(color='blue', symbol='x', size=8, line={'width':1}, opacity=0.7),
        text = [str(sd) for sd in missed['action_type']],
        hoverinfo = 'text'
    )
    made_shot_trace = go.Scattergl(
        x = made['x'],
        y = made['y'],
        mode = 'markers',
        name='Make',
        marker= dict(color='red', symbol='circle', size=8, line={'width':1}, opacity=0.7),
        text = [str(sd) for sd in made['action_type']],
        hoverinfo = 'text'
    )

    data = [missed_shot_trace, made_shot_trace]
    layout = go.Layout(
        title= name + ' Shot Chart 2014-2015',
        showlegend =True,
        xaxis={'showgrid':False, 'range':[-250,250]},
        yaxis={'showgrid':False, 'range':[-47.5,500]},
        height = 600,
        width = 650,
        shapes=court_shapes)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename = name + ' Shot Chart')
82 |
83 | ########################--GROUPED SHOT CHART--########################
def grouped_plot(feature):
    """Scatter every shot in the module-level `df`, coloured by `feature`.

    One trace is drawn per distinct value of `feature`. The fixed palette is
    cycled, so more groups than palette entries no longer raises IndexError.

    Args:
        feature: name of the column in `df` to group by.
    """
    color_list = ['aliceblue', 'aqua', 'steelblue','violet', 'blue',
                  'blueviolet', 'brown', 'cadetblue',
                  'chartreuse', 'darkgreen', 'darkmagenta', 'tomato',
                  'gold', 'red', 'slategray']

    data = []
    for position, (group_name, group_rows) in enumerate(df.groupby(feature)):
        data.append(go.Scattergl(
            x = group_rows.x,
            y = group_rows.y,
            mode = 'markers',
            name = group_name,
            marker= dict(symbol='circle', size=7,
                         line={'width':1}, opacity=0.7,
                         # wrap around when groups outnumber palette entries
                         color=color_list[position % len(color_list)]),
            text = group_name,
            hoverinfo = 'text')
        )

    layout = go.Layout(
        title='Shot Distribution by ' + feature.title(),
        showlegend =True,
        xaxis={'showgrid':False, 'range':[-250,250]},
        yaxis={'showgrid':False, 'range':[-47.5,500]},
        height = 600,
        width = 750,
        shapes=court_shapes)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename = 'Shot Zone Breakdown')
118 |
119 | ########################--FREQUENCY BAR PLOT--########################
def freq_bar_plots(df, feature, round_=False):
    """Stacked bar chart of make and miss counts per value of `feature`.

    Each bar segment is annotated with the make (or miss) percentage, and a
    thin line traces the make counts.

    Args:
        df: shot DataFrame with `feature` and `shot_made_flag` columns; it is
            copied, never modified.
        feature: column to break the counts down by.
        round_: when True, `feature` is rounded before counting.
    """
    working = df.copy()
    if round_==True:
        working[feature] = round(working[feature])

    counts = pd.crosstab(working[feature], working.shot_made_flag, margins=True)
    counts['fg_pct'] = round(counts[1]/counts['All'],3)

    # Remove the 'All' margin column and the trailing margin row.
    tab = counts.drop(columns='All')[:-1]

    def as_percent(fraction):
        return str(round(fraction*100,1)) + '%'

    make_text = [as_percent(pct) for pct in tab.fg_pct]
    miss_text = [as_percent(1-pct) for pct in tab.fg_pct]

    def count_bar(values, label, annotations, font_size, fill):
        return go.Bar(
            x=tab.index,
            y=values,
            name=label,
            text=annotations,
            textposition='inside',
            textfont=dict(family='sans serif', size=font_size, color='white'),
            marker=dict(color=fill),
            opacity=0.75
        )

    makes_bar = count_bar(tab[1], 'Makes', make_text, 12, 'red')
    misses_bar = count_bar(tab[0], 'Misses', miss_text, 10, 'blue')

    makes_line = go.Scatter(
        x=tab.index,
        y=tab[1],
        mode='markers+lines',
        name='# Makes',
        hoverinfo='skip',
        line=dict(color='black', width=.75)
    )

    x_axis = dict(
        automargin=True,
        autorange=True,
        ticks='',
        showticklabels=True,
        #tickangle=25,
        title=feature.replace('_',' ').title()
    )
    y_axis = dict(
        automargin=True,
        ticks='',
        showticklabels=True,
        title='# of Shots'
    )
    layout = go.Layout(
        barmode='stack',
        title='FG% by ' + feature.title().replace('_',' '),
        showlegend =True,
        xaxis=x_axis,
        yaxis=y_axis
    )

    fig = go.Figure(data=[makes_bar, misses_bar, makes_line], layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
190 |
191 | ########################--PERCENTAGE BAR CHART--########################
def pct_bar_plots(feature, dataframe, round_=False, player=None, team=None):
    """Stacked bar chart of make/miss share per value of `feature`.

    Args:
        feature: column to break the percentages down by.
        dataframe: shot DataFrame with `feature`, `shot_made_flag`, `name`
            and `team_name` columns.
        round_: when True, work on a copy with `feature` rounded.
        player: optional player filter; compared as `player.title()`.
        team: optional team filter, used only when `player` is falsy;
            compared as `team.title()`.
    """
    if round_==True:
        source = dataframe.copy()
        source[feature] = round(source[feature])
    else:
        source = dataframe

    feature_label = feature.title().replace('_',' ')
    # NOTE(review): .title() lower-cases inner capitals, so names stored with
    # mixed case would not match - confirm against the data.
    if player:
        subset = source[source.name==player.title()]
        title = player.title() + ' - FG% by ' + feature_label
    elif team:
        subset = source[source.team_name==team.title()]
        title = team.title() + ' - FG% by ' + feature_label
    else:
        subset = source
        title = 'FG% by ' + feature_label

    # margins=True adds an 'All' row, which is kept and plotted as its own bar.
    shares = pd.crosstab(subset[feature], subset.shot_made_flag, margins=True)
    shares['pct_made'] = shares[1]/shares.All
    shares['pct_missed'] = 1-shares.pct_made

    def as_percent(fraction):
        return str(round(fraction*100,1)) + '%'

    made_text = [as_percent(pct) for pct in shares.pct_made]
    missed_text = [as_percent(pct) for pct in shares.pct_missed]

    def share_bar(values, label, annotations, fill):
        return go.Bar(
            x=shares.index,
            y=values,
            name=label,
            text=annotations,
            textposition='auto',
            textfont=dict(family='sans serif', size=12, color='white'),
            marker=dict(color=fill),
            opacity=0.75
        )

    bars = [
        share_bar(shares.pct_made, 'Makes', made_text, 'red'),
        share_bar(shares.pct_missed, 'Misses', missed_text, 'blue'),
    ]
    layout = go.Layout(
        barmode='stack',
        title=title,
        showlegend =True,
    )

    fig = go.Figure(data=bars, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
253 |
254 | ############################--PPS HEATMAP--#############################
255 | #FIX FUNCTION - CHANGE ZONE TO FEATURE
def pps_heatmap(df, feature):
    """Heatmap of mean `pps` per team (rows) and `feature` value (columns).

    Args:
        df: shot DataFrame with `team_name`, `pps` and `feature` columns.
        feature: column whose values form the heatmap's x axis.
    """
    pps_tab = pd.crosstab(
        df.team_name, df[feature],
        values=df.pps, aggfunc='mean', margins=False,
    ).fillna(0)

    teams = pps_tab.index
    # One single-row 2-D array per team.
    # NOTE(review): this makes z three levels deep; confirm the heatmap
    # renders as intended.
    z_rows = []
    for position in range(len(teams)):
        z_rows.append(np.array(pps_tab[teams == teams[position]]))

    # Label each row with the last word of the team name.
    short_names = [team.split(' ')[-1] for team in teams]

    team_heatmap = go.Heatmap(z=z_rows, x=pps_tab.columns, y=short_names)

    layout = go.Layout(
        title='Points Per Shot Heatmap',
        xaxis = dict(ticks='', nticks=len(pps_tab.columns)),
        yaxis = dict(ticks='', nticks=len(teams)),
    )

    fig = go.Figure(data=[team_heatmap], layout=layout)
    plotly.offline.iplot(fig, filename='labelled-heatmap')
272 |
273 | #############################--PIE CHART--#############################
def feature_pie_charts(feature):
    """Pie chart of how often each value of `feature` occurs in the module-level `df`.

    Args:
        feature: name of the column in `df` to count.
    """
    # Take labels and values from the same value_counts() result so each slice
    # is paired with its own count. unique() returns appearance order while
    # value_counts() sorts by frequency, so mixing the two mislabels slices.
    counts = df[feature].value_counts()
    labels = counts.index
    values = counts.values
    colors = ['#FEBFB3', '#E1396C', '#005eff', '#D0F9B1']

    trace = go.Pie(labels=labels, values=values,
                   hoverinfo='label+percent', textinfo='value+percent',
                   textfont=dict(size=20),
                   marker=dict(colors=colors,
                               line=dict(color='#000000', width=1)))

    plotly.offline.iplot([trace], filename='styled_pie_chart')
286 |
287 | ##########################--SHOT FREQ HEATMAP--#########################
def shot_freq_heatmap(name):
    """Overlay a 2-D shot-frequency histogram with make/miss markers.

    Args:
        name: value matched exactly against the `name` column of the
            module-level `df`.
    """
    player = df[df.name==name]
    made = player[player.shot_made_flag == 1]
    missed = player[player.shot_made_flag == 0]

    x_make, y_make = made['x'], made['y']
    x_miss, y_miss = missed['x'], missed['y']

    def marker_trace(xs, ys, label, symbol, color):
        return go.Scatter(
            x=xs,
            y=ys,
            mode='markers',
            name=label,
            showlegend=True,
            marker=dict(
                symbol=symbol,
                opacity=0.7,
                color=color,
                size=4,
                line=dict(width=1),
            )
        )

    makes = marker_trace(x_make, y_make, 'Make', 'circle', 'green')
    misses = marker_trace(x_miss, y_miss, 'Miss', 'x', 'yellow')

    # The histogram counts every attempt: makes followed by misses.
    density = go.Histogram2d(
        x=np.concatenate([x_make, x_miss]),
        y=np.concatenate([y_make, y_miss]),
        zmax=40,
        zmin=0,
        # nbinsx=20,
        # nbinsy=20,
        zsmooth='best',
        autobinx=True,
        autobiny=True,
        reversescale=False,
        opacity=.75,
        #zauto=True,
        #autocolorscale=True,
    )

    def court_axis(upper, lower):
        return dict(ticks='', showgrid=False, zeroline=False, nticks=20, range=[lower, upper])

    layout = go.Layout(
        xaxis=court_axis(250, -250),
        yaxis=court_axis(450, -47.5),
        autosize=False,
        height=600,
        width=750,
        hovermode='closest',
        shapes= court_shapes,
        title= name + ' - Shot Frequency',
        showlegend=True,
        legend=dict(x=1.2, y=1),
    )

    fig = go.Figure(data=[density, makes, misses], layout=layout)

    plotly.offline.iplot(fig)
360 |
--------------------------------------------------------------------------------
/shot_chart_viz.py:
--------------------------------------------------------------------------------
1 | ############################### IMPORTS ###############################
2 | if True:
3 | import requests, time, itertools, math, shutil, matplotlib
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | # %matplotlib inline
7 | import seaborn as sns
8 | import numpy as np
9 |
10 | from court import court_shapes
11 |
12 | pd.set_option('display.max_columns',40)
13 | import warnings
14 | warnings.filterwarnings('ignore')
15 |
16 | import ipywidgets as widgets
17 | from ipywidgets import interact
18 |
19 | import plotly
20 | import plotly.plotly as py
21 | import plotly.graph_objs as go
22 | plotly.offline.init_notebook_mode(connected=True)
23 |
24 | #####READ DATAFRAME#####
25 | df = pd.read_csv('final_df_1415.csv',index_col=0)
26 |
27 | #####DRAW PLAYER SHOT CHART (PLOTLY)#####
def draw_shot_chart(name):
    """Render a make/miss scatter shot chart for one player.

    Reads the module-level ``df`` and ``court_shapes``.

    Args:
        name: Value matched against ``df.name``.
    """
    shots = df[df.name == name]

    # Misses are added first, then makes.
    traces = []
    for flag, label, color in ((0, 'Miss', 'blue'), (1, 'Make', 'red')):
        subset = shots[shots.shot_made_flag == flag]
        traces.append(go.Scattergl(
            x=subset['x'],
            y=subset['y'],
            mode='markers',
            name=label,
            marker={'color': color, 'size': 5},
        ))

    chart_layout = go.Layout(
        title=name + ' Shot Chart 2014-2015',
        showlegend=True,
        xaxis={'showgrid': False, 'range': [-300, 300]},
        yaxis={'showgrid': False, 'range': [-100, 500]},
        height=600,
        width=650,
        shapes=court_shapes,
    )

    chart = go.Figure(data=traces, layout=chart_layout)
    plotly.offline.iplot(chart, filename=name + ' Shot Chart')
58 |
59 | #####DRAW TEAM SHOT CHART (PLOTLY)#####
def draw_team_sc(team):
    """Render a make/miss scatter shot chart for one team.

    Reads the module-level ``df`` and ``court_shapes``.

    Args:
        team: Value matched against ``df.team_name``.
    """
    team_shots = df[df.team_name == team]

    # Misses are added first, then makes.
    traces = []
    for flag, label, color in ((0, 'Miss', 'blue'), (1, 'Make', 'red')):
        subset = team_shots[team_shots['shot_made_flag'] == flag]
        traces.append(go.Scattergl(
            x=subset['x'],
            y=subset['y'],
            mode='markers',
            name=label,
            marker={'color': color, 'size': 5},
        ))

    chart_layout = go.Layout(
        title=team + ' Shot Chart 2014-2015',
        showlegend=True,
        xaxis={'showgrid': False, 'range': [-300, 300]},
        yaxis={'showgrid': False, 'range': [-100, 500]},
        height=600,
        width=650,
        shapes=court_shapes,
    )

    chart = go.Figure(data=traces, layout=chart_layout)
    plotly.offline.iplot(chart, filename=team + ' Shot Chart')
90 |
91 | #####DROPDOWNS#####
if False:
    # Disabled notebook controls: a player dropdown and a grid-size slider
    # wired to freq_shooting_plot through ipywidgets.interact. The guard keeps
    # this from running at import; freq_shooting_plot is defined further down.
    # team_dropdown = widgets.Dropdown(
    #     options = sorted(list(set(df.team_name))),
    #     value='New York Knicks',
    #     description='Team:',
    #     disabled=False,
    # )
    #
    # interact(draw_team_sc, team=team_dropdown);

    player_dropdown = widgets.Dropdown(
        options = sorted(list(set(df.name))),
        value='James Harden',
        description='Player:',
        disabled=False
    )

    # The slider value is passed to freq_shooting_plot as gridNum.
    grid_slider = widgets.IntSlider(
        value=15,
        min=5, max=60,
        step=5,
        description='Bubble Size:',
        disabled=False,
    )

    interact(freq_shooting_plot, player_name=player_dropdown, gridNum=grid_slider);
118 |
119 | #####DRAW COURT MATPLOTLIB#####
def draw_court(ax=None, color='black', lw=2, outer_lines=False):
    """Draw half-court markings on a matplotlib axes and hide its ticks.

    Args:
        ax: Axes to draw on; ``plt.gca()`` is used when omitted.
        color: Colour passed to every patch.
        lw: Line width passed to every patch.
        outer_lines: When truthy, also draw the rectangle bounding the court.

    Returns:
        The axes that was drawn on.
    """
    from matplotlib.patches import Circle, Rectangle, Arc

    target = plt.gca() if ax is None else ax
    stroke = dict(linewidth=lw, color=color)

    patches = [
        Circle((0, 0), radius=7.5, fill=False, **stroke),                   # hoop
        Rectangle((-30, -7.5), 60, -1, **stroke),                           # backboard
        Rectangle((-80, -47.5), 160, 190, fill=False, **stroke),            # outer box
        Rectangle((-60, -47.5), 120, 190, fill=False, **stroke),            # inner box
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, fill=False,
            **stroke),                                                      # top free throw
        Arc((0, 142.5), 120, 120, theta1=180, theta2=0, linestyle='dashed',
            **stroke),                                                      # bottom free throw
        Arc((0, 0), 80, 80, theta1=0, theta2=180, **stroke),                # restricted
        Rectangle((-220, -47.5), 0, 140, **stroke),                         # corner three a
        Rectangle((220, -47.5), 0, 140, **stroke),                          # corner three b
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **stroke),             # three arc
        Arc((0, 422.5), 120, 120, theta1=180, theta2=0, **stroke),          # center outer arc
        Arc((0, 422.5), 40, 40, theta1=180, theta2=0, **stroke),            # center inner arc
    ]
    if outer_lines:
        patches.append(Rectangle((-250, -47.5), 500, 470, fill=False, **stroke))

    for patch in patches:
        target.add_patch(patch)

    # Strip tick labels and tick marks from both axes.
    target.set_xticklabels([])
    target.set_yticklabels([])
    target.set_xticks([])
    target.set_yticks([])
    return target
162 |
163 | #####FIND PLAYER FG% FOR EACH HEX#####
def find_shootingPcts(shot_df, gridNum):
    """Compute per-hexbin field-goal percentage for a set of shots.

    Args:
        shot_df: DataFrame with ``x``, ``y`` and ``shot_made_flag`` columns.
        gridNum: ``gridsize`` handed to ``plt.hexbin``.

    Returns:
        ``(pct, hb_shot)``: the made/attempted ratio per bin, with 0/0 bins set
        to 0, and the hexbin collection built from all attempts.
    """
    # Only shots with y below 425.1 are binned.
    in_range = shot_df['y'] < 425.1
    made_in_range = (shot_df['shot_made_flag'] == 1) & in_range
    bounds = (-250, 250, 425, -50)

    # Each hexbin call draws on a figure that is closed straight away; only
    # the returned collections are needed.
    hb_shot = plt.hexbin(shot_df.x[in_range], shot_df.y[in_range],
                         gridsize=gridNum, extent=bounds)
    plt.close()
    hb_made = plt.hexbin(shot_df.x[made_in_range], shot_df.y[made_in_range],
                         gridsize=gridNum, extent=bounds, cmap=plt.cm.Reds)
    plt.close()

    pct = hb_made.get_array() / hb_shot.get_array()
    pct[np.isnan(pct)] = 0  # bins with no attempts: 0/0 -> 0
    return (pct, hb_shot)
181 |
182 | #####SCRAPE PLAYER IMAGE#####
def acquire_playerPic(player_id, zoom, offset=(-165,400)):
    """Download a player's headshot and wrap it as a matplotlib annotation box.

    Args:
        player_id: Series of player ids; the first unique value is used.
        zoom: Zoom factor passed to ``OffsetImage``.
        offset: Data coordinates at which the image is anchored.

    Returns:
        An ``AnnotationBbox`` holding the downloaded image.

    Raises:
        requests.HTTPError: If the image request returns an error status.
    """
    import os
    from matplotlib import offsetbox as osb

    ID = str(player_id.unique()[0])
    url = "http://stats.nba.com/media/players/230x185/"+ ID +".png"
    image_path = 'scraped_images/player_images/' + ID + '.png'

    # Create the cache folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(image_path), exist_ok=True)

    # Fail on HTTP errors before an error page is saved as a .png, bound the
    # wait with a timeout, and close the connection when done.
    with requests.get(url, stream=True, timeout=30) as pic:
        pic.raise_for_status()
        # Have urllib3 undo any gzip/deflate transfer encoding on the raw stream.
        pic.raw.decode_content = True
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(pic.raw, out_file)

    player_pic = plt.imread(image_path)
    img = osb.OffsetImage(player_pic, zoom)
    img = osb.AnnotationBbox(img, offset,xycoords='data',pad=0.0, box_alignment=(1,0), frameon=False)

    return img
198 |
199 | #####SCRAPE TEAM LOGO#####
def get_team_logo(team_acronym, zoom, offset=(-185,400)):
    """Download a team logo and wrap it as a matplotlib annotation box.

    Args:
        team_acronym: Team abbreviation used in the logo URL and file name.
        zoom: Zoom factor passed to ``OffsetImage``.
        offset: Data coordinates at which the image is anchored.

    Returns:
        An ``AnnotationBbox`` holding the downloaded logo.

    Raises:
        requests.HTTPError: If the logo request returns an error status.
    """
    import os
    from matplotlib import offsetbox as osb

    URL = 'https://www.nba.com/assets/logos/teams/primary/web/' + team_acronym + '.png'
    image_path = 'scraped_images/team_images/' + str(team_acronym) + '.png'

    # Create the cache folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(image_path), exist_ok=True)

    # Fail on HTTP errors before an error page is saved as a .png, bound the
    # wait with a timeout, and close the connection when done.
    with requests.get(URL, stream=True, timeout=30) as pic:
        pic.raise_for_status()
        # Have urllib3 undo any gzip/deflate transfer encoding on the raw stream.
        pic.raw.decode_content = True
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(pic.raw, out_file)

    team_pic = plt.imread(image_path)
    img = osb.OffsetImage(team_pic, zoom)
    img = osb.AnnotationBbox(img, offset,xycoords='data',pad=0.0, box_alignment=(1,0), frameon=False)

    return img
215 |
216 | #####COLOR MAP DICTIONARY#####
# Segment data for a LinearSegmentedColormap: per channel, a list of
# (position, value, value) stops at 0, 0.25, 0.5, 0.75 and 1.
cdict = {
    'blue': [(0.0, 0.6313725709915161, 0.6313725709915161), (0.25, 0.4470588266849518, 0.4470588266849518), (0.5, 0.29019609093666077, 0.29019609093666077), (0.75, 0.11372549086809158, 0.11372549086809158), (1.0, 0.05098039284348488, 0.05098039284348488)],
    'green': [(0.0, 0.7333333492279053, 0.7333333492279053), (0.25, 0.572549045085907, 0.572549045085907), (0.5, 0.4156862795352936, 0.4156862795352936), (0.75, 0.0941176488995552, 0.0941176488995552), (1.0, 0.0, 0.0)],
    'red': [(0.0, 0.9882352948188782, 0.9882352948188782), (0.25, 0.9882352948188782, 0.9882352948188782), (0.5, 0.9843137264251709, 0.9843137264251709), (0.75, 0.7960784435272217, 0.7960784435272217), (1.0, 0.40392157435417175, 0.40392157435417175)]}
mymap = matplotlib.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)
# mymap is rebound here to a map with stops #ff0000 (0), #ffff00 (.45) and
# #00ff00 (1); the shot-chart functions use this final value.
# NOTE(review): from_list looks like it builds a fresh colormap from the given
# stops, which would leave cdict and the line above without effect - confirm.
mymap = mymap.from_list('Color Map',[(0,'#ff0000'),(.45,'#ffff00'),(1,'#00ff00')])
223 |
224 | ####################CALCULATE SEASON STATS TO ADD TO CHART####################
def get_season_stats(player_name):
    """Summarise a player's shooting for display on the shot chart.

    Reads the module-level ``df``.

    Args:
        player_name: Value matched against ``df.name``.

    Returns:
        ``(stats, printout)``: a dict keyed by NUM_GAMES, FG_PCT, THREE_PT_PCT,
        EFFECTIVE_FG_PCT, POINTS_PER_SHOT and AVG_SHOT_DISTANCE, and a
        multi-line string formatting those values in that order.

    Raises:
        ValueError: If ``df`` holds no shots for ``player_name``.
    """
    player = df[df.name == player_name]
    if player.empty:
        raise ValueError('No shots found for player: {}'.format(player_name))

    made = player.shot_made_flag
    is_three = player.shot_type == 3

    stats = {}

    stats['NUM_GAMES'] = len(player.game_date.unique())
    # Plain means over the player's shots; no groupby over non-numeric columns.
    stats['FG_PCT'] = made.mean()
    # No three-point attempts reports 0% rather than NaN.
    stats['THREE_PT_PCT'] = made[is_three].mean() if is_three.any() else 0.0

    # eFG% = (2PM + 1.5 * 3PM) / FGA. Selecting by shot_type value rather than
    # by group position stays correct when only one shot type was attempted.
    twos = made[~is_three].sum()
    threes = made[is_three].sum() * 1.5
    stats['EFFECTIVE_FG_PCT'] = (twos + threes) / player.shape[0]

    stats['POINTS_PER_SHOT'] = round(player.pps.mean(), 3)
    stats['AVG_SHOT_DISTANCE'] = round(player.shot_distance.mean())

    # The placeholders rely on the insertion order of ``stats`` above.
    printout = """Games: {}\nFG: {:4.1%}\n3PT: {:4.1%}\nEFG: {:4.1%}\nPoints per Shot: {}\nAvg Shot Dist.: {} ft.""".format(*stats.values())

    return stats, printout
244 |
245 | ##################CALCULATE TEAM STATS TO ADD TO CHART########################
def get_team_stats(team):
    """Summarise a team's shooting for display on the shot chart.

    Reads the module-level ``df``.

    Args:
        team: Value matched against ``df.team_name``.

    Returns:
        ``(stats, printout)``: a dict keyed by FG_PCT, THREE_PT_PCT,
        EFFECTIVE_FG_PCT, POINTS_PER_SHOT and AVG_SHOT_DISTANCE, and a
        multi-line string formatting those values in that order.

    Raises:
        ValueError: If ``df`` holds no shots for ``team``.
    """
    team_df = df[df.team_name == team]
    if team_df.empty:
        raise ValueError('No shots found for team: {}'.format(team))

    made = team_df.shot_made_flag
    is_three = team_df.shot_type == 3

    stats = {}

    # Plain means over the team's shots; no groupby over non-numeric columns.
    stats['FG_PCT'] = made.mean()
    # No three-point attempts reports 0% rather than NaN.
    stats['THREE_PT_PCT'] = made[is_three].mean() if is_three.any() else 0.0

    # eFG% = (2PM + 1.5 * 3PM) / FGA. Selecting by shot_type value rather than
    # by group position stays correct when only one shot type was attempted.
    twos = made[~is_three].sum()
    threes = made[is_three].sum() * 1.5
    stats['EFFECTIVE_FG_PCT'] = (twos + threes) / team_df.shape[0]

    stats['POINTS_PER_SHOT'] = round(team_df.pps.mean(), 3)
    stats['AVG_SHOT_DISTANCE'] = round(team_df.shot_distance.mean())

    # The placeholders rely on the insertion order of ``stats`` above.
    printout = """FG: {:4.1%}\n3PT: {:4.1%}\nEFG: {:4.1%}\nPoints per Shot: {}\nAvg Shot Dist.: {} ft.""".format(*stats.values())

    return stats, printout
263 |
264 | #################PLOT PLAYER FREQUENCY SHOT CHART (MATPLOTLIB)################
def freq_shooting_plot(player_name,gridNum=25):
    """Plot a player's shot-frequency bubble chart coloured by FG%.

    Reads the module-level ``df`` and ``mymap`` and uses the sibling helpers
    ``find_shootingPcts``, ``draw_court``, ``acquire_playerPic`` and
    ``get_season_stats``.

    Args:
        player_name: Value matched against ``df.name``.
        gridNum: Hexbin grid size; also sets the cap on bubble radius.

    Returns:
        The matplotlib axes holding the chart.
    """
    from matplotlib.patches import Circle

    plot_size = (12, 8)
    shot_df = df[df.name == player_name]

    # compute shooting percentage and # of shots
    (ShootingPctLocs, shotNumber) = find_shootingPcts(shot_df, gridNum)

    # draw figure and court
    fig = plt.figure(figsize=plot_size)
    cmap = mymap  # my modified colormap
    ax = plt.axes([0.1, 0.1, 0.8, 0.8])  # where to place the plot within the figure
    draw_court(outer_lines=False)
    plt.xlim(-250, 250)
    plt.ylim(400, -25)

    # draw player image; zoom is tied to the figure width. Built-in float
    # replaces the np.float alias, which current NumPy no longer provides.
    zoom = float(plot_size[0]) / (12.0 * 2)
    img = acquire_playerPic(shot_df.player_id, zoom)
    ax.add_artist(img)

    # draw circles: radius is the shot count of the bin, capped at 240/gridNum
    for i, shots in enumerate(ShootingPctLocs):
        restricted = Circle(shotNumber.get_offsets()[i], radius=shotNumber.get_array()[i],
                            color=cmap(shots), alpha=1, fill=True)
        if restricted.radius > 240/gridNum: restricted.radius = 240/gridNum
        ax.add_patch(restricted)

    # draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %')
    cb.set_ticks([0.0, 0.25, 0.5, 0.75, 1.0])
    cb.set_ticklabels(['0%','25%', '50%','75%', '100%'])

    ax.set_title(shot_df.name.unique()[0] +' - Shot Chart 2014-15')
    # plot season stats
    ax.text(135, 395, get_season_stats(player_name)[1])
    plt.show()
    return ax
308 |
309 | #################PLOT TEAM FREQUENCY SHOT CHART (MATPLOTLIB)#################
def team_freq_plot(team, gridNum=25):
    """Plot a team's shot-frequency bubble chart coloured by FG%.

    Reads the module-level ``df`` and ``mymap`` and uses the sibling helpers
    ``find_shootingPcts``, ``draw_court``, ``get_team_logo`` and
    ``get_team_stats``.

    Args:
        team: Value matched against ``df.team_name``.
        gridNum: Hexbin grid size; also sets the cap on bubble radius.
    """
    from matplotlib.patches import Circle

    figure_size = (12, 8)
    team_df = df[df.team_name == team]

    # Per-bin FG% and the hexbin collection holding bin centres and counts.
    (pct_by_bin, attempts) = find_shootingPcts(team_df, gridNum)

    # Figure, plot area and court markings.
    fig = plt.figure(figsize=figure_size)
    palette = mymap
    ax = plt.axes([0.1, 0.1, 0.8, 0.8])
    draw_court(outer_lines=False)
    plt.xlim(-250, 250)
    plt.ylim(400, -25)

    # Team logo, looked up from the home-team abbreviation of a home game.
    abbreviation = team_df.htm[team_df.is_home == 1].unique()[0]
    logo = get_team_logo(abbreviation, 1)
    ax.add_artist(logo)

    # One bubble per bin: radius is the shot count, capped at 240/gridNum.
    for idx, pct in enumerate(pct_by_bin):
        bubble = Circle(attempts.get_offsets()[idx], radius=attempts.get_array()[idx],
                        color=palette(pct), alpha=.95, fill=True)
        if bubble.radius > 240/gridNum:
            bubble.radius = 240/gridNum
        ax.add_patch(bubble)

    # Colour bar labelled in FG% steps of 25.
    bar_axes = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    colour_bar = matplotlib.colorbar.ColorbarBase(bar_axes, cmap=palette, orientation='vertical')
    colour_bar.set_label('Field Goal %')
    colour_bar.set_ticks([0.0, 0.25, 0.5, 0.75, 1.0])
    colour_bar.set_ticklabels(['0%','25%', '50%','75%', '100%'])

    ax.set_title(team_df.team_name.unique()[0] +' - Shot Chart 2014-15')
    # Team stats text block.
    ax.text(150, 395, get_team_stats(team)[1])
    plt.show()
351 |
--------------------------------------------------------------------------------
/shallow_ML_models.py:
--------------------------------------------------------------------------------
1 | ############################### IMPORTS ###############################
2 | if True:
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib
6 | import matplotlib.pyplot as plt
7 | import seaborn as sns
8 | import itertools, math, time, re, pickle
9 |
10 | import warnings
11 | warnings.filterwarnings('ignore')
12 |
13 | import plotly
14 | import plotly.plotly as py
15 | import plotly.graph_objs as go
16 | plotly.offline.init_notebook_mode(connected=True)
17 |
18 | from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, ShuffleSplit
19 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
20 | from sklearn.linear_model import LogisticRegression
21 | from sklearn.metrics import accuracy_score, auc, confusion_matrix, precision_score, recall_score, roc_curve, f1_score
22 | from sklearn.preprocessing import MinMaxScaler
23 |
24 | from xgboost import XGBClassifier
25 |
26 | from pactools.grid_search import GridSearchCVProgressBar
27 |
28 | ############################## LOAD DATA ##############################
if False:
    # Disabled: feature preparation from data/final_df.csv. It writes the same
    # output files as the enabled section that follows.
    df = pd.read_csv('data/final_df.csv', index_col=0)

    # Features are the columns left after dropping this list; the target is
    # shot_made_flag.
    X = df.drop(columns=['name', 'age', 'pos','player_id','team_id', 'opp_id', 'team_name', 'game_date', 'opponent', 'defender_name', 'game_id', 'action_type', 'season', 'htm', 'vtm', 'game_event_id', 'minutes_remaining', 'seconds_remaining',
                         'defender_id', 'shot_type', 'Heave', 'heave_pct', 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'Above Break 3', 'Corner 3', 'Mid Range', 'Paint', 'Restricted Area', 'C', 'L', 'R', 'dribbles', 'shot_distance', 'shot_made_flag'])
    y = np.array(df.shot_made_flag)

    # Pickle the feature names before scaling turns X into a bare array.
    X_col_names = X.columns
    with open('./X_y_arrays/X_column_names', 'wb') as x_col:
        pickle.dump(X_col_names, x_col)

    # The scaler is fitted on the full dataset, before any train/test split.
    minmax_scale = MinMaxScaler()
    X = minmax_scale.fit_transform(X)

    np.save('./X_y_arrays/X_shallow', X)
    np.save('./X_y_arrays/y_shallow', y)
45 |
46 | #new data
if True:
    # Enabled: feature preparation from final_df_1415.csv.
    df = pd.read_csv('final_df_1415.csv', index_col=0)
    # NOTE(review): both columns cast here also appear in the drop list below,
    # so the cast does not reach X - confirm it is still wanted.
    df[['zone_id', 'period']] = df[['zone_id', 'period']].astype('category')

    # Features are the columns left after dropping this list; the target is
    # shot_made_flag.
    X = df.drop(columns=['name', 'team_name', 'game_date', 'season', 'team_id','minutes_remaining', 'seconds_remaining', 'shot_made_flag', 'shot_type', 'opponent', 'x', 'y', 'defender_name', 'opp_id', 'game_id', 'game_event_id',
                         'player_id', 'shot_zone_basic', 'shot_zone_area', 'shot_zone_range', 'htm', 'vtm', 'pos', 'age', 'defender_id', 'zone', 'pps', 'zone_id', 'zone_minus_lg_avg', 'lg_zone_avg',
                         'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'dribbles', 'period', 'action_type', 'ts%', 'dbpm', '3par', 'usg%', 'blk_pct', 'def_rating'])
    y = np.array(df.shot_made_flag)

    # Pickle the feature names before scaling turns X into a bare array.
    X_col_names = X.columns
    with open('./X_y_arrays/X_column_names', 'wb') as x_col:
        pickle.dump(X_col_names, x_col)

    # The scaler is fitted on the full dataset, before any train/test split.
    minmax_scale = MinMaxScaler()
    X = minmax_scale.fit_transform(X)

    np.save('./X_y_arrays/X_shallow', X)
    np.save('./X_y_arrays/y_shallow', y)
65 | ################### SPLIT DATA INTO TRAIN/TEST SETS ###################
if True:
    # Reload the feature names and arrays written by the preparation step.
    # NOTE: pickle.load should only be used on files this project wrote itself.
    with open ('./X_y_arrays/X_column_names', 'rb') as fp:
        X_col_names = pickle.load(fp)

    X = np.load('./X_y_arrays/X_shallow.npy')
    y = np.load('./X_y_arrays/y_shallow.npy')

    # 80/20 split with a fixed seed; these module-level names are read by the
    # helper functions and model sections below.
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=23,test_size=.2)
74 |
75 | ########################## HELPER FUNCTIONS ##########################
def build_model(model, path, X_train, X_test, y_train, y_test, decision_function=True):
    """Fit a classifier, score the test set, pickle the model and report runtime.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and either
            ``decision_function`` or ``predict_proba``.
        path: Model folder name under ``./models/``; also the file-name prefix.
        X_train, X_test, y_train, y_test: Train/test features and labels.
        decision_function: When equal to True, scores come from
            ``decision_function``; otherwise from column 1 of ``predict_proba``.

    Returns:
        ``(model, test predictions, test scores, fpr, tpr)``.
    """
    started = time.time()

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    scores = (model.decision_function(X_test) if decision_function == True
              else model.predict_proba(X_test)[:, 1])

    fpr, tpr, _ = roc_curve(y_test, scores)

    # Persist the fitted model under a timestamped file name.
    target = './models/'+ path + '/' + str(path) + '_' + time.asctime().replace(' ', '_')
    with open(target, 'wb') as handle:
        pickle.dump(model, handle)

    print('Total Runtime: {} seconds'.format(time.time()-started))
    return model, predictions, scores, fpr, tpr
96 |
def plot_feature_importances(model, path):
    """Plot, save and show a horizontal bar chart of feature importances.

    Reads the module-level ``X`` (for the feature count) and ``X_col_names``.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        path: Model folder name under ``./models/``.
    """
    matplotlib.style.use('fivethirtyeight')

    positions = np.arange(X.shape[1])
    plt.figure(figsize=(10, 6))
    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, X_col_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Features")

    # Save under a timestamped file name before showing.
    out_file = './models/'+ path + '/feature_importances/' + time.asctime().replace(' ', '_') + '.png'
    plt.savefig(out_file)
    plt.show()
108 |
def plot_confusion_matrix(cm, path, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw, save and show an annotated confusion-matrix image.

    Reads the module-level ``y`` for the class labels.

    Args:
        cm: Square confusion matrix, indexed ``cm[actual, predicted]``.
        path: Model folder name under ``./models/``.
        title: Plot title.
        cmap: Colormap for the matrix image.
    """
    # Create the basic matrix.
    plt.imshow(cm, cmap)

    # Add title and axis labels.
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # Sort the class labels so tick order does not depend on set iteration
    # order; sklearn's confusion_matrix orders its rows/columns by sorted label.
    class_names = sorted(set(y))
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)

    # Cells above this value get white text so the count stays readable.
    thresh = cm.max()*.75

    # Add a side bar legend showing colors.
    plt.colorbar()
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Save output under a timestamped file name.
    plt.savefig('./models/'+ path + '/cm/' + time.asctime().replace(' ', '_') + '.png', bbox_inches='tight', dpi=480)
    plt.show()
135 |
def print_model_metrics(y_pred, y_score, path):
    """Plot the confusion matrix, then print and save test-set metrics.

    Reads the module-level ``y_test``.

    Args:
        y_pred: Predicted labels for the test set.
        y_score: Scores for the test set, used for the ROC curve.
        path: Model folder name under ``./models/``.
    """
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), path,
                          title='Confusion matrix', cmap=plt.cm.Blues)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    false_pos, true_pos, _ = roc_curve(y_test, y_score)
    area = auc(false_pos, true_pos)

    # (message template, value) pairs, printed and saved in this order.
    report = [
        ('Accuracy: {}', accuracy),
        ('Precision: {}', precision),
        ('Recall: {}', recall),
        ('F1 {}', f1),
        ('AUC: {}', area),
    ]
    for template, value in report:
        print(template.format(round(value, 4)))

    # Save the unrounded values under a timestamped file name.
    values = np.array([value for template, value in report])
    np.save('./models/'+ path + '/metrics/' + time.asctime().replace(' ', '_'), values)
156 |
def plot_roc_curve(fpr, tpr, path):
    """Plot, save and show a ROC curve against the chance diagonal.

    Args:
        fpr: False positive rates.
        tpr: True positive rates.
        path: Model folder name under ``./models/``.
    """
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})

    line_width = 2
    ticks = [i/20.0 for i in range(21)]  # 0.00, 0.05, ..., 1.00

    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks(ticks)
    plt.xticks(ticks)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    # Save under a timestamped file name before showing.
    out_file = './models/'+ path + '/roc_curves/' + time.asctime().replace(' ', '_') + '.png'
    plt.savefig(out_file, bbox_inches='tight', dpi=480)
    plt.show()
177 | ######################################################################
178 |
179 | ############################# GRID SEARCH ############################
def run_grid_search(model, path, param_grid, X, y, cv=3):
    """Grid-search ``model`` on ROC AUC and write the CV results to disk.

    Args:
        model: Estimator to tune.
        path: Model name used as the prefix of the results file.
        param_grid: Parameter grid handed to the search.
        X, y: Features and labels used for cross-validation.
        cv: Cross-validation setting forwarded to the search.

    Returns:
        ``(search_results, best_score, best_params)`` where ``search_results``
        is ``cv_results_`` as a DataFrame.
    """
    start = time.time()

    search = GridSearchCVProgressBar(model, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
    search.fit(X, y)

    # Fixed-point format; '{:.4}' switched to scientific notation on long runs.
    print("Total Runtime for Grid Search: {:.2f} seconds".format(time.time() - start))

    best_score = search.best_score_
    best_params = search.best_params_

    # The search is scored with 'roc_auc', so label the number as such rather
    # than as a testing accuracy percentage.
    print("Best CV ROC AUC: {:.4f}".format(best_score))
    print("\nOptimal Parameters: {}".format(best_params))

    search_results = pd.DataFrame.from_dict(search.cv_results_)

    # File name: <path>_<score digits>_<timestamp>.
    search_results.to_csv('./grid_search_results/'+ path + '_' + str(round(best_score,4)).replace('.','') + '_' + time.asctime().replace(' ', '_'))

    return search_results, best_score, best_params
199 | ######################################################################
200 |
201 | ########################## PARAMETER GRIDS ###########################
if True:
    # Hyper-parameter grids handed to run_grid_search, one per model family.

    # Logistic regression: penalty type and ten C values from 1 to 10^4.
    log_reg_param_grid = {'penalty':['l1','l2'],
                          'C': np.logspace(0, 4, 10)
                          }

    # Random forest. n_jobs is a single-value entry, so it is passed to every
    # candidate rather than searched over.
    rf_param_grid = {'n_estimators':[100,250],
                     'criterion':['gini', 'entropy'],
                     'min_samples_leaf':[2,5,10],
                     'min_samples_split':[2,5,10],
                     'n_jobs':[-1]
                     }

    # Gradient boosting.
    gb_param_grid = {'n_estimators':[50, 100, 250],
                     'learning_rate':[.01, .05, .1, 1],
                     'min_samples_leaf':[2, 5, 10],
                     'min_samples_split':[2, 5, 10],
                     'max_depth':[2, 5, 10]
                     }

    # XGBoost.
    xgb_param_grid = {'learning_rate':[.01, .05, .1, 1],
                      'n_estimators':[100, 250],
                      'max_depth':[2, 5, 10],
                      'min_child_weight': [1, 5, 10],
                      'gamma': [0.5, 1, 2],
                      }
227 |
228 | ######################################################################
229 |
230 |
231 | ######################## LOGISTIC REGRESSION #########################
if True:
    # Enabled section: runs at import. The single-model fit is commented out;
    # only the 10-fold grid search over log_reg_param_grid on the full X, y runs.
    # log_reg, log_y_preds, log_y_score, log_fpr, log_tpr = build_model(LogisticRegression(C=1, class_weight='balanced'),
    #                                                     'logreg', X_train, X_test, y_train, y_test)
    #
    # print_model_metrics(log_y_preds, log_y_score, 'logreg')
    # plot_roc_curve(log_fpr, log_tpr, 'logreg')

    log_reg_search_results, log_reg_best_score, log_reg_best_params = run_grid_search(LogisticRegression(random_state=23),'logreg', log_reg_param_grid, X, y, cv=10)
240 | ######################################################################
241 |
242 |
243 | ###################### RANDOM FOREST CLASSIFIER ######################
if False:
    # Disabled: fit a random forest on the train split, then report metrics,
    # ROC curve and feature importances. Scores come from predict_proba
    # (decision_function=False).
    rf, rf_y_preds, rf_y_score, rf_fpr, rf_tpr = build_model(RandomForestClassifier(n_estimators=500, criterion='gini', min_samples_leaf=10, min_samples_split=10, verbose=.5, class_weight='balanced', n_jobs=-1, random_state=23),
                                                 'rf', X_train, X_test, y_train, y_test, decision_function=False)

    print_model_metrics(rf_y_preds, rf_y_score, 'rf')
    plot_roc_curve(rf_fpr, rf_tpr, 'rf')
    plot_feature_importances(rf, 'rf')
251 |
252 | # rf_search_results, rf_best_score, rf_best_params = run_grid_search(RandomForestClassifier(random_state=23),'rf', rf_param_grid, X, y, cv=3)
253 |
254 | # [ParallelProgressBar(n_jobs=-1)]: Done 108 out of 108 | elapsed: 67.1min finished
255 | # Total Runtime for Grid Search: 4.095e+03 seconds
256 | # Testing Accuracy: 70.82%
257 | #
258 | # Optimal Parameters: {'criterion': 'gini', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 250, 'n_jobs': -1}
259 | ######################################################################
260 |
261 |
262 | #################### GRADIENT BOOSTING CLASSIFIER ####################
if False:
    # Disabled: fit a gradient boosting classifier on the train split, then
    # report metrics, ROC curve and feature importances. Scores come from
    # decision_function (the build_model default).
    gb, gb_y_preds, gb_y_score, gb_fpr, gb_tpr = build_model(GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=5, min_samples_leaf=7, min_samples_split=7, verbose=1, random_state=23),
                                                 'gb', X_train, X_test, y_train, y_test)

    print_model_metrics(gb_y_preds, gb_y_score, 'gb')
    plot_roc_curve(gb_fpr, gb_tpr, 'gb')
    plot_feature_importances(gb, 'gb')
270 | ######################################################################
271 |
272 |
273 | ######################### ADABOOST CLASSIFIER #########################
if False:
    # Disabled: fit an AdaBoost classifier on the train split, then report
    # metrics, ROC curve and feature importances. Scores come from
    # decision_function (the build_model default).
    ada, ada_y_preds, ada_y_score, ada_fpr, ada_tpr = build_model(AdaBoostClassifier(learning_rate=.01, n_estimators=500, algorithm='SAMME.R', random_state=23),
                                                      'ada', X_train, X_test, y_train, y_test)

    print_model_metrics(ada_y_preds, ada_y_score, 'ada')
    plot_roc_curve(ada_fpr, ada_tpr, 'ada')
    plot_feature_importances(ada, 'ada')
281 | ######################################################################
282 |
283 |
284 | ######################### XGBOOST CLASSIFIER #########################
if False:
    # Disabled: fit an XGBoost classifier on the train split, then report
    # metrics, ROC curve and feature importances. Scores come from
    # predict_proba (decision_function=False).
    # NOTE(review): algorithm='SAMME.R' is the option passed to
    # AdaBoostClassifier elsewhere in this file - confirm XGBClassifier uses it.
    xgb, xgb_y_preds, xgb_y_score, xgb_fpr, xgb_tpr = build_model(XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=5, min_child_weight=1, gamma=1, algorithm='SAMME.R', objective='binary:logistic', reg_alpha=0, reg_lambda=0, n_jobs=-1, random_state=23),
                                                      'xgb', X_train, X_test, y_train, y_test, decision_function=False)

    print_model_metrics(xgb_y_preds, xgb_y_score, 'xgb')
    plot_roc_curve(xgb_fpr, xgb_tpr, 'xgb')
    plot_feature_importances(xgb, 'xgb')
292 |
293 | # xgb_search_results, xgb_best_score, xgb_best_params = run_grid_search(XGBClassifier(random_state=23),'xgb', xgb_param_grid, X_train, y_train)
294 |
295 | # Testing Accuracy: 72.23%
296 | #
297 | # Optimal Parameters: {'gamma': 1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 250}
298 | ######################################################################
299 |
300 |
301 | ######################## STACKED ENSEMBLE MODEL ######################
def create_ensemble_model(X,y):
    """Stack RF, XGBoost and AdaBoost probabilities under a logistic regression.

    Args:
        X: Feature matrix.
        y: Labels.

    Returns:
        ``(train_df, test_df, y_preds)``: the base models' class probabilities
        on the train and test splits, and the meta-model's test predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23, test_size=.2)

    # Base learners, fitted in this order: random forest, XGBoost, AdaBoost.
    base_models = [
        RandomForestClassifier(n_estimators=500, criterion='gini', max_features='sqrt', min_samples_leaf=10, min_samples_split=2, verbose=1, class_weight='balanced', n_jobs=-1, random_state=23),
        XGBClassifier(learning_rate=0.1, n_estimators=250, max_depth=5, min_child_weight=1, gamma=1, algorithm='SAMME.R', objective='binary:logistic', n_jobs=-1, random_state=23),
        AdaBoostClassifier(learning_rate=.75, n_estimators=500, algorithm='SAMME.R', random_state=23),
    ]

    train_parts = []
    test_parts = []
    for estimator in base_models:
        estimator.fit(X_train, y_train)
        train_parts.append(pd.DataFrame(estimator.predict_proba(X_train)))
        test_parts.append(pd.DataFrame(estimator.predict_proba(X_test)))

    # Side-by-side probability columns become the meta-model's features.
    train_df = pd.concat(train_parts, names=['rf','xgb','ada'], axis=1)
    test_df = pd.concat(test_parts, names=['rf','xgb','ada'], axis=1)

    meta_model = LogisticRegression(random_state=1)
    meta_model.fit(train_df, y_train)
    y_preds = meta_model.predict(test_df)

    return train_df, test_df, y_preds
332 | ######################################################################
333 |
--------------------------------------------------------------------------------
/new_ETL.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import seaborn as sns
4 | import numpy as np
5 | import pandas as pd
6 | pd.set_option('display.max_columns',100)
7 |
8 | import warnings
9 | warnings.filterwarnings('ignore')
10 |
11 | import itertools, math, time, re
12 |
13 | ############################--LOAD DATA--#############################
def load_data_to_df():
    """Load the monthly NBA Savant CSVs into one DataFrame.

    Returns:
        The files concatenated in calendar order, with ``x`` negated,
        ``game_date`` parsed to datetimes and a fresh 0..n-1 index.
    """
    monthly_files = (
        './data/nba_savant/oct-nov-14-15.csv',
        './data/nba_savant/dec-14-15.csv',
        './data/nba_savant/jan-14-15.csv',
        './data/nba_savant/feb-14-15.csv',
        './data/nba_savant/mar-14-15.csv',
        './data/nba_savant/apr-14-15.csv',
    )
    shots = pd.concat([pd.read_csv(csv_path) for csv_path in monthly_files])

    # reverse x values to plot correctly
    shots['x'] = -shots['x']
    shots['game_date'] = pd.to_datetime(shots['game_date'])
    return shots.reset_index(drop=True)
df = load_data_to_df()
######################################################################

###########################--BASIC CLEANING--#########################
# Encode shot_type as a number: 2 for '2PT Field Goal', 3 for anything else.
df['shot_type'] = np.where(df['shot_type'] == '2PT Field Goal', 2, 3)
# Cap period at 5 (presumably folding every overtime period into one bucket).
# .loc is used instead of chained indexing (df.period[mask] = ...), which is
# not guaranteed to write through to df and does nothing under pandas
# Copy-on-Write.
df.loc[df['period'] > 5, 'period'] = 5
# Points produced by the shot: shot_type when made, 0 when missed.
df['pps'] = df['shot_type'] * df['shot_made_flag']
# Clamp touch_time into the range [0, 24].
df.loc[df['touch_time'] < 0, 'touch_time'] = 0
df.loc[df['touch_time'] > 24, 'touch_time'] = 24
37 |
def create_team_ids(df):
    """Add integer ``team_id`` and ``opp_id`` columns to *df* in place.

    Every distinct ``team_name`` receives an id starting at 1. Ids are
    handed out in alphabetical order so they are identical on every run;
    iterating a ``set`` of strings (as was done before) yields an order
    that changes with Python's string hash randomisation.

    ``team_id`` is overwritten for every row whose ``team_name`` is known
    (and created, filled with 0, if the column is missing). ``opp_id`` is
    the id of ``opponent``, or 0 when the opponent is not a known team name.

    Args:
        df: DataFrame with ``team_name`` and ``opponent`` columns.

    Returns:
        None. *df* is modified in place.
    """
    team_names = sorted(df['team_name'].dropna().unique())
    #get team ids from 1-30
    team_id_dict = {team: id_ for id_, team in enumerate(team_names, start=1)}

    df['opp_id'] = 0
    if 'team_id' not in df.columns:
        # np.where below reads the existing column as its fallback value.
        df['team_id'] = 0

    for team, id_ in team_id_dict.items():
        df['team_id'] = np.where(df['team_name'] == team, id_, df['team_id'])
        df['opp_id'] = np.where(df['opponent'] == team, id_, df['opp_id'])
create_team_ids(df)
######################################################################


####################--LOAD NBA SCRAPED DATA--######################
# Scraped NBA shot log; the first CSV column becomes the index.
nba_shots = pd.read_csv('./data/shots_1415.csv', index_col=0)
# GAME_DATE is cast to str so it can be sliced when dashes are inserted.
nba_shots['GAME_DATE'] = nba_shots['GAME_DATE'].astype(str)
55 |
56 | #Adds dashes to date string so it can be converted to datetime format
def add_dashes(string):
    """Turn a compact date string into a dashed one (``YYYYMMDD`` -> ``YYYY-MM-DD``).

    The year is the first four characters of *string*, the month the next
    two, and the day the last two.
    """
    year, month, day = string[:4], string[4:6], string[-2:]
    return '-'.join((year, month, day))
60 |
def clean_scraped_nba_data():
    """Normalise the module-level ``nba_shots`` frame in place.

    ``GAME_DATE`` is converted from a compact string to datetimes (dashes
    are inserted by ``add_dashes`` first) and ``LOC_X`` is negated.
    """
    dashed_dates = nba_shots['GAME_DATE'].map(add_dashes)
    nba_shots['GAME_DATE'] = pd.to_datetime(dashed_dates)
    nba_shots['LOC_X'] = -nba_shots['LOC_X']
clean_scraped_nba_data()
66 | ######################################################################
67 |
68 | ########################--MERGE NBA AND SAVANT--######################
def merge_nba_and_savant_data(df, nba_shots):
    """Join the Savant shot log with the scraped NBA shot log.

    Rows are matched (inner join) on team, date, period, game clock and
    shot coordinates. Columns that duplicate Savant information, plus the
    two ESPN id columns, are dropped from the result.

    Args:
        df: Savant shot log (lower-case column names).
        nba_shots: scraped NBA shot log (upper-case column names).

    Returns:
        pandas.DataFrame: the merged frame without the redundant columns.
    """
    # (savant column, scraped column) pairs that identify the same shot
    key_pairs = [
        ('team_name', 'TEAM_NAME'),
        ('game_date', 'GAME_DATE'),
        ('period', 'PERIOD'),
        ('minutes_remaining', 'MINUTES_REMAINING'),
        ('seconds_remaining', 'SECONDS_REMAINING'),
        ('x', 'LOC_X'),
        ('y', 'LOC_Y'),
    ]
    savant_keys = [savant for savant, _ in key_pairs]
    scraped_keys = [scraped for _, scraped in key_pairs]
    combined = df.merge(nba_shots, left_on=savant_keys, right_on=scraped_keys)

    redundant = [
        'GRID_TYPE', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD',
        'MINUTES_REMAINING', 'SECONDS_REMAINING', 'SHOT_DISTANCE', 'LOC_X',
        'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE',
        'espn_player_id', 'espn_game_id', 'EVENT_TYPE', 'SHOT_TYPE',
        'ACTION_TYPE',
    ]
    return combined.drop(columns=redundant)
75 | merged_df = merge_nba_and_savant_data(df,nba_shots)  # inner join of the Savant and scraped NBA shot logs
76 | ######################################################################
77 |
78 | ########################--FEATURE ENGINEERING--######################
79 | #helper function to get dictionary matching team names to home and away team acronyms
def create_home_acronym_dict():
    """Map every full team name in ``merged_df`` to a team acronym.

    The alphabetically sorted unique ``team_name`` values are paired with
    the alphabetically sorted unique ``HTM`` values. Boston and Brooklyn
    are then set explicitly, because their names and acronyms sort in
    opposite orders ('Boston' < 'Brooklyn' but 'BKN' < 'BOS').

    Returns:
        dict: team name -> acronym.
    """
    acronyms = sorted(merged_df['HTM'].unique())
    names = sorted(merged_df['team_name'].unique())

    name_to_acronym = {name: acronym for name, acronym in zip(names, acronyms)}
    name_to_acronym.update({'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BKN'})
    return name_to_acronym
88 |
89 | #Function to determine if the shooter is playing at home
def get_home_team():
    """Flag, for every row of ``merged_df``, whether the shooter is at home.

    The shooter's ``team_name`` is translated to an acronym with
    ``create_home_acronym_dict`` and compared with the row's ``HTM`` value.

    The comparison is done on whole columns instead of one row at a time
    with ``iterrows``; the result is the same list, but the periodic
    "Runtime: ..." progress messages of the loop-based version are no
    longer printed.

    Returns:
        list[int]: one entry per row of ``merged_df``, in row order; 1 when
        the shooter's team acronym equals ``HTM``, otherwise 0.
    """
    team_name_ac_dict = create_home_acronym_dict()
    shooter_acronym = merged_df['team_name'].map(team_name_ac_dict)
    return (shooter_acronym == merged_df['HTM']).astype(int).tolist()
104 | merged_df['is_home'] = get_home_team()  # 1 when the shooter's team acronym equals HTM, else 0
105 |
106 | #sort the dataframe by date, game_id, player_name, and game_event_id
107 | sorted_df = merged_df.copy().sort_values(by=['game_date','GAME_ID','name','GAME_EVENT_ID']).reset_index(drop=True)  # sorted copy with a fresh 0..n-1 index; merged_df itself stays unsorted
108 |
109 | #Function to calculate whether player is hot, i.e. whether they have hit 1, 2, or 3 previous shots
def is_player_hot(df):
    """Flag whether the shooter made his previous 1, 2 and 3 shots.

    A previous shot only counts when it sits in the row(s) directly above
    the current one and shares both ``name`` and ``GAME_ID`` with it, so
    *df* must already be ordered such that each player's shots within a
    game are contiguous and chronological.

    Differences from the earlier row-by-row loop, all of which were bugs:

    * the third-previous shot is now required to be by the same player
      (the loop compared ``index-2`` twice and never looked at ``index-3``);
    * ``GAME_ID`` is checked for every shot in the streak, not only the
      most recent one;
    * at ``index == 2`` a make by the same player is no longer discarded
      when the row two above is a make by somebody else.

    Args:
        df: DataFrame with ``name``, ``GAME_ID`` and ``shot_made_flag``
            columns.

    Returns:
        numpy.ndarray: float array of shape ``(len(df), 3)``. Column ``k``
        is 1.0 when the previous ``k + 1`` rows are all made shots by the
        same player in the same game, else 0.0.
    """
    n_rows = len(df)
    heat_check_array = np.zeros((n_rows, 3))
    if n_rows < 2:
        return heat_check_array

    names = df['name'].to_numpy()
    games = df['GAME_ID'].to_numpy()
    made = df['shot_made_flag'].to_numpy() == 1

    # same_as_prev[i]: rows i and i-1 are the same player in the same game.
    same_as_prev = np.zeros(n_rows, dtype=bool)
    same_as_prev[1:] = (names[1:] == names[:-1]) & (games[1:] == games[:-1])

    def lagged(flags, k):
        """Return *flags* moved down by k rows, padded with False on top."""
        shifted = np.zeros(n_rows, dtype=bool)
        if k < n_rows:
            shifted[k:] = flags[:n_rows - k]
        return shifted

    streak = np.ones(n_rows, dtype=bool)
    for k in range(3):
        # Extend the streak one shot further back: row i-k must be linked
        # to row i-k-1, and shot i-k-1 must have gone in.
        streak = streak & lagged(same_as_prev, k) & lagged(made, k + 1)
        heat_check_array[:, k] = streak

    return heat_check_array
147 |
def add_heat_check_to_df(df):
    """Store the three columns returned by ``is_player_hot`` on *df* in place.

    Column 0 becomes ``prev_shot_made``, column 1 ``prev_2_made`` and
    column 2 ``prev_3_made``.
    """
    hot_flags = is_player_hot(df)
    column_names = ('prev_shot_made', 'prev_2_made', 'prev_3_made')
    for position, column in enumerate(column_names):
        df[column] = hot_flags[:, position]
add_heat_check_to_df(sorted_df)
154 | ######################################################################
155 |
156 |
157 | ####################--LOAD ADVANCED STATS--######################
# Advanced player stats; the first spreadsheet column becomes the index.
stats = pd.read_excel('./data/adv-stats-14-15.xlsx', index_col=0)
# DWS rescaled to a per-48-MP rate, rounded to 3 decimals.
stats['DWS/48'] = (stats['DWS'] / stats['MP'] * 48).round(3)
160 |
161 | # Clean up name discrepancies between two dfs
def clean_name_discrepancies(df, stats):
    """Make player names in *df* and *stats* agree, editing both in place.

    * ``stats.Player``: every run of characters that is neither whitespace
      nor alphanumeric (underscores included) is removed, then three
      players are renamed.
    * ``df.name``: five shooters are renamed.
    * ``df.defender_name``: missing values become the string ``'None'``.

    Whole-column assignment is used instead of chained indexing
    (``df.name[mask] = ...``), which is not guaranteed to write through to
    the frame and does nothing under pandas Copy-on-Write.

    Args:
        df: shot log with ``name`` and ``defender_name`` columns.
        stats: stats table with a ``Player`` column.

    Returns:
        None. Both frames are modified in place.
    """
    stats['Player'] = stats['Player'].apply(
        lambda x: re.sub(r'([^\s\w]|_)+', '', x))

    # dict-based replace matches whole values only, like the == masks it replaces
    df['name'] = df['name'].replace({
        'Jose Juan Barea': 'JJ Barea',
        'Tim Hardaway Jr': 'Tim Hardaway',
        'Charles Hayes': 'Chuck Hayes',
        'Glen Rice Jr': 'Glen Rice',
        'Louis Williams': 'Lou Williams',
    })

    stats['Player'] = stats['Player'].replace({
        'Nene Hilario': 'Nene',
        'Jeffery Taylor': 'Jeff Taylor',
        'Luigi Datome': 'Gigi Datome',
    })

    # shots without a recorded defender get the placeholder name 'None'
    df['defender_name'] = df['defender_name'].fillna('None')
176 | clean_name_discrepancies(sorted_df, stats)  # edits sorted_df and stats in place
177 |
178 | #convert defender names from last,first to first,last
def convert_defender_names(player):
    """Convert a defender name from ``'Last, First'`` to ``'First Last'``.

    The placeholder ``'None'`` is returned unchanged. Otherwise the name is
    split on ``', '``, the first two pieces are swapped, and every run of
    characters that is neither whitespace nor alphanumeric (underscores
    included) is removed.

    A name without ``', '`` (for example ``'Nene'``) has nothing to swap and
    is only cleaned; previously any such name other than ``'Nene'`` raised
    ``IndexError``.

    Args:
        player: defender name string.

    Returns:
        str: the reordered, cleaned name.
    """
    if player == 'None':
        return 'None'

    parts = player.split(', ')
    if len(parts) < 2:
        full_name = player
    else:
        full_name = ' '.join((parts[1], parts[0]))
    return re.sub(r'([^\s\w]|_)+', '', full_name)
188 | sorted_df.defender_name = sorted_df.defender_name.apply(convert_defender_names)  # 'Last, First' -> 'First Last' for every row
189 |
190 | # Clean up name discrepancies between two dfs
def clean_defender_names(df):
    """Rename five defenders in ``df.defender_name`` in place.

    Whole-column assignment is used instead of chained indexing
    (``df.defender_name[mask] = ...``), which is not guaranteed to write
    through to the frame and does nothing under pandas Copy-on-Write.

    Args:
        df: DataFrame with a ``defender_name`` column.

    Returns:
        None. *df* is modified in place.
    """
    # dict-based replace matches whole values only, like the == masks it replaces
    df['defender_name'] = df['defender_name'].replace({
        'Jose Juan Barea': 'JJ Barea',
        'Tim Hardaway Jr': 'Tim Hardaway',
        'Charles Hayes': 'Chuck Hayes',
        'Glen Rice Jr': 'Glen Rice',
        'Louis Williams': 'Lou Williams',
    })
197 | clean_defender_names(sorted_df)  # edits sorted_df.defender_name in place
198 |
199 | ############# OFFENSE ###########
def merge_off_stats(df, stats):
    """Attach each shooter's offensive season stats to the shot log.

    *df* is inner-joined on ``name`` with the ``Pos``, ``Age``, ``TS%``,
    ``3PAr`` and ``USG%`` columns of *stats* (keyed by ``Player``). The
    duplicate ``Player`` key is dropped and every column name of the
    result is lower-cased. Rows whose shooter has no entry in *stats* are
    removed by the inner join.

    Returns:
        pandas.DataFrame: the merged frame with lower-case column names.
    """
    wanted = ['Player', 'Pos', 'Age', 'TS%', '3PAr', 'USG%']
    merged = df.merge(stats[wanted], left_on='name', right_on='Player')
    merged = merged.drop(columns=['Player'])
    merged.columns = [str.lower(label) for label in merged.columns]
    return merged
205 | sorted_df = merge_off_stats(sorted_df,stats)  # from here on every sorted_df column name is lower-case
206 |
207 | ############ DEFENSE ###########
208 | #map player ids to new df column matching to defender name
def add_defender_ids(df):
    """Add a ``defender_id`` column by looking defenders up among the shooters.

    A lookup table is built from the ``name`` / ``player_id`` columns of
    *df* (taking the maximum ``player_id`` per name), with an extra entry
    mapping the placeholder defender ``'None'`` to id 0. *df* is then
    inner-joined to that table on ``defender_name``, so rows whose defender
    never appears as a shooter are dropped.

    Returns:
        pandas.DataFrame: *df* with the additional ``defender_id`` column.
    """
    id_lookup = (
        df[['name', 'player_id']]
        .rename(columns={'name': 'defender_name', 'player_id': 'defender_id'})
        .groupby('defender_name')
        .max()
    )

    no_defender = pd.DataFrame(
        data=[('None', 0)],
        columns=['defender_name', 'defender_id'],
    ).set_index('defender_name')
    id_lookup = pd.concat((id_lookup, no_defender))

    #merge two dataframes with defender ids
    return df.merge(id_lookup, on='defender_name')
220 | sorted_df = add_defender_ids(sorted_df)  # adds defender_id (0 for the 'None' placeholder defender)
221 |
def merge_def_stats(df, stats):
    """Attach defender advanced stats and a team defensive rating to *df*.

    ``BLK%``, ``DWS/48`` and ``DBPM`` are taken from *stats*, renamed to
    ``blk_pct``, ``dws/48`` and ``dbpm``, and inner-joined to *df* on
    ``defender_name``. A row of zeros is added for the placeholder defender
    ``'None'``. The contents of ``./data/drating_2014.xlsx`` are then
    inner-joined on ``team_name`` == ``Team`` and the ``Team`` key dropped.

    Returns:
        pandas.DataFrame: *df* with the defensive columns added.
    """
    renames = {'Player': 'defender_name', 'BLK%': 'blk_pct',
               'DWS/48': 'dws/48', 'DBPM': 'dbpm'}
    def_stats = stats[list(renames)].rename(columns=renames)

    #add dummy stats for no defender (id=0) and append to defense stats
    none_stats = pd.DataFrame(
        data=[('None', 0, 0, 0)],
        columns=['defender_name', 'blk_pct', 'dws/48', 'dbpm'],
    )
    def_stats = pd.concat((def_stats, none_stats)).reset_index(drop=True)

    #add player advanced def stats
    with_defense = df.merge(def_stats, on='defender_name')

    #add team defensive rating
    # NOTE(review): joined on the shooter's team_name, not on `opponent` --
    # confirm that this is the rating that was intended.
    d_rating_14 = pd.read_excel('./data/drating_2014.xlsx')
    with_defense = with_defense.merge(d_rating_14, left_on='team_name', right_on='Team')
    return with_defense.drop(columns='Team')
sorted_df = merge_def_stats(sorted_df,stats)
238 |
239 | ######################################################################
240 |
241 |
242 | ########################--ADDITIONAL CLEANING--#######################
def clean_positions(df):
    """Collapse hybrid position labels in ``df.pos`` to a single position, in place.

    Giannis Antetokounmpo is set to ``'SF'``; each two-position label
    (e.g. ``'PG-SG'``) is mapped to one position according to the table
    below.

    ``.loc`` / whole-column assignment is used instead of chained indexing
    (``df.pos[mask] = ...``), which is not guaranteed to write through to
    the frame and does nothing under pandas Copy-on-Write.

    Args:
        df: DataFrame with ``name`` and ``pos`` columns.

    Returns:
        None. *df* is modified in place.
    """
    df.loc[df['name'] == 'Giannis Antetokounmpo', 'pos'] = 'SF'
    # No replacement value is itself a key, so one pass equals the original
    # sequence of assignments.
    df['pos'] = df['pos'].replace({
        'PG-SG': 'SG',
        'SF-SG': 'SF',
        'SG-PG': 'PG',
        'PF-SF': 'SF',
        'SF-PF': 'PF',
        'SG-SF': 'SF',
    })
251 | clean_positions(sorted_df)  # edits sorted_df.pos in place
252 |
def clean_shot_zones(df):
    """Tidy ``df.shot_zone_basic`` in place.

    ``'In The Paint (Non-RA)'`` is shortened to ``'Paint'``, and shots
    labelled ``'Above the Break 3'`` whose ``shot_zone_area`` is
    ``'Back Court(BC)'`` are relabelled ``'Backcourt'``.

    ``.loc`` is used instead of chained indexing
    (``df.shot_zone_basic[mask] = ...``), which is not guaranteed to write
    through to the frame and does nothing under pandas Copy-on-Write.

    Args:
        df: DataFrame with ``shot_zone_basic`` and ``shot_zone_area`` columns.

    Returns:
        None. *df* is modified in place.
    """
    in_paint = df['shot_zone_basic'] == 'In The Paint (Non-RA)'
    df.loc[in_paint, 'shot_zone_basic'] = 'Paint'

    #change shots misclassified as above_break_3 to backcourt
    misfiled = ((df['shot_zone_area'] == 'Back Court(BC)')
                & (df['shot_zone_basic'] == 'Above the Break 3'))
    df.loc[misfiled, 'shot_zone_basic'] = 'Backcourt'
257 | clean_shot_zones(sorted_df)  # edits sorted_df.shot_zone_basic in place
258 |
def reduce_action_types(df):
    """Reduce the detailed ``action_type`` strings to a few broad categories.

    ``df.action_type`` is lower-cased in place first. Each value is then
    mapped to the category of the first rule whose keyword it contains;
    values matching no rule are returned unchanged (lower-cased).

    The values are iterated directly rather than through
    ``Series.iteritems()``, which was removed in pandas 2.0.

    Args:
        df: DataFrame with a string ``action_type`` column.

    Returns:
        list[str]: one category per row of *df*, in row order.
    """
    df['action_type'] = df['action_type'].str.lower()

    # Order matters: the first matching rule wins (a 'driving layup' is a
    # layup, a 'pullup jump shot' is a pullup).
    rules = (
        (('dunk',), 'dunk'),
        (('layup',), 'layup'),
        (('driving', 'running'), 'driving_running'),
        (('pullup',), 'pullup'),
        (('fadeaway', 'turnaround', 'step back'), 'fade_turn_step'),
        (('hook',), 'hook_shot'),
        (('jump',), 'jump_shot'),
    )

    def categorise(action):
        """Return the category of the first rule matching *action*."""
        for keywords, category in rules:
            if any(keyword in action for keyword in keywords):
                return category
        return action

    return [categorise(action) for action in df['action_type']]
280 | sorted_df.action_type = reduce_action_types(sorted_df)  # overwrite the raw action types with the reduced categories
281 | ######################################################################
282 |
283 | sorted_df.to_csv('data/mid_etl_checkpoint_df.csv')  # checkpoint written before the zone features below are added
284 |
285 | ########################--GET FG % FOR EACH ZONE--####################
286 | def get_zone_fg_pct(df, date=None, event=None):
287 | fg_pct_list = []
288 | column_names = []
289 |
290 | # if date:
291 | # df = df[df.game_date 0:
326 | zone_ids.append((id_, zone_, area_))
327 | id_+=1
328 |
329 | zone_id_df = pd.DataFrame.from_records(zone_ids, columns=['zone_id', 'shot_zone_basic', 'shot_zone_area'])
330 | return zone_id_df
331 | zone_ids = create_zone_ids_df(sorted_df)  # frame with columns zone_id, shot_zone_basic, shot_zone_area
332 |
def add_zone_to_zone_ids(zone_ids):
    """Add a short string label column ``zone`` to the zone id table.

    For each row, ``shot_zone_area`` and ``shot_zone_basic`` are joined with
    an underscore, spaces become underscores, closing parentheses are
    removed, and the text after the first opening parenthesis (up to any
    later one) is kept -- e.g. ``'Center(C)'`` + ``'Mid-Range'`` gives
    ``'C_Mid-Range'``. The labels are merged back onto *zone_ids* by
    ``zone_id``.

    Returns:
        pandas.DataFrame: *zone_ids* with the extra ``zone`` column.
    """
    def make_label(area, basic):
        joined = '_'.join([area, basic])
        return joined.replace(' ', '_').replace(')', '').split('(')[1]

    labelled = [
        (make_label(area, basic), zone_id)
        for area, basic, zone_id in zip(zone_ids['shot_zone_area'],
                                        zone_ids['shot_zone_basic'],
                                        zone_ids['zone_id'])
    ]
    label_df = pd.DataFrame(labelled, columns=['zone', 'zone_id'])
    return zone_ids.merge(label_df, on='zone_id')
342 | zone_ids = add_zone_to_zone_ids(zone_ids)  # adds the string 'zone' label column
343 |
344 | #add zone_id, zone to df
345 | sorted_df = sorted_df.merge(zone_ids, on=['shot_zone_basic', 'shot_zone_area'])  # inner join on the (basic, area) zone pair
346 |
347 | #get player avg for each zone they are shooting in
def get_zone_avg(df):
    """Look up, for every shot, the shooter's value in his shot's zone column.

    For a row with shooter ``name`` and zone label ``zone``, the result is
    the sum of column ``zone`` over the rows of the module-level
    ``zone_fg_pct`` frame whose ``name`` matches (0 when none match).
    (``zone_fg_pct`` presumably holds one row per player with his FG% per
    zone -- confirm against the code that builds it.)

    Each distinct (name, zone) pair is looked up once and cached instead
    of re-filtering ``zone_fg_pct`` for every shot, and the columns are
    read by label rather than by position on the row (``row[0]``), which
    pandas has deprecated. The periodic "Runtime: ..." progress messages
    of the earlier loop are no longer printed.

    Args:
        df: DataFrame with ``name`` and ``zone`` columns.

    Returns:
        list: one value per row of *df*, in row order.
    """
    lookup_cache = {}
    zone_avg = []

    for name, zone in zip(df['name'], df['zone']):
        key = (name, zone)
        if key not in lookup_cache:
            matches = zone_fg_pct['name'] == name
            lookup_cache[key] = zone_fg_pct.loc[matches, zone].sum()
        zone_avg.append(lookup_cache[key])

    return zone_avg
zone_avgs = get_zone_avg(sorted_df)
sorted_df['zone_avg'] = zone_avgs

#add league avg for each zone
# The league value is the mean of zone_avg over all shots in the zone.
# zone_avg is selected before aggregating: calling .mean() on the whole
# grouped frame raises TypeError on the non-numeric columns in pandas >= 2.0.
sorted_df = sorted_df.merge(
    sorted_df.groupby('zone')['zone_avg'].mean().reset_index().rename(
        columns={'zone_avg': 'lg_zone_avg'}),
    on='zone')
#add fg% relative to lg avg for each zone
sorted_df['zone_minus_lg_avg'] = sorted_df['zone_avg'] - sorted_df['lg_zone_avg']

sorted_df.to_csv('final_df_1415.csv')
zone_fg_pct.to_csv('data/zone_fg_pct.csv')
zone_ids.to_csv('data/zone_ids.csv')
371 |
372 | ######################################################################
373 | #rearrange columns for better visibility
374 | # clean = sorted_df[['name','pos','age','player_id', 'team_name', 'team_id', 'game_date',
375 | # 'game_id', 'game_event_id','season', 'period',
376 | # 'minutes_remaining', 'seconds_remaining', 'shot_made_flag',
377 | # 'action_type', 'shot_zone_basic', 'shot_zone_area', 'shot_zone_range',
378 | # 'shot_type', 'shot_distance', 'x', 'y', 'dribbles', 'touch_time',
379 | # 'opponent', 'opp_id', 'defender_name', 'defender_distance', 'shot_clock', 'htm', 'vtm',
380 | # 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'ts%', '3par', 'usg%']]
381 |
--------------------------------------------------------------------------------
/presentation.py:
--------------------------------------------------------------------------------
1 | ############################### IMPORTS ###############################
2 | if True:
3 | import itertools, math, time, re, pickle
4 |
5 | import plotly
6 | import plotly.plotly as py
7 | import plotly.graph_objs as go
8 | plotly.offline.init_notebook_mode(connected=True)
9 |
10 | import matplotlib
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | import numpy as np
14 | import pandas as pd
15 | pd.set_option('display.max_columns',100)
16 |
17 | import ipywidgets as widgets
18 | from ipywidgets import interact
19 |
20 | import warnings
21 | warnings.filterwarnings('ignore')
22 |
23 | from court import court_shapes
24 |
25 | from shot_chart_viz import acquire_playerPic, get_team_logo, get_season_stats, get_team_stats, draw_court
26 |
27 | cdict = {  # per-channel (x, y0, y1) anchor rows passed to LinearSegmentedColormap below
28 | 'blue': [(0.0, 0.6313725709915161, 0.6313725709915161), (0.25, 0.4470588266849518, 0.4470588266849518), (0.5, 0.29019609093666077, 0.29019609093666077), (0.75, 0.11372549086809158, 0.11372549086809158), (1.0, 0.05098039284348488, 0.05098039284348488)],
29 | 'green': [(0.0, 0.7333333492279053, 0.7333333492279053), (0.25, 0.572549045085907, 0.572549045085907), (0.5, 0.4156862795352936, 0.4156862795352936), (0.75, 0.0941176488995552, 0.0941176488995552), (1.0, 0.0, 0.0)],
30 | 'red': [(0.0, 0.9882352948188782, 0.9882352948188782), (0.25, 0.9882352948188782, 0.9882352948188782), (0.5, 0.9843137264251709, 0.9843137264251709), (0.75, 0.7960784435272217, 0.7960784435272217), (1.0, 0.40392157435417175, 0.40392157435417175)]}
31 | mymap = matplotlib.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)  # colormap built from cdict; third argument is the number of colour levels
32 | ############################## LOAD DATA ##############################
33 | df = pd.read_csv('final_df_1415.csv', index_col=0)  # module-level shot log read by the plotting functions below
34 |
35 | ######################################################################
36 | ###########################--SHOT CHARTS--############################
37 | ######################################################################
38 |
39 | ########################--BUBBLE SHOT CHARTS--########################
def find_shootingPcts(shot_df, gridNum):
    """Compute hexbin shooting percentages for two- and three-point shots.

    Only shots with ``y < 425.1`` are considered. For each shot value the
    attempts and the makes are binned with ``plt.hexbin`` on the same grid
    and extent, and makes are divided by attempts per hexagon.

    Args:
        shot_df: DataFrame with ``x``, ``y``, ``shot_type`` (2 or 3) and
            ``shot_made_flag`` columns.
        gridNum: ``gridsize`` forwarded to ``plt.hexbin``.

    Returns:
        tuple: ``(ShootingPctLocs2, hb_shot2, ShootingPctLocs3, hb_shot3)``
        where ``ShootingPctLocsN`` is the per-hexagon make ratio (0 where a
        hexagon has no attempts) and ``hb_shotN`` is the object returned by
        ``plt.hexbin`` for all N-point attempts.
    """
    ShootingPctLocs2, hb_shot2 = _hexbin_shooting_pct(shot_df, 2, gridNum)
    ShootingPctLocs3, hb_shot3 = _hexbin_shooting_pct(shot_df, 3, gridNum)
    return (ShootingPctLocs2, hb_shot2, ShootingPctLocs3, hb_shot3)


def _hexbin_shooting_pct(shot_df, shot_value, gridNum):
    """Return (make ratio per hexagon, attempts hexbin) for one shot value."""
    attempts = shot_df[(shot_df['y'] < 425.1) & (shot_df.shot_type == shot_value)]
    makes = attempts[attempts['shot_made_flag'] == 1]
    extent = (-250, 250, 425, -50)

    #compute number of shots made and taken from each hexbin location
    hb_shot = plt.hexbin(attempts.x, attempts.y, gridsize=gridNum, extent=extent)
    plt.close()  # only the binned counts are needed, not the figure
    hb_made = plt.hexbin(makes.x, makes.y, gridsize=gridNum, extent=extent,
                         cmap=plt.cm.Reds)
    plt.close()

    #compute shooting percentage
    with np.errstate(divide='ignore', invalid='ignore'):
        pct = hb_made.get_array() / hb_shot.get_array()
    pct[np.isnan(pct)] = 0  #makes 0/0s=0
    return pct, hb_shot
80 |
def freq_shooting_plot(player_name, gridNum=25):
    """Draw a bubble shot chart for one player, then call ``shot_recommender``.

    Each hexagon returned by ``find_shootingPcts`` becomes a circle whose
    radius is the number of attempts (capped at ``240 / gridNum``) and
    whose colour is the shooting percentage. Two- and three-point shots use
    separate colour maps and colour bars.

    Changes from the earlier version: ``find_shootingPcts`` is called once
    instead of twice, and the zoom factor uses the builtin ``float``
    because ``np.float`` was removed in NumPy 1.24.

    Args:
        player_name: value of ``df.name`` selecting the player.
        gridNum: hexbin grid size; also sets the maximum bubble radius.
    """
    from matplotlib.patches import Circle

    plot_size = (10, 8)
    shot_df = df[df.name == player_name]

    #compute shooting percentage and # of shots
    (ShootingPctLocs2, shotNumber2,
     ShootingPctLocs3, shotNumber3) = find_shootingPcts(shot_df, gridNum)

    #draw figure and court
    fig = plt.figure(figsize=plot_size)#(12,7)
    ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
    draw_court(outer_lines=False)
    plt.xlim(-250, 250)
    plt.ylim(400, -25)

    #draw player image
    zoom = float(plot_size[0]) / (12.0 * 2) #how much to zoom the player's pic. I have this hackily dependent on figure size
    img = acquire_playerPic(shot_df.player_id, zoom)
    ax.add_artist(img)

    max_radius = 240 / gridNum

    ############################################ TWO POINTERS #################################################
    cmap = mymap.from_list('Color Map', [(0, '#ff0000'), (.45, '#ffff00'), (.6, '#00ff00'), (1, '#004d00')])
    #draw circles
    centres2 = shotNumber2.get_offsets()
    attempts2 = shotNumber2.get_array()
    for i, shots in enumerate(ShootingPctLocs2):
        ax.add_patch(Circle(centres2[i], radius=min(attempts2[i], max_radius),
                            color=cmap(shots), alpha=1, fill=True))

    #draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %', labelpad=20)
    cb.set_ticks([0.0, 0.25, .485, 0.75, 1.0])
    cb.set_ticklabels(['0%', '25%', '48.5%\nLg Avg', '75%', '100%'])

    ########################################### THREE POINTERS ################################################
    #plotting 3 pointers separately to account for expected lower fg% from deep
    cmap3 = mymap.from_list('Color Map', [(0, '#ff0000'), (.35, '#ffff00'), (.6, '#00ff00'), (1, '#004d00')])
    #draw circles
    centres3 = shotNumber3.get_offsets()
    attempts3 = shotNumber3.get_array()
    for i, shots in enumerate(ShootingPctLocs3):
        ax.add_patch(Circle(centres3[i], radius=min(attempts3[i], max_radius),
                            color=cmap3(shots), alpha=1, fill=True))

    #draw color bar
    ax3 = fig.add_axes([1.1, 0.1, 0.02, 0.8])
    cb3 = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap3, orientation='vertical')
    cb3.set_label('Three Point %', labelpad=-8)
    cb3.set_ticks([0.0, 0.25, .35, 0.5, 0.75, 1.0])
    cb3.set_ticklabels(['0%', '25%', '35% - Lg Avg', '50%', '75%', '100%'])

    ax.set_title(shot_df.name.unique()[0] + ' - Shot Chart 2014-15')
    #plot season stats
    ax.text(135, 395, get_season_stats(player_name)[1])

    plt.show()
    shot_recommender(player_name)
141 |
142 | #################PLOT TEAM FREQUENCY SHOT CHART (MATPLOTLIB)#################
def team_freq_plot(team, gridNum=25):
    """Draw a bubble shot chart for one team.

    Each hexagon returned by ``find_shootingPcts`` becomes a circle whose
    radius is the number of attempts (capped at ``240 / gridNum``) and
    whose colour is the shooting percentage. Two- and three-point shots use
    separate colour maps and colour bars. The team logo is looked up from
    the ``htm`` value of the team's home games.

    ``find_shootingPcts`` is now called once instead of twice.

    Args:
        team: value of ``df.team_name`` selecting the team.
        gridNum: hexbin grid size; also sets the maximum bubble radius.
    """
    from matplotlib.patches import Circle

    plot_size = (10, 8)
    team_df = df[df.team_name == team]

    #compute shooting percentage and # of shots
    (ShootingPctLocs2, shotNumber2,
     ShootingPctLocs3, shotNumber3) = find_shootingPcts(team_df, gridNum)

    #draw figure and court
    fig = plt.figure(figsize=plot_size)
    ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
    draw_court(outer_lines=False)
    plt.xlim(-250, 250)
    plt.ylim(400, -25)

    #draw team image
    team_ac = team_df.htm[team_df.is_home == 1].unique()[0]
    zoom = 1 #np.float(plot_size[0])/(8.0)
    img = get_team_logo(team_ac, zoom)
    ax.add_artist(img)

    max_radius = 240 / gridNum

    ############################################ TWO POINTERS #################################################
    cmap = mymap.from_list('Color Map', [(0, '#ff0000'), (.45, '#ffff00'), (.6, '#00ff00'), (1, '#004d00')])
    #draw circles
    centres2 = shotNumber2.get_offsets()
    attempts2 = shotNumber2.get_array()
    for i, shots in enumerate(ShootingPctLocs2):
        ax.add_patch(Circle(centres2[i], radius=min(attempts2[i], max_radius),
                            color=cmap(shots), alpha=.9, fill=True))

    #draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %', labelpad=20)
    cb.set_ticks([0.0, 0.25, .485, 0.75, 1.0])
    cb.set_ticklabels(['0%', '25%', '48.5%\nLg Avg', '75%', '100%'])

    ########################################### THREE POINTERS ################################################
    #plotting 3 pointers separately to account for expected lower fg% from deep
    cmap3 = mymap.from_list('Color Map', [(0, '#ff0000'), (.35, '#ffff00'), (.6, '#00ff00'), (1, '#004d00')])
    #draw circles
    centres3 = shotNumber3.get_offsets()
    attempts3 = shotNumber3.get_array()
    for i, shots in enumerate(ShootingPctLocs3):
        ax.add_patch(Circle(centres3[i], radius=min(attempts3[i], max_radius),
                            color=cmap3(shots), alpha=.9, fill=True))

    #draw color bar
    ax3 = fig.add_axes([1.1, 0.1, 0.02, 0.8])
    cb3 = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap3, orientation='vertical')
    cb3.set_label('Three Point %', labelpad=-8)
    cb3.set_ticks([0.0, 0.25, .35, 0.5, 0.75, 1.0])
    cb3.set_ticklabels(['0%', '25%', '35% - Lg Avg', '50%', '75%', '100%'])

    ax.set_title(team_df.team_name.unique()[0] + ' - Shot Chart 2014-15')
    #plot season stats
    ax.text(150, 395, get_team_stats(team)[1])
    plt.show()
203 |
204 | ########################--GROUPED SHOT CHART--########################
def grouped_plot(feature):
    """Plot every shot on the court, coloured by the value of *feature*.

    One ``Scattergl`` trace is drawn per group of ``df.groupby(feature)``.
    Colours are taken from a fixed 15-colour palette; the palette now
    repeats when there are more than 15 groups instead of raising
    ``IndexError``.

    Args:
        feature: name of the ``df`` column to group by.
    """
    color_list = ['aliceblue', 'aqua', 'steelblue', 'violet', 'blue',
                  'blueviolet', 'brown', 'cadetblue',
                  'chartreuse', 'darkgreen', 'darkmagenta', 'tomato',
                  'gold', 'red', 'slategray']

    data = []
    for position, (label, group) in enumerate(df.groupby(feature)):
        data.append(go.Scattergl(
            x=group.x,
            y=group.y,
            mode='markers',
            name=label,
            marker=dict(symbol='circle', size=7,
                        line={'width': 1}, opacity=0.7,
                        color=color_list[position % len(color_list)]),
            text=label,
            hoverinfo='text')
        )

    layout = go.Layout(
        title='Shot Distribution by ' + feature.title(),
        showlegend=True,
        xaxis={'showgrid': False, 'range': [-250, 250]},
        yaxis={'showgrid': False, 'range': [-47.5, 500]},
        height=600,
        width=750,
        hovermode='closest',
        shapes=court_shapes)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='Shot Zone Breakdown')
240 |
241 | ##########################--SHOT FREQ HEATMAP--#########################
def shot_freq_heatmap(name, show_shots=False):
    """Plot a 2-D histogram heatmap of shot locations for a player or team.

    *name* is treated as a player when it occurs in ``df.name`` and as a
    team name otherwise; the colour range is 0-40 for players and 5-250
    for teams.

    The individual make / miss markers used to be built on every call but
    were never added to the figure. They are now built only on request.

    Args:
        name: player name (``df.name``) or team name (``df.team_name``).
        show_shots: when True, overlay each made shot (green circle) and
            missed shot (yellow x) on the heatmap. Defaults to False, which
            draws the heatmap alone as before.
    """
    if name in df.name.unique():
        df_ = df[df.name == name]
        z_min, z_max = 0, 40
    else:
        df_ = df[df.team_name == name]
        z_min, z_max = 5, 250

    made = df_[df_.shot_made_flag == 1]
    missed = df_[df_.shot_made_flag == 0]

    x = np.concatenate([made['x'], missed['x']])
    y = np.concatenate([made['y'], missed['y']])

    trace3 = go.Histogram2d(
        x=x,
        y=y,
        zmax=z_max,
        zmin=z_min,
        zsmooth='best',
        autobinx=True,
        autobiny=True,
        reversescale=False,
        opacity=.75,
    )
    data = [trace3]

    if show_shots:
        data.append(go.Scatter(
            x=made['x'],
            y=made['y'],
            mode='markers',
            name='Make',
            showlegend=True,
            marker=dict(symbol='circle', opacity=0.7, color='green',
                        size=4, line=dict(width=1)),
        ))
        data.append(go.Scatter(
            x=missed['x'],
            y=missed['y'],
            mode='markers',
            name='Miss',
            showlegend=True,
            marker=dict(symbol='x', opacity=0.7, color='yellow',
                        size=4, line=dict(width=1)),
        ))

    layout = go.Layout(
        xaxis=dict(ticks='', showgrid=False, zeroline=False, nticks=20, range=[-250, 250]),
        yaxis=dict(ticks='', showgrid=False, zeroline=False, nticks=20, range=[-47.5, 450]),
        autosize=False,
        height=600,
        width=750,
        hovermode='closest',
        shapes=court_shapes,
        title=name + ' - Shot Frequency Heatmap',
        showlegend=True,
        legend=dict(x=1.2, y=1),
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)
321 |
322 | ############################--PPS HEATMAP--#############################
323 | # def pps_heatmap(feature):
324 | # pps_tab=pd.crosstab(df.team_name, df[feature], values=df.pps, aggfunc='mean',margins=False).fillna(0)
325 | #
326 | # team_heatmap = go.Heatmap(z=[np.array((pps_tab[pps_tab.index==pps_tab.index[i]])) for i in range(len(pps_tab.index))],
327 | # x=pps_tab.columns, y= [team.split(' ')[-1] for team in pps_tab.index]
328 | # )
329 | #
330 | # layout = go.Layout(
331 | # title='Points Per Shot Heatmap',
332 | # xaxis = dict(ticks='', nticks=len(pps_tab.columns), automargin=True),
333 | # yaxis = dict(ticks='', nticks=len(pps_tab.index), automargin=True),
334 | # )
335 | #
336 | # fig = go.Figure(data=[team_heatmap], layout=layout)
337 | # plotly.offline.iplot(fig, filename='pps-heatmap')
def pps_heatmap_sns(feature):
    """Show a seaborn heatmap of mean ``pps`` by *feature* (rows) and team (columns).

    Cells with no shots are filled with 0 before plotting.

    Args:
        feature: name of the ``df`` column used for the heatmap rows.
    """
    mean_pps = pd.crosstab(
        index=df[feature],
        columns=df.team_name,
        values=df.pps,
        aggfunc='mean',
        margins=False,
    )
    mean_pps = mean_pps.fillna(0)

    plt.figure(figsize=(15, 6))
    sns.heatmap(mean_pps, annot=False, robust=True)
    plt.show()
344 |
345 | ########################--FREQUENCY BAR PLOT--########################
def freq_bar_plots(feature, round_=False):
    """Plot stacked bars of made and missed shot counts per value of *feature*.

    Each bar segment is annotated with the make (or miss) percentage for
    that value of *feature*.

    Args:
        feature: name of the ``df`` column to break the shots down by.
        round_: when True, the feature values are rounded before counting.
    """
    df_ = df.copy()
    if round_ == True:
        df_[feature] = round(df_[feature])

    feat_tab = pd.crosstab(df_[feature], df_.shot_made_flag, margins=True)
    feat_tab['fg_pct'] = round(feat_tab[1] / feat_tab['All'], 3)

    # drop the 'All' column and the trailing 'All' margin row
    tab = feat_tab.drop(columns='All')[:-1]
    make_text = [str(round(pct * 100, 1)) + '%' for pct in tab.fg_pct]
    miss_text = [str(round((1 - pct) * 100, 1)) + '%' for pct in tab.fg_pct]

    def stacked_bar(counts, label, annotations, font_size, colour):
        """Build one segment series of the stacked bar chart."""
        return go.Bar(
            x=tab.index,
            y=counts,
            name=label,
            text=annotations,
            textposition='inside',
            textfont=dict(family='sans serif', size=font_size, color='white'),
            marker=dict(color=colour),
            opacity=0.75,
        )

    trace1 = stacked_bar(tab[1], 'Makes', make_text, 12, 'red')
    trace2 = stacked_bar(tab[0], 'Misses', miss_text, 10, 'blue')

    # Built but not added to the figure (see the data list below).
    line = go.Scatter(
        x=tab.index,
        y=tab[1],
        mode='markers+lines',
        name='# Makes',
        hoverinfo='skip',
        line=dict(color='black', width=.75),
    )

    axis_title = feature.replace('_', ' ').title()
    data = [trace1, trace2]#, line]
    layout = go.Layout(
        barmode='stack',
        title='FG% by ' + feature.title().replace('_', ' '),
        showlegend=True,
        xaxis=dict(
            automargin=True,
            autorange=True,
            ticks='',
            showticklabels=True,
            #tickangle=25,
            title=axis_title,
        ),
        yaxis=dict(
            automargin=True,
            ticks='',
            showticklabels=True,
            title='# of Shots',
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
416 |
417 | #########################--PERCENTAGE BAR CHART--##########################
def pct_bar_plots(feature, round_=False, player=None, team=None):
    """Plot stacked bars of make / miss percentage per value of *feature*.

    The shots can be restricted to one player or one team; *player* takes
    precedence when both are given. The crosstab's ``All`` margin row is
    plotted as a final bar.

    Fixes relative to the earlier version:

    * ``round_=True`` now takes effect. The rounded copy used to be
      replaced by the unrounded ``df`` in the player/team/else branches.
    * *player* / *team* are matched exactly first and only fall back to
      ``str.title()`` when there is no exact match, so names such as
      'LeBron James' or 'JJ Barea' are no longer mangled.

    Args:
        feature: name of the ``df`` column to break the shots down by.
        round_: when True, the feature values are rounded before counting.
        player: optional player name to filter on.
        team: optional team name to filter on.
    """
    df_ = df
    if round_:
        df_ = df.copy()
        df_[feature] = round(df_[feature])

    feature_title = feature.title().replace('_', ' ')
    if player:
        shooter = player if (df_.name == player).any() else player.title()
        df_ = df_[df_.name == shooter]
        title = shooter + ' - FG% by ' + feature_title
    elif team:
        team_name = team if (df_.team_name == team).any() else team.title()
        df_ = df_[df_.team_name == team_name]
        title = team_name + ' - FG% by ' + feature_title
    else:
        title = 'FG% by ' + feature_title

    c_tab = pd.crosstab(df_[feature], df_.shot_made_flag, margins=True)
    c_tab['pct_made'] = c_tab[1] / c_tab.All
    c_tab['pct_missed'] = 1 - c_tab.pct_made

    made_text = [str(round(t * 100, 1)) + '%' for t in c_tab.pct_made]
    missed_text = [str(round(t * 100, 1)) + '%' for t in c_tab.pct_missed]

    def pct_bar(values, label, annotations, colour):
        """Build one segment series of the stacked percentage chart."""
        return go.Bar(
            x=c_tab.index,
            y=values,
            name=label,
            text=annotations,
            textposition='auto',
            textfont=dict(family='sans serif', size=12, color='white'),
            marker=dict(color=colour),
            opacity=0.75,
        )

    data = [
        pct_bar(c_tab.pct_made, 'Makes', made_text, 'red'),
        pct_bar(c_tab.pct_missed, 'Misses', missed_text, 'blue'),
    ]
    layout = go.Layout(
        barmode='stack',
        title=title,
        showlegend=True,
        xaxis=dict(
            automargin=True,
            autorange=True,
            ticks='',
            showticklabels=True,
            title=feature.replace('_', ' ').title(),
        ),
        yaxis=dict(
            automargin=True,
            ticks='',
            showticklabels=True,
            title='FG %',
        ),
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
492 |
493 |
494 | ###########################--SHOT RECOMMENDER--###########################
def player_pps(name):
    """Summarise one player's points per shot (PPS) and shot share by zone.

    Reads the module-level shot DataFrame ``df`` (columns ``name``, ``zone``
    and ``pps`` are used).

    Args:
        name: Player name, matched exactly against ``df.name``.

    Returns:
        DataFrame indexed by zone with columns ``pps`` (mean PPS in the
        zone), ``count_`` (number of shots) and ``freq_pct`` (share of the
        player's shots), ordered by ``freq_pct`` descending and limited to
        zones holding at least 5% of the player's shots. An empty frame with
        those columns is returned when no rows match ``name``.
    """
    player = df[df.name == name]
    if player.empty:
        # Unknown player: previously this surfaced as a bare IndexError.
        return pd.DataFrame(columns=['pps', 'count_', 'freq_pct'])

    # All remaining rows share the same name, so the single crosstab column
    # is labelled ``name``; rename it to something stable.
    pps_tab = pd.crosstab(player.zone, player.name,
                          values=player.pps, aggfunc='mean',
                          margins=False).fillna(0).rename(
                              columns={name: 'pps'})

    # Name the counts explicitly: value_counts() labels its result after the
    # source column on older pandas but 'count' on pandas >= 2.0, so renaming
    # a 'zone' column is not portable.
    zone_counts = player.zone.value_counts().rename('count_')

    pps_freq = pd.concat([pps_tab, zone_counts],
                         axis=1).sort_values(by='pps',
                                             ascending=False)

    pps_freq['freq_pct'] = pps_freq.count_ / pps_freq.count_.sum()

    # Final ordering is by shot share; the earlier pps sort is kept as-is.
    pps_freq = pps_freq.sort_values('freq_pct', ascending=False)

    return pps_freq[pps_freq.freq_pct >= .05]
514 |
def pps_zone_percentiles(name):
    """Return quantiles of per-player mean PPS for every zone.

    Uses the module-level shot DataFrame ``df``. ``name`` is accepted for
    interface compatibility but is not used by the computation.

    Returns:
        DataFrame with one row per quantile level (0.1, 0.3, 0.5, 0.7, 0.9)
        and one column per zone.
    """
    # Rows: players, columns: zones, cells: mean PPS (0 where no shots).
    mean_pps_by_player = pd.crosstab(
        df.name, df.zone, df.pps, aggfunc='mean'
    ).fillna(0)

    # Round-trip through tenths so the levels are clean floats usable
    # as index labels.
    quantile_levels = np.round(np.arange(.1, 1, .2) * 10) / 10

    return mean_pps_by_player.quantile(quantile_levels)
519 |
def shot_recommender(name):
    """Print the zones a player should shoot from more or less often.

    A zone from ``player_pps(name)`` is recommended "more" when the player's
    PPS there is above the 0.7 quantile from ``pps_zone_percentiles`` for
    that zone, and "less" when it is below the 0.5 quantile. The zone
    'C_Restricted_Area' is never put on the "less" list.

    Args:
        name: Player name passed through to the helper functions.
    """
    player_zones = player_pps(name)
    league_quantiles = pps_zone_percentiles(name)

    shoot_more = []
    shoot_less = []

    for zone in player_zones.index:
        zone_pps = player_zones.loc[zone].pps
        if zone_pps > league_quantiles[zone].loc[.7]:
            shoot_more.append(zone)
        elif (zone_pps < league_quantiles[zone].loc[.5]
              and zone != 'C_Restricted_Area'):
            shoot_less.append(zone)

    if shoot_more:
        print(name + ' should shoot in the following zones more frequently:')
        for zone in shoot_more:
            print(' - ' + zone)
    if shoot_less:
        print(name + ' should shoot in the following zones less frequently:')
        for zone in shoot_less:
            print(' - ' + zone)
541 |
--------------------------------------------------------------------------------
/Data-Exploration.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 583,
6 | "metadata": {
7 | "extensions": {
8 | "jupyter_dashboards": {
9 | "version": 1,
10 | "views": {
11 | "grid_default": {
12 | "hidden": true
13 | },
14 | "report_default": {
15 | "hidden": true
16 | }
17 | }
18 | }
19 | },
20 | "scrolled": true
21 | },
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/html": [
26 | ""
27 | ],
28 | "text/vnd.plotly.v1+html": [
29 | ""
30 | ]
31 | },
32 | "metadata": {},
33 | "output_type": "display_data"
34 | }
35 | ],
36 | "source": [
37 | "import matplotlib\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import seaborn as sns\n",
40 | "import numpy as np\n",
41 | "import pandas as pd\n",
42 | "pd.set_option('display.max_columns',100)\n",
43 | "\n",
44 | "import plotly\n",
45 | "import plotly.plotly as py\n",
46 | "import plotly.graph_objs as go\n",
47 | "plotly.offline.init_notebook_mode(connected=True)\n",
48 | "\n",
49 | "import warnings\n",
50 | "warnings.filterwarnings('ignore')\n",
51 | "from court import court_shapes\n",
52 | "\n",
53 | "import ipywidgets as widgets\n",
54 | "from ipywidgets import interact\n",
55 | "\n",
56 | "import itertools, math, time"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 589,
62 | "metadata": {
63 | "extensions": {
64 | "jupyter_dashboards": {
65 | "version": 1,
66 | "views": {
67 | "grid_default": {
68 | "hidden": true
69 | },
70 | "report_default": {
71 | "hidden": true
72 | }
73 | }
74 | }
75 | },
76 | "scrolled": true
77 | },
78 | "outputs": [],
79 | "source": [
80 | "oct_nov_ = pd.read_csv('./data/nba_savant/oct-nov-14-15.csv')\n",
81 | "dec_ = pd.read_csv('./data/nba_savant/dec-14-15.csv')\n",
82 | "jan_ = pd.read_csv('./data/nba_savant/jan-14-15.csv')\n",
83 | "feb_ = pd.read_csv('./data/nba_savant/feb-14-15.csv')\n",
84 | "mar_ = pd.read_csv('./data/nba_savant/mar-14-15.csv')\n",
85 | "apr_ = pd.read_csv('./data/nba_savant/apr-14-15.csv')"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 590,
91 | "metadata": {
92 | "extensions": {
93 | "jupyter_dashboards": {
94 | "version": 1,
95 | "views": {
96 | "grid_default": {
97 | "hidden": true
98 | },
99 | "report_default": {
100 | "hidden": true
101 | }
102 | }
103 | }
104 | },
105 | "scrolled": true
106 | },
107 | "outputs": [],
108 | "source": [
109 | "df = pd.concat([oct_nov_,dec_,jan_,feb_,mar_,apr_])\n",
110 | "#reverse x values to plot correctly\n",
111 | "df.x = -df.x\n",
112 | "df.game_date = pd.to_datetime(df.game_date)\n",
113 | "df = df.reset_index(drop=True)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 591,
119 | "metadata": {
120 | "extensions": {
121 | "jupyter_dashboards": {
122 | "version": 1,
123 | "views": {
124 | "grid_default": {
125 | "hidden": true
126 | },
127 | "report_default": {
128 | "hidden": true
129 | }
130 | }
131 | }
132 | },
133 | "scrolled": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "stats = pd.read_excel('./data/adv-stats-14-15.xlsx',index_col=0)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 592,
143 | "metadata": {
144 | "extensions": {
145 | "jupyter_dashboards": {
146 | "version": 1,
147 | "views": {
148 | "grid_default": {
149 | "hidden": true
150 | },
151 | "report_default": {
152 | "hidden": true
153 | }
154 | }
155 | }
156 | },
157 | "scrolled": true
158 | },
159 | "outputs": [],
160 | "source": [
161 | "# Clean up name discrepancies between two dfs\n",
162 | "import re\n",
163 | "stats.Player = stats.Player.apply(lambda x: re.sub(r'([^\\s\\w]|_)+', '', x))\n",
164 | "df.name[df.name=='Jose Juan Barea'] = 'JJ Barea'\n",
165 | "df.name[df.name=='Tim Hardaway Jr'] = 'Tim Hardaway'\n",
166 | "df.name[df.name=='Charles Hayes'] = 'Chuck Hayes'\n",
167 | "df.name[df.name=='Glen Rice Jr'] = 'Glen Rice'\n",
168 | "df.name[df.name=='Louis Williams'] = 'Lou Williams'\n",
169 | "\n",
170 | "stats.Player[stats.Player=='Nene Hilario'] = 'Nene'\n",
171 | "stats.Player[stats.Player=='Jeffery Taylor'] = 'Jeff Taylor'\n",
172 | "stats.Player[stats.Player== 'Luigi Datome'] = 'Gigi Datome'"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 593,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# #convert defender name to first name last name format\n",
182 | "# df.defender_name[df.defender_name.isnull()] = 'None'\n",
183 | "\n",
184 | "# def convert_defender_names(player):\n",
185 | "# if player =='None':\n",
186 | "# return 'None'\n",
187 | "# elif player=='Nene':\n",
188 | "# return 'Nene'\n",
189 | "# else:\n",
190 | "# name = player.split(', ')\n",
191 | "# full_name = ' '.join((name[1],name[0]))\n",
192 | "# return re.sub(r'([^\\s\\w]|_)+', '', full_name)\n",
193 | " \n",
194 | "# df.defender_name = df.defender_name.apply(convert_defender_names)\n",
195 | "\n",
196 | "# # Clean up name discrepancies between two dfs\n",
197 | "# df.defender_name[df.defender_name=='Jose Juan Barea'] = 'JJ Barea'\n",
198 | "# df.defender_name[df.defender_name=='Tim Hardaway Jr'] = 'Tim Hardaway'\n",
199 | "# df.defender_name[df.defender_name=='Charles Hayes'] = 'Chuck Hayes'\n",
200 | "# df.defender_name[df.defender_name=='Glen Rice Jr'] = 'Glen Rice'\n",
201 | "# df.defender_name[df.defender_name=='Louis Williams'] = 'Lou Williams'"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 594,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "# #map player ids to new df column matching to defender name\n",
211 | "# player_ids_df = df[['name','player_id']].rename(columns={'name':'defender_name','player_id':'defender_id'})\n",
212 | "# player_ids_df = player_ids_df.groupby('defender_name').max()\n",
213 | "\n",
214 | "# df = df.merge(player_ids_df, on='defender_name')"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 595,
220 | "metadata": {
221 | "extensions": {
222 | "jupyter_dashboards": {
223 | "version": 1,
224 | "views": {
225 | "grid_default": {
226 | "hidden": true
227 | },
228 | "report_default": {
229 | "hidden": true
230 | }
231 | }
232 | }
233 | },
234 | "scrolled": true
235 | },
236 | "outputs": [],
237 | "source": [
238 | "df.shot_type = np.where(df.shot_type=='2PT Field Goal', 2, 3)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 596,
244 | "metadata": {
245 | "extensions": {
246 | "jupyter_dashboards": {
247 | "version": 1,
248 | "views": {
249 | "grid_default": {
250 | "hidden": true
251 | },
252 | "report_default": {}
253 | }
254 | }
255 | },
256 | "scrolled": true
257 | },
258 | "outputs": [],
259 | "source": [
260 | "def get_shot_distance(x,y):\n",
261 | " x_squared=x**2\n",
262 | " y_squared=y**2\n",
263 | " shot_distance = math.sqrt(x_squared + y_squared) / 10 # unit for distance is off by factor of 10, divide by 10 to convert to feet\n",
264 | " return round(shot_distance, 1)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 598,
270 | "metadata": {
271 | "extensions": {
272 | "jupyter_dashboards": {
273 | "version": 1,
274 | "views": {
275 | "grid_default": {
276 | "hidden": true
277 | },
278 | "report_default": {}
279 | }
280 | }
281 | },
282 | "scrolled": true
283 | },
284 | "outputs": [],
285 | "source": [
286 | "def get_shot_zone(row):\n",
287 | " x = row.x\n",
288 | " y = row.y\n",
289 | " \n",
290 | " shot_zone = ''\n",
291 | " shot_area = ''\n",
292 | " \n",
293 | " #restricted area, shots within 4ft of hoop\n",
294 | " if get_shot_distance(x,y)<=4:\n",
295 | " shot_zone = 'Restricted Area'\n",
296 | " \n",
297 | " #above-the-break 3 pointers\n",
298 | " elif (get_shot_distance(x,y)>=23.9) & (y>=92.5):\n",
299 | " shot_zone = 'Above Break 3'\n",
300 | " #corner 3s \n",
301 | " elif (y<92.5) & ((x<=-220) | (x>=220)):\n",
302 | " shot_zone = 'Corner 3'\n",
303 | " #in the paint shots excluding restricted area \n",
304 | " elif (-80<=x<=80) & (-47.5<=y<=143.5) & (get_shot_distance(x,y)>4):\n",
305 | " shot_zone = 'Paint'\n",
306 | " #mid range shots, left and right side\n",
307 | " elif (get_shot_distance(x,y)<23.9) & ((-22035:\n",
315 | " shot_zone = 'Heave'\n",
316 | " \n",
317 | " #Get area of court (left, right, or center)\n",
318 | " if shot_zone !='Paint':\n",
319 | " if (x <= 80) & (x>=-80):\n",
320 | " shot_area = 'C'\n",
321 | " elif (x>80):\n",
322 | " shot_area = 'L'\n",
323 | " else:\n",
324 | " shot_area = 'R' \n",
325 | " #for shots in paint, they have special designation for left, right, and center\n",
326 | " else:\n",
327 | " if x>40:\n",
328 | " shot_area = 'L'\n",
329 | " elif x<-40:\n",
330 | " shot_area = 'R'\n",
331 | " else:\n",
332 | " shot_area = 'C'\n",
333 | " return shot_zone, shot_area"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 599,
339 | "metadata": {
340 | "extensions": {
341 | "jupyter_dashboards": {
342 | "version": 1,
343 | "views": {
344 | "grid_default": {
345 | "hidden": true
346 | },
347 | "report_default": {}
348 | }
349 | }
350 | },
351 | "scrolled": true
352 | },
353 | "outputs": [],
354 | "source": [
355 | "def add_shot_zones_area_to_df(df):\n",
356 | " shot_zones = []\n",
357 | " shot_areas = []\n",
358 | "\n",
359 | " for index, row in df.iterrows():\n",
360 | " shot_zones.append(get_shot_zone(row)[0])\n",
361 | " shot_areas.append(get_shot_zone(row)[1])\n",
362 | "\n",
363 | " df['shot_zone'] = shot_zones\n",
364 | " df['shot_area'] = shot_areas\n",
365 | "\n",
366 | "add_shot_zones_area_to_df(df) "
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 600,
372 | "metadata": {
373 | "extensions": {
374 | "jupyter_dashboards": {
375 | "version": 1,
376 | "views": {
377 | "grid_default": {
378 | "col": 0,
379 | "height": 10,
380 | "hidden": false,
381 | "row": 61,
382 | "width": 12
383 | },
384 | "report_default": {}
385 | }
386 | }
387 | },
388 | "scrolled": true
389 | },
390 | "outputs": [
391 | {
392 | "data": {
393 | "text/html": [
394 | "\n",
395 | "\n",
408 | "
\n",
409 | " \n",
410 | " \n",
411 | " | \n",
412 | " name | \n",
413 | " team_name | \n",
414 | " game_date | \n",
415 | " season | \n",
416 | " espn_player_id | \n",
417 | " team_id | \n",
418 | " espn_game_id | \n",
419 | " period | \n",
420 | " minutes_remaining | \n",
421 | " seconds_remaining | \n",
422 | " shot_made_flag | \n",
423 | " action_type | \n",
424 | " shot_type | \n",
425 | " shot_distance | \n",
426 | " opponent | \n",
427 | " x | \n",
428 | " y | \n",
429 | " dribbles | \n",
430 | " touch_time | \n",
431 | " defender_name | \n",
432 | " defender_distance | \n",
433 | " shot_clock | \n",
434 | " shot_zone | \n",
435 | " shot_area | \n",
436 | "
\n",
437 | " \n",
438 | " \n",
439 | " \n",
440 | " | 205545 | \n",
441 | " Evan Turner | \n",
442 | " Boston Celtics | \n",
443 | " 2015-04-08 | \n",
444 | " 2014 | \n",
445 | " 4239.0 | \n",
446 | " 1610612738 | \n",
447 | " 400579456.0 | \n",
448 | " 1 | \n",
449 | " 10 | \n",
450 | " 29 | \n",
451 | " 1 | \n",
452 | " Turnaround Jump Shot | \n",
453 | " 2 | \n",
454 | " 13 | \n",
455 | " Detroit Pistons | \n",
456 | " 114 | \n",
457 | " 64 | \n",
458 | " 5 | \n",
459 | " 4.3 | \n",
460 | " Monroe, Greg | \n",
461 | " 4.9 | \n",
462 | " 8.0 | \n",
463 | " Mid Range | \n",
464 | " L | \n",
465 | "
\n",
466 | " \n",
467 | " | 205546 | \n",
468 | " PJ Tucker | \n",
469 | " Phoenix Suns | \n",
470 | " 2015-04-08 | \n",
471 | " 2014 | \n",
472 | " 3033.0 | \n",
473 | " 1610612756 | \n",
474 | " 400579463.0 | \n",
475 | " 1 | \n",
476 | " 9 | \n",
477 | " 23 | \n",
478 | " 0 | \n",
479 | " Turnaround Jump Shot | \n",
480 | " 2 | \n",
481 | " 7 | \n",
482 | " Dallas Mavericks | \n",
483 | " -73 | \n",
484 | " 26 | \n",
485 | " 1 | \n",
486 | " 1.6 | \n",
487 | " Rondo, Rajon | \n",
488 | " 2.9 | \n",
489 | " 17.7 | \n",
490 | " Paint | \n",
491 | " R | \n",
492 | "
\n",
493 | " \n",
494 | " | 205547 | \n",
495 | " Dion Waiters | \n",
496 | " Oklahoma City Thunder | \n",
497 | " 2015-04-01 | \n",
498 | " 2014 | \n",
499 | " 6628.0 | \n",
500 | " 1610612760 | \n",
501 | " NaN | \n",
502 | " 1 | \n",
503 | " 10 | \n",
504 | " 37 | \n",
505 | " 0 | \n",
506 | " Turnaround Jump Shot | \n",
507 | " 2 | \n",
508 | " 6 | \n",
509 | " Dallas Mavericks | \n",
510 | " -67 | \n",
511 | " -2 | \n",
512 | " 3 | \n",
513 | " 5.1 | \n",
514 | " Nowitzki, Dirk | \n",
515 | " 2.5 | \n",
516 | " 11.2 | \n",
517 | " Paint | \n",
518 | " R | \n",
519 | "
\n",
520 | " \n",
521 | " | 205548 | \n",
522 | " Dante Exum | \n",
523 | " Utah Jazz | \n",
524 | " 2015-04-08 | \n",
525 | " 2014 | \n",
526 | " 3102528.0 | \n",
527 | " 1610612762 | \n",
528 | " 400579462.0 | \n",
529 | " 1 | \n",
530 | " 2 | \n",
531 | " 58 | \n",
532 | " 0 | \n",
533 | " Turnaround Jump Shot | \n",
534 | " 2 | \n",
535 | " 8 | \n",
536 | " Sacramento Kings | \n",
537 | " 71 | \n",
538 | " 48 | \n",
539 | " 4 | \n",
540 | " 5.5 | \n",
541 | " Landry, Carl | \n",
542 | " 4.4 | \n",
543 | " 14.3 | \n",
544 | " Paint | \n",
545 | " L | \n",
546 | "
\n",
547 | " \n",
548 | " | 205549 | \n",
549 | " Jason Smith | \n",
550 | " New York Knicks | \n",
551 | " 2015-04-08 | \n",
552 | " 2014 | \n",
553 | " 3232.0 | \n",
554 | " 1610612752 | \n",
555 | " 400579457.0 | \n",
556 | " 2 | \n",
557 | " 3 | \n",
558 | " 32 | \n",
559 | " 1 | \n",
560 | " Turnaround Jump Shot | \n",
561 | " 2 | \n",
562 | " 7 | \n",
563 | " Indiana Pacers | \n",
564 | " 73 | \n",
565 | " -24 | \n",
566 | " 4 | \n",
567 | " 5.1 | \n",
568 | " Allen, Lavoy | \n",
569 | " 4.7 | \n",
570 | " 3.7 | \n",
571 | " Paint | \n",
572 | " L | \n",
573 | "
\n",
574 | " \n",
575 | "
\n",
576 | "
"
577 | ],
578 | "text/plain": [
579 | " name team_name game_date season \\\n",
580 | "205545 Evan Turner Boston Celtics 2015-04-08 2014 \n",
581 | "205546 PJ Tucker Phoenix Suns 2015-04-08 2014 \n",
582 | "205547 Dion Waiters Oklahoma City Thunder 2015-04-01 2014 \n",
583 | "205548 Dante Exum Utah Jazz 2015-04-08 2014 \n",
584 | "205549 Jason Smith New York Knicks 2015-04-08 2014 \n",
585 | "\n",
586 | " espn_player_id team_id espn_game_id period minutes_remaining \\\n",
587 | "205545 4239.0 1610612738 400579456.0 1 10 \n",
588 | "205546 3033.0 1610612756 400579463.0 1 9 \n",
589 | "205547 6628.0 1610612760 NaN 1 10 \n",
590 | "205548 3102528.0 1610612762 400579462.0 1 2 \n",
591 | "205549 3232.0 1610612752 400579457.0 2 3 \n",
592 | "\n",
593 | " seconds_remaining shot_made_flag action_type shot_type \\\n",
594 | "205545 29 1 Turnaround Jump Shot 2 \n",
595 | "205546 23 0 Turnaround Jump Shot 2 \n",
596 | "205547 37 0 Turnaround Jump Shot 2 \n",
597 | "205548 58 0 Turnaround Jump Shot 2 \n",
598 | "205549 32 1 Turnaround Jump Shot 2 \n",
599 | "\n",
600 | " shot_distance opponent x y dribbles touch_time \\\n",
601 | "205545 13 Detroit Pistons 114 64 5 4.3 \n",
602 | "205546 7 Dallas Mavericks -73 26 1 1.6 \n",
603 | "205547 6 Dallas Mavericks -67 -2 3 5.1 \n",
604 | "205548 8 Sacramento Kings 71 48 4 5.5 \n",
605 | "205549 7 Indiana Pacers 73 -24 4 5.1 \n",
606 | "\n",
607 | " defender_name defender_distance shot_clock shot_zone shot_area \n",
608 | "205545 Monroe, Greg 4.9 8.0 Mid Range L \n",
609 | "205546 Rondo, Rajon 2.9 17.7 Paint R \n",
610 | "205547 Nowitzki, Dirk 2.5 11.2 Paint R \n",
611 | "205548 Landry, Carl 4.4 14.3 Paint L \n",
612 | "205549 Allen, Lavoy 4.7 3.7 Paint L "
613 | ]
614 | },
615 | "execution_count": 600,
616 | "metadata": {},
617 | "output_type": "execute_result"
618 | }
619 | ],
620 | "source": [
621 | "df.tail()"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 601,
627 | "metadata": {
628 | "extensions": {
629 | "jupyter_dashboards": {
630 | "version": 1,
631 | "views": {
632 | "grid_default": {
633 | "hidden": true
634 | },
635 | "report_default": {}
636 | }
637 | }
638 | },
639 | "scrolled": true
640 | },
641 | "outputs": [],
642 | "source": [
643 | "def get_lg_avgs(shot_zone_area_tup, df):\n",
644 | " sz = shot_zone_area_tup[0]\n",
645 | " sa = shot_zone_area_tup[1]\n",
646 | " shots_made = len(df[(df.shot_zone==sz) & (df.shot_area==sa) & (df.shot_made_flag==1)])\n",
647 | " total_shots = len(df[(df.shot_zone==sz) & (df.shot_area==sa)])\n",
648 | " if total_shots ==0:\n",
649 | " make_pct = 0\n",
650 | " else:\n",
651 | " make_pct = round((shots_made / total_shots),4)\n",
652 | " return make_pct"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 602,
658 | "metadata": {
659 | "extensions": {
660 | "jupyter_dashboards": {
661 | "version": 1,
662 | "views": {
663 | "grid_default": {
664 | "hidden": true
665 | },
666 | "report_default": {}
667 | }
668 | }
669 | },
670 | "scrolled": true
671 | },
672 | "outputs": [],
673 | "source": [
674 | "sz = set(shot_zones)\n",
675 | "sa = set(shot_areas)\n",
676 | "sza_tups = list(itertools.product(sz,sa))\n",
677 | "\n",
678 | "sza_dict = {}\n",
679 | "for sza in sza_tups:\n",
680 | " sza_dict[sza] = get_lg_avgs(sza, df)"
681 | ]
682 | },
683 | {
684 | "cell_type": "code",
685 | "execution_count": 603,
686 | "metadata": {
687 | "scrolled": true
688 | },
689 | "outputs": [],
690 | "source": [
691 | "def add_lg_avg_to_df(df):\n",
692 | " df['lg_avg']=0\n",
693 | " for k,v in sza_dict.items():\n",
694 | " df['lg_avg'] = np.where((df.shot_zone==k[0]) & (df.shot_area==k[1]), v, df['lg_avg'])"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 604,
700 | "metadata": {
701 | "scrolled": true
702 | },
703 | "outputs": [],
704 | "source": [
705 | "add_lg_avg_to_df(df)"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": 607,
711 | "metadata": {
712 | "extensions": {
713 | "jupyter_dashboards": {
714 | "version": 1,
715 | "views": {
716 | "grid_default": {
717 | "hidden": true
718 | },
719 | "report_default": {}
720 | }
721 | }
722 | },
723 | "scrolled": true
724 | },
725 | "outputs": [],
726 | "source": [
727 | "def create_team_ids(df):\n",
728 | " team_id_dict = {}\n",
729 | " for id_, team in enumerate(list(set(df.team_name))):\n",
730 | " team_id_dict[team]=id_+1\n",
731 | "\n",
732 | " df['opp_id']=0\n",
733 | " #get team ids from 1-30\n",
734 | " for k,v in team_id_dict.items():\n",
735 | " df['team_id'] = np.where(df.team_name==k, v, df['team_id'])\n",
736 | " df['opp_id'] = np.where(df.opponent==k, v, df['opp_id'])\n",
737 | " return team_id_dict\n",
738 | "\n",
739 | "create_team_ids(df)"
740 | ]
741 | },
742 | {
743 | "cell_type": "code",
744 | "execution_count": 608,
745 | "metadata": {
746 | "scrolled": true
747 | },
748 | "outputs": [],
749 | "source": [
750 | "#df.groupby(by=['game_date','team_id','opp_id']).mean()"
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": 609,
756 | "metadata": {},
757 | "outputs": [],
758 | "source": [
759 | "nba_shots = pd.read_csv('./data/shots_1415.csv',index_col=0)\n",
760 | "nba_shots.GAME_DATE = nba_shots.GAME_DATE.astype('str')\n",
761 | "\n",
762 | "#Adds dashes to date string so it can be converted to datetime format\n",
763 | "def add_dashes(string):\n",
764 | " date = string[:4] + '-' + string[4:6] + '-' + string[-2:]\n",
765 | " return date\n",
766 | "\n",
767 | "nba_shots.GAME_DATE = nba_shots.GAME_DATE.apply(lambda x: add_dashes(x))\n",
768 | "nba_shots.GAME_DATE = pd.to_datetime(nba_shots.GAME_DATE)\n",
769 | "nba_shots.LOC_X = -nba_shots.LOC_X"
770 | ]
771 | },
772 | {
773 | "cell_type": "markdown",
774 | "metadata": {},
775 | "source": [
776 | "### Merge Dataframes"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 610,
782 | "metadata": {},
783 | "outputs": [],
784 | "source": [
785 | "merged_df = df.merge(nba_shots, left_on=['team_name','game_date','period','minutes_remaining','seconds_remaining','x','y'],\n",
786 | " right_on=['TEAM_NAME','GAME_DATE','PERIOD','MINUTES_REMAINING','SECONDS_REMAINING','LOC_X','LOC_Y'])\n",
787 | "\n",
788 | "merged_df = merged_df.drop(columns=['GRID_TYPE','PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING',\n",
789 | " 'SECONDS_REMAINING','SHOT_DISTANCE','LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE',\n",
790 | " 'espn_player_id', 'espn_game_id', 'EVENT_TYPE','ACTION_TYPE', 'SHOT_TYPE','SHOT_ZONE_BASIC',\n",
791 | " 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE'])"
792 | ]
793 | },
794 | {
795 | "cell_type": "code",
796 | "execution_count": 611,
797 | "metadata": {
798 | "scrolled": true
799 | },
800 | "outputs": [],
801 | "source": [
802 | "#get dictionary matching team names to home and away team acronyms\n",
803 | "def create_home_acronym_dict():\n",
804 | " team_acronyms = sorted(list(merged_df.HTM.unique()))\n",
805 | " team_names = sorted(list(merged_df.team_name.unique()))\n",
806 | "\n",
807 | " team_name_ac_dict = dict(zip(team_names,team_acronyms))\n",
808 | " team_name_ac_dict['Boston Celtics'] = 'BOS'\n",
809 | " team_name_ac_dict['Brooklyn Nets'] = 'BKN'\n",
810 | " return team_name_ac_dict"
811 | ]
812 | },
813 | {
814 | "cell_type": "code",
815 | "execution_count": 612,
816 | "metadata": {
817 | "scrolled": false
818 | },
819 | "outputs": [
820 | {
821 | "name": "stdout",
822 | "output_type": "stream",
823 | "text": [
824 | "Runtime: 2.05 seconds. 205539 iterations to go.\n",
825 | "Runtime: 8.47 seconds. 155539 iterations to go.\n",
826 | "Runtime: 16.45 seconds. 105539 iterations to go.\n",
827 | "Runtime: 23.53 seconds. 55539 iterations to go.\n",
828 | "Runtime: 31.85 seconds. 5539 iterations to go.\n"
829 | ]
830 | }
831 | ],
832 | "source": [
833 | "def get_home_team():\n",
834 | " start = time.time()\n",
835 | " is_home_arr = []\n",
836 | "\n",
837 | " team_name_ac_dict=create_home_acronym_dict()\n",
838 | "\n",
839 | " for index, row in merged_df.iterrows():\n",
840 | " if team_name_ac_dict[row.team_name]==row.HTM:\n",
841 | " is_home_arr.append(1)\n",
842 | " else:\n",
843 | " is_home_arr.append(0)\n",
844 | " if index%50000==0:\n",
845 | " print('Runtime: {} seconds. {} iterations to go.'.format(round(time.time()-start,2), len(merged_df)-index))\n",
846 | " return is_home_arr\n",
847 | "\n",
848 | "merged_df['is_home'] = get_home_team()"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": 613,
854 | "metadata": {
855 | "scrolled": true
856 | },
857 | "outputs": [],
858 | "source": [
859 | "#sort the dataframe by date, game_id, player_name, and game_event_id\n",
860 | "sorted_df = merged_df.copy().sort_values(by=['game_date','GAME_ID','name','GAME_EVENT_ID']).reset_index(drop=True)\n",
861 | "\n",
862 | "#adds to dataframe whether player has hit previous 1, 2, or 3 shots\n",
863 | "def is_player_hot(dataframe):\n",
864 | " start=time.time()\n",
865 | "\n",
866 | " df = dataframe\n",
867 | " #create array that stores whether previous 1, 2, or 3 shots were made, respectively\n",
868 | " heat_check_array=np.zeros((len(df),3))\n",
869 | "\n",
870 | " for index, row in df.iterrows():\n",
871 | " if index==0:\n",
872 | " heat_check_array[index,:]+=[0,0,0]\n",
873 | " elif index==1:\n",
874 | " if (df.name[index]==df.name[index-1]) & (row.GAME_ID==df.GAME_ID[index-1]) & (df.shot_made_flag[index-1]==1):\n",
875 | " heat_check_array[index,:]+=[1,0,0]\n",
876 | " else:\n",
877 | " heat_check_array[index,:]+=[0,0,0]\n",
878 | " elif index==2:\n",
879 | " if (df.name[index]==df.name[index-1]) & (df.name[index]==df.name[index-2]) & (row.GAME_ID==df.GAME_ID[index-1]) & (df.shot_made_flag[index-1]==1) & (df.shot_made_flag[index-2]==1):\n",
880 | " heat_check_array[index,:]+=[1,1,0]\n",
881 | " elif (df.name[index]==df.name[index-1]) & (row.GAME_ID==df.GAME_ID[index-1]) & (df.shot_made_flag[index-1]==1) & (df.shot_made_flag[index-2]==0):\n",
882 | " heat_check_array[index,:]+=[1,0,0]\n",
883 | " else:\n",
884 | " heat_check_array[index,:]+=[0,0,0]\n",
885 | " else:\n",
886 | " if (df.name[index]==df.name[index-1]) & (df.name[index]==df.name[index-2]) & (df.name[index]==df.name[index-2]) & (row.GAME_ID==df.GAME_ID[index-1]) & (df.shot_made_flag[index-1]==1) & (df.shot_made_flag[index-2]==1) & (df.shot_made_flag[index-3]==1):\n",
887 | " heat_check_array[index,:]+=[1,1,1]\n",
888 | " elif (df.name[index]==df.name[index-1]) & (df.name[index]==df.name[index-2]) & (row.GAME_ID==df.GAME_ID[index-1]) & (df.shot_made_flag[index-1]==1) & (df.shot_made_flag[index-2]==1) & (df.shot_made_flag[index-3]==0):\n",
889 | " heat_check_array[index,:]+=[1,1,0]\n",
890 | " elif (df.name[index]==df.name[index-1]) & (row.GAME_ID==df.GAME_ID[index-1]) & (df.shot_made_flag[index-1]==1):\n",
891 | " heat_check_array[index,:]+=[1,0,0]\n",
892 | " else:\n",
893 | " heat_check_array[index,:]+=[0,0,0]\n",
894 | "\n",
895 | " if index%50000==0:\n",
896 | " print('Runtime: {} seconds. {} iterations remaining.'.format(round(time.time()-start,2),len(df)-index))\n",
897 | "\n",
898 | " return heat_check_array"
899 | ]
900 | },
901 | {
902 | "cell_type": "code",
903 | "execution_count": 614,
904 | "metadata": {
905 | "scrolled": true
906 | },
907 | "outputs": [
908 | {
909 | "name": "stdout",
910 | "output_type": "stream",
911 | "text": [
912 | "Runtime: 2.21 seconds. 205539 iterations remaining.\n",
913 | "Runtime: 38.93 seconds. 155539 iterations remaining.\n",
914 | "Runtime: 75.29 seconds. 105539 iterations remaining.\n",
915 | "Runtime: 117.37 seconds. 55539 iterations remaining.\n",
916 | "Runtime: 157.18 seconds. 5539 iterations remaining.\n"
917 | ]
918 | }
919 | ],
920 | "source": [
921 | "heat_check_array = is_player_hot(sorted_df)"
922 | ]
923 | },
924 | {
925 | "cell_type": "code",
926 | "execution_count": 632,
927 | "metadata": {},
928 | "outputs": [
929 | {
930 | "data": {
931 | "text/html": [
932 | "\n",
933 | "\n",
946 | "
\n",
947 | " \n",
948 | " \n",
949 | " | \n",
950 | " name | \n",
951 | " shot_made_flag | \n",
952 | " prev_shot_made | \n",
953 | " prev_2_made | \n",
954 | " prev_3_made | \n",
955 | " game_date | \n",
956 | " GAME_EVENT_ID | \n",
957 | "
\n",
958 | " \n",
959 | " \n",
960 | " \n",
961 | " | 210 | \n",
962 | " Cory Joseph | \n",
963 | " 1 | \n",
964 | " 0.0 | \n",
965 | " 0.0 | \n",
966 | " 0.0 | \n",
967 | " 2014-10-28 | \n",
968 | " 380 | \n",
969 | "
\n",
970 | " \n",
971 | " | 211 | \n",
972 | " Cory Joseph | \n",
973 | " 1 | \n",
974 | " 1.0 | \n",
975 | " 0.0 | \n",
976 | " 0.0 | \n",
977 | " 2014-10-28 | \n",
978 | " 387 | \n",
979 | "
\n",
980 | " \n",
981 | " | 212 | \n",
982 | " Danny Green | \n",
983 | " 0 | \n",
984 | " 0.0 | \n",
985 | " 0.0 | \n",
986 | " 0.0 | \n",
987 | " 2014-10-28 | \n",
988 | " 9 | \n",
989 | "
\n",
990 | " \n",
991 | " | 213 | \n",
992 | " Danny Green | \n",
993 | " 1 | \n",
994 | " 0.0 | \n",
995 | " 0.0 | \n",
996 | " 0.0 | \n",
997 | " 2014-10-28 | \n",
998 | " 15 | \n",
999 | "
\n",
1000 | " \n",
1001 | " | 214 | \n",
1002 | " Danny Green | \n",
1003 | " 1 | \n",
1004 | " 1.0 | \n",
1005 | " 0.0 | \n",
1006 | " 0.0 | \n",
1007 | " 2014-10-28 | \n",
1008 | " 102 | \n",
1009 | "
\n",
1010 | " \n",
1011 | " | 215 | \n",
1012 | " Danny Green | \n",
1013 | " 1 | \n",
1014 | " 1.0 | \n",
1015 | " 1.0 | \n",
1016 | " 0.0 | \n",
1017 | " 2014-10-28 | \n",
1018 | " 132 | \n",
1019 | "
\n",
1020 | " \n",
1021 | " | 216 | \n",
1022 | " Danny Green | \n",
1023 | " 0 | \n",
1024 | " 1.0 | \n",
1025 | " 1.0 | \n",
1026 | " 1.0 | \n",
1027 | " 2014-10-28 | \n",
1028 | " 150 | \n",
1029 | "
\n",
1030 | " \n",
1031 | " | 217 | \n",
1032 | " Danny Green | \n",
1033 | " 0 | \n",
1034 | " 0.0 | \n",
1035 | " 0.0 | \n",
1036 | " 0.0 | \n",
1037 | " 2014-10-28 | \n",
1038 | " 175 | \n",
1039 | "
\n",
1040 | " \n",
1041 | " | 218 | \n",
1042 | " Danny Green | \n",
1043 | " 1 | \n",
1044 | " 0.0 | \n",
1045 | " 0.0 | \n",
1046 | " 0.0 | \n",
1047 | " 2014-10-28 | \n",
1048 | " 259 | \n",
1049 | "
\n",
1050 | " \n",
1051 | " | 219 | \n",
1052 | " Danny Green | \n",
1053 | " 0 | \n",
1054 | " 1.0 | \n",
1055 | " 0.0 | \n",
1056 | " 0.0 | \n",
1057 | " 2014-10-28 | \n",
1058 | " 284 | \n",
1059 | "
\n",
1060 | " \n",
1061 | "
\n",
1062 | "
"
1063 | ],
1064 | "text/plain": [
1065 | " name shot_made_flag prev_shot_made prev_2_made prev_3_made \\\n",
1066 | "210 Cory Joseph 1 0.0 0.0 0.0 \n",
1067 | "211 Cory Joseph 1 1.0 0.0 0.0 \n",
1068 | "212 Danny Green 0 0.0 0.0 0.0 \n",
1069 | "213 Danny Green 1 0.0 0.0 0.0 \n",
1070 | "214 Danny Green 1 1.0 0.0 0.0 \n",
1071 | "215 Danny Green 1 1.0 1.0 0.0 \n",
1072 | "216 Danny Green 0 1.0 1.0 1.0 \n",
1073 | "217 Danny Green 0 0.0 0.0 0.0 \n",
1074 | "218 Danny Green 1 0.0 0.0 0.0 \n",
1075 | "219 Danny Green 0 1.0 0.0 0.0 \n",
1076 | "\n",
1077 | " game_date GAME_EVENT_ID \n",
1078 | "210 2014-10-28 380 \n",
1079 | "211 2014-10-28 387 \n",
1080 | "212 2014-10-28 9 \n",
1081 | "213 2014-10-28 15 \n",
1082 | "214 2014-10-28 102 \n",
1083 | "215 2014-10-28 132 \n",
1084 | "216 2014-10-28 150 \n",
1085 | "217 2014-10-28 175 \n",
1086 | "218 2014-10-28 259 \n",
1087 | "219 2014-10-28 284 "
1088 | ]
1089 | },
1090 | "execution_count": 632,
1091 | "metadata": {},
1092 | "output_type": "execute_result"
1093 | }
1094 | ],
1095 | "source": [
1096 | "#add heat check stats to dataframe\n",
1097 | "sorted_df['prev_shot_made'] = heat_check_array[:,0]\n",
1098 | "sorted_df['prev_2_made'] = heat_check_array[:,1]\n",
1099 | "sorted_df['prev_3_made'] = heat_check_array[:,2]\n",
1100 | "sorted_df[210:220][['name','shot_made_flag','prev_shot_made','prev_2_made','prev_3_made','game_date','GAME_EVENT_ID']]"
1101 | ]
1102 | },
1103 | {
1104 | "cell_type": "code",
1105 | "execution_count": 37,
1106 | "metadata": {
1107 | "scrolled": true
1108 | },
1109 | "outputs": [
1110 | {
1111 | "data": {
1112 | "text/html": [
1113 | "\n",
1114 | "\n",
1127 | "
\n",
1128 | " \n",
1129 | " \n",
1130 | " | \n",
1131 | " name | \n",
1132 | " team_name | \n",
1133 | " game_date | \n",
1134 | " season | \n",
1135 | " team_id | \n",
1136 | " period | \n",
1137 | " minutes_remaining | \n",
1138 | " seconds_remaining | \n",
1139 | " shot_made_flag | \n",
1140 | " action_type | \n",
1141 | " shot_type | \n",
1142 | " shot_distance | \n",
1143 | " opponent | \n",
1144 | " x | \n",
1145 | " y | \n",
1146 | " dribbles | \n",
1147 | " touch_time | \n",
1148 | " defender_name | \n",
1149 | " defender_distance | \n",
1150 | " shot_clock | \n",
1151 | " shot_zone | \n",
1152 | " shot_area | \n",
1153 | " lg_avg | \n",
1154 | " opp_id | \n",
1155 | " GAME_ID | \n",
1156 | " GAME_EVENT_ID | \n",
1157 | " PLAYER_ID | \n",
1158 | " HTM | \n",
1159 | " VTM | \n",
1160 | " is_home | \n",
1161 | " prev_shot_made | \n",
1162 | " prev_2_made | \n",
1163 | " prev_3_made | \n",
1164 | "
\n",
1165 | " \n",
1166 | " \n",
1167 | " \n",
1168 | " | 0 | \n",
1169 | " Aaron Gordon | \n",
1170 | " Orlando Magic | \n",
1171 | " 2014-10-28 | \n",
1172 | " 2014 | \n",
1173 | " 10 | \n",
1174 | " 2 | \n",
1175 | " 11 | \n",
1176 | " 34 | \n",
1177 | " 1 | \n",
1178 | " Jump Shot | \n",
1179 | " 2 | \n",
1180 | " 4 | \n",
1181 | " New Orleans Pelicans | \n",
1182 | " -10 | \n",
1183 | " 44 | \n",
1184 | " 4 | \n",
1185 | " 5.1 | \n",
1186 | " Anderson, Ryan | \n",
1187 | " 3.9 | \n",
1188 | " 0.6 | \n",
1189 | " Paint | \n",
1190 | " C | \n",
1191 | " 0.4011 | \n",
1192 | " 13 | \n",
1193 | " 21400001 | \n",
1194 | " 164 | \n",
1195 | " 203932 | \n",
1196 | " NOP | \n",
1197 | " ORL | \n",
1198 | " 0 | \n",
1199 | " 0 | \n",
1200 | " 0 | \n",
1201 | " 0 | \n",
1202 | "
\n",
1203 | " \n",
1204 | " | 1 | \n",
1205 | " Aaron Gordon | \n",
1206 | " Orlando Magic | \n",
1207 | " 2014-10-28 | \n",
1208 | " 2014 | \n",
1209 | " 10 | \n",
1210 | " 2 | \n",
1211 | " 9 | \n",
1212 | " 13 | \n",
1213 | " 1 | \n",
1214 | " Jump Shot | \n",
1215 | " 3 | \n",
1216 | " 23 | \n",
1217 | " New Orleans Pelicans | \n",
1218 | " -233 | \n",
1219 | " 20 | \n",
1220 | " 0 | \n",
1221 | " 0.7 | \n",
1222 | " Evans, Tyreke | \n",
1223 | " 4.3 | \n",
1224 | " 7.4 | \n",
1225 | " Corner 3 | \n",
1226 | " R | \n",
1227 | " 0.3915 | \n",
1228 | " 13 | \n",
1229 | " 21400001 | \n",
1230 | " 198 | \n",
1231 | " 203932 | \n",
1232 | " NOP | \n",
1233 | " ORL | \n",
1234 | " 0 | \n",
1235 | " 1 | \n",
1236 | " 0 | \n",
1237 | " 0 | \n",
1238 | "
\n",
1239 | " \n",
1240 | " | 2 | \n",
1241 | " Aaron Gordon | \n",
1242 | " Orlando Magic | \n",
1243 | " 2014-10-28 | \n",
1244 | " 2014 | \n",
1245 | " 10 | \n",
1246 | " 2 | \n",
1247 | " 2 | \n",
1248 | " 55 | \n",
1249 | " 0 | \n",
1250 | " Jump Shot | \n",
1251 | " 3 | \n",
1252 | " 23 | \n",
1253 | " New Orleans Pelicans | \n",
1254 | " -234 | \n",
1255 | " 0 | \n",
1256 | " 0 | \n",
1257 | " 0.9 | \n",
1258 | " Gordon, Eric | \n",
1259 | " 12.5 | \n",
1260 | " 14.8 | \n",
1261 | " Corner 3 | \n",
1262 | " R | \n",
1263 | " 0.3915 | \n",
1264 | " 13 | \n",
1265 | " 21400001 | \n",
1266 | " 275 | \n",
1267 | " 203932 | \n",
1268 | " NOP | \n",
1269 | " ORL | \n",
1270 | " 0 | \n",
1271 | " 1 | \n",
1272 | " 1 | \n",
1273 | " 0 | \n",
1274 | "
\n",
1275 | " \n",
1276 | " | 3 | \n",
1277 | " Aaron Gordon | \n",
1278 | " Orlando Magic | \n",
1279 | " 2014-10-28 | \n",
1280 | " 2014 | \n",
1281 | " 10 | \n",
1282 | " 3 | \n",
1283 | " 5 | \n",
1284 | " 1 | \n",
1285 | " 1 | \n",
1286 | " Jump Shot | \n",
1287 | " 2 | \n",
1288 | " 5 | \n",
1289 | " New Orleans Pelicans | \n",
1290 | " -9 | \n",
1291 | " 58 | \n",
1292 | " 2 | \n",
1293 | " 2.6 | \n",
1294 | " Asik, Omer | \n",
1295 | " 3.5 | \n",
1296 | " 8.3 | \n",
1297 | " Paint | \n",
1298 | " C | \n",
1299 | " 0.4011 | \n",
1300 | " 13 | \n",
1301 | " 21400001 | \n",
1302 | " 381 | \n",
1303 | " 203932 | \n",
1304 | " NOP | \n",
1305 | " ORL | \n",
1306 | " 0 | \n",
1307 | " 0 | \n",
1308 | " 0 | \n",
1309 | " 0 | \n",
1310 | "
\n",
1311 | " \n",
1312 | " | 4 | \n",
1313 | " Aaron Gordon | \n",
1314 | " Orlando Magic | \n",
1315 | " 2014-10-28 | \n",
1316 | " 2014 | \n",
1317 | " 10 | \n",
1318 | " 4 | \n",
1319 | " 5 | \n",
1320 | " 58 | \n",
1321 | " 0 | \n",
1322 | " Jump Shot | \n",
1323 | " 2 | \n",
1324 | " 11 | \n",
1325 | " New Orleans Pelicans | \n",
1326 | " 46 | \n",
1327 | " 105 | \n",
1328 | " 7 | \n",
1329 | " 6.2 | \n",
1330 | " Davis, Anthony | \n",
1331 | " 4.8 | \n",
1332 | " 1.5 | \n",
1333 | " Paint | \n",
1334 | " L | \n",
1335 | " 0.3841 | \n",
1336 | " 13 | \n",
1337 | " 21400001 | \n",
1338 | " 524 | \n",
1339 | " 203932 | \n",
1340 | " NOP | \n",
1341 | " ORL | \n",
1342 | " 0 | \n",
1343 | " 1 | \n",
1344 | " 0 | \n",
1345 | " 0 | \n",
1346 | "
\n",
1347 | " \n",
1348 | "
\n",
1349 | "
"
1350 | ],
1351 | "text/plain": [
1352 | " name team_name game_date season team_id period \\\n",
1353 | "0 Aaron Gordon Orlando Magic 2014-10-28 2014 10 2 \n",
1354 | "1 Aaron Gordon Orlando Magic 2014-10-28 2014 10 2 \n",
1355 | "2 Aaron Gordon Orlando Magic 2014-10-28 2014 10 2 \n",
1356 | "3 Aaron Gordon Orlando Magic 2014-10-28 2014 10 3 \n",
1357 | "4 Aaron Gordon Orlando Magic 2014-10-28 2014 10 4 \n",
1358 | "\n",
1359 | " minutes_remaining seconds_remaining shot_made_flag action_type \\\n",
1360 | "0 11 34 1 Jump Shot \n",
1361 | "1 9 13 1 Jump Shot \n",
1362 | "2 2 55 0 Jump Shot \n",
1363 | "3 5 1 1 Jump Shot \n",
1364 | "4 5 58 0 Jump Shot \n",
1365 | "\n",
1366 | " shot_type shot_distance opponent x y dribbles \\\n",
1367 | "0 2 4 New Orleans Pelicans -10 44 4 \n",
1368 | "1 3 23 New Orleans Pelicans -233 20 0 \n",
1369 | "2 3 23 New Orleans Pelicans -234 0 0 \n",
1370 | "3 2 5 New Orleans Pelicans -9 58 2 \n",
1371 | "4 2 11 New Orleans Pelicans 46 105 7 \n",
1372 | "\n",
1373 | " touch_time defender_name defender_distance shot_clock shot_zone \\\n",
1374 | "0 5.1 Anderson, Ryan 3.9 0.6 Paint \n",
1375 | "1 0.7 Evans, Tyreke 4.3 7.4 Corner 3 \n",
1376 | "2 0.9 Gordon, Eric 12.5 14.8 Corner 3 \n",
1377 | "3 2.6 Asik, Omer 3.5 8.3 Paint \n",
1378 | "4 6.2 Davis, Anthony 4.8 1.5 Paint \n",
1379 | "\n",
1380 | " shot_area lg_avg opp_id GAME_ID GAME_EVENT_ID PLAYER_ID HTM VTM \\\n",
1381 | "0 C 0.4011 13 21400001 164 203932 NOP ORL \n",
1382 | "1 R 0.3915 13 21400001 198 203932 NOP ORL \n",
1383 | "2 R 0.3915 13 21400001 275 203932 NOP ORL \n",
1384 | "3 C 0.4011 13 21400001 381 203932 NOP ORL \n",
1385 | "4 L 0.3841 13 21400001 524 203932 NOP ORL \n",
1386 | "\n",
1387 | " is_home prev_shot_made prev_2_made prev_3_made \n",
1388 | "0 0 0 0 0 \n",
1389 | "1 0 1 0 0 \n",
1390 | "2 0 1 1 0 \n",
1391 | "3 0 0 0 0 \n",
1392 | "4 0 1 0 0 "
1393 | ]
1394 | },
1395 | "execution_count": 37,
1396 | "metadata": {},
1397 | "output_type": "execute_result"
1398 | }
1399 | ],
1400 | "source": [
1401 | "sorted_df.head()"
1402 | ]
1403 | },
1404 | {
1405 | "cell_type": "code",
1406 | "execution_count": 39,
1407 | "metadata": {},
1408 | "outputs": [],
1409 | "source": [
1410 | "positions = stats[['Player','Pos','Age']]"
1411 | ]
1412 | },
1413 | {
1414 | "cell_type": "code",
1415 | "execution_count": 46,
1416 | "metadata": {},
1417 | "outputs": [],
1418 | "source": [
1419 | "sorted_df = sorted_df.merge(positions, left_on='name', right_on='Player').drop(columns=['Player'])\n",
1420 | "sorted_df.columns = map(str.lower, sorted_df.columns)"
1421 | ]
1422 | },
1423 | {
1424 | "cell_type": "code",
1425 | "execution_count": 55,
1426 | "metadata": {},
1427 | "outputs": [],
1428 | "source": [
1429 | "#rearrange columns for better visibility\n",
1430 | "sorted_df = sorted_df[['name','pos','age','player_id', 'team_name', 'team_id', 'game_date',\n",
1431 | " 'game_id', 'game_event_id','season', 'period',\n",
1432 | " 'minutes_remaining', 'seconds_remaining', 'shot_made_flag',\n",
1433 | " 'action_type', 'shot_type', 'shot_distance', 'x', 'y',\n",
1434 | " 'dribbles', 'touch_time', 'opponent', 'opp_id', 'defender_name', 'defender_distance',\n",
1435 | " 'shot_clock', 'shot_zone', 'shot_area', 'lg_avg','htm', 'vtm',\n",
1436 | " 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made']]"
1437 | ]
1438 | },
1439 | {
1440 | "cell_type": "code",
1441 | "execution_count": 58,
1442 | "metadata": {
1443 | "scrolled": true
1444 | },
1445 | "outputs": [
1446 | {
1447 | "data": {
1448 | "text/html": [
1449 | "\n",
1450 | "\n",
1463 | "
\n",
1464 | " \n",
1465 | " \n",
1466 | " | \n",
1467 | " name | \n",
1468 | " pos | \n",
1469 | " age | \n",
1470 | " player_id | \n",
1471 | " team_name | \n",
1472 | " team_id | \n",
1473 | " game_date | \n",
1474 | " game_id | \n",
1475 | " game_event_id | \n",
1476 | " season | \n",
1477 | " period | \n",
1478 | " minutes_remaining | \n",
1479 | " seconds_remaining | \n",
1480 | " shot_made_flag | \n",
1481 | " action_type | \n",
1482 | " shot_type | \n",
1483 | " shot_distance | \n",
1484 | " x | \n",
1485 | " y | \n",
1486 | " dribbles | \n",
1487 | " touch_time | \n",
1488 | " opponent | \n",
1489 | " opp_id | \n",
1490 | " defender_name | \n",
1491 | " defender_distance | \n",
1492 | " shot_clock | \n",
1493 | " shot_zone | \n",
1494 | " shot_area | \n",
1495 | " lg_avg | \n",
1496 | " htm | \n",
1497 | " vtm | \n",
1498 | " is_home | \n",
1499 | " prev_shot_made | \n",
1500 | " prev_2_made | \n",
1501 | " prev_3_made | \n",
1502 | "
\n",
1503 | " \n",
1504 | " \n",
1505 | " \n",
1506 | " | 205534 | \n",
1507 | " Vander Blue | \n",
1508 | " SG | \n",
1509 | " 22 | \n",
1510 | " 203505 | \n",
1511 | " Los Angeles Lakers | \n",
1512 | " 22 | \n",
1513 | " 2015-04-15 | \n",
1514 | " 21401230 | \n",
1515 | " 508 | \n",
1516 | " 2014 | \n",
1517 | " 4 | \n",
1518 | " 5 | \n",
1519 | " 25 | \n",
1520 | " 1 | \n",
1521 | " Turnaround Jump Shot | \n",
1522 | " 2 | \n",
1523 | " 20 | \n",
1524 | " 125 | \n",
1525 | " 165 | \n",
1526 | " 0 | \n",
1527 | " 1.1 | \n",
1528 | " Sacramento Kings | \n",
1529 | " 11 | \n",
1530 | " Stockton, David | \n",
1531 | " 9.6 | \n",
1532 | " 8.7 | \n",
1533 | " Mid Range | \n",
1534 | " L | \n",
1535 | " 0.3925 | \n",
1536 | " LAL | \n",
1537 | " SAC | \n",
1538 | " 1 | \n",
1539 | " 0 | \n",
1540 | " 0 | \n",
1541 | " 0 | \n",
1542 | "
\n",
1543 | " \n",
1544 | " | 205535 | \n",
1545 | " Vander Blue | \n",
1546 | " SG | \n",
1547 | " 22 | \n",
1548 | " 203505 | \n",
1549 | " Los Angeles Lakers | \n",
1550 | " 22 | \n",
1551 | " 2015-04-15 | \n",
1552 | " 21401230 | \n",
1553 | " 521 | \n",
1554 | " 2014 | \n",
1555 | " 4 | \n",
1556 | " 4 | \n",
1557 | " 4 | \n",
1558 | " 0 | \n",
1559 | " Jump Shot | \n",
1560 | " 2 | \n",
1561 | " 16 | \n",
1562 | " 109 | \n",
1563 | " 126 | \n",
1564 | " 10 | \n",
1565 | " 9.3 | \n",
1566 | " Sacramento Kings | \n",
1567 | " 11 | \n",
1568 | " Stockton, David | \n",
1569 | " 3.1 | \n",
1570 | " 12.7 | \n",
1571 | " Mid Range | \n",
1572 | " L | \n",
1573 | " 0.3925 | \n",
1574 | " LAL | \n",
1575 | " SAC | \n",
1576 | " 1 | \n",
1577 | " 1 | \n",
1578 | " 0 | \n",
1579 | " 0 | \n",
1580 | "
\n",
1581 | " \n",
1582 | " | 205536 | \n",
1583 | " Vander Blue | \n",
1584 | " SG | \n",
1585 | " 22 | \n",
1586 | " 203505 | \n",
1587 | " Los Angeles Lakers | \n",
1588 | " 22 | \n",
1589 | " 2015-04-15 | \n",
1590 | " 21401230 | \n",
1591 | " 565 | \n",
1592 | " 2014 | \n",
1593 | " 4 | \n",
1594 | " 1 | \n",
1595 | " 8 | \n",
1596 | " 0 | \n",
1597 | " Running Jump Shot | \n",
1598 | " 2 | \n",
1599 | " 16 | \n",
1600 | " 51 | \n",
1601 | " 154 | \n",
1602 | " 7 | \n",
1603 | " 7.9 | \n",
1604 | " Sacramento Kings | \n",
1605 | " 11 | \n",
1606 | " Stockton, David | \n",
1607 | " 1.4 | \n",
1608 | " 14.2 | \n",
1609 | " Mid Range | \n",
1610 | " C | \n",
1611 | " 0.3994 | \n",
1612 | " LAL | \n",
1613 | " SAC | \n",
1614 | " 1 | \n",
1615 | " 0 | \n",
1616 | " 0 | \n",
1617 | " 0 | \n",
1618 | "
\n",
1619 | " \n",
1620 | " | 205537 | \n",
1621 | " Jamaal Franklin | \n",
1622 | " SG | \n",
1623 | " 23 | \n",
1624 | " 203479 | \n",
1625 | " Denver Nuggets | \n",
1626 | " 19 | \n",
1627 | " 2015-04-15 | \n",
1628 | " 21401229 | \n",
1629 | " 500 | \n",
1630 | " 2014 | \n",
1631 | " 4 | \n",
1632 | " 5 | \n",
1633 | " 33 | \n",
1634 | " 1 | \n",
1635 | " Pullup Jump shot | \n",
1636 | " 3 | \n",
1637 | " 26 | \n",
1638 | " 59 | \n",
1639 | " 257 | \n",
1640 | " 1 | \n",
1641 | " 2.7 | \n",
1642 | " Golden State Warriors | \n",
1643 | " 15 | \n",
1644 | " Livingston, Shaun | \n",
1645 | " 3.5 | \n",
1646 | " 14.0 | \n",
1647 | " Above Break 3 | \n",
1648 | " C | \n",
1649 | " 0.3415 | \n",
1650 | " GSW | \n",
1651 | " DEN | \n",
1652 | " 0 | \n",
1653 | " 0 | \n",
1654 | " 0 | \n",
1655 | " 0 | \n",
1656 | "
\n",
1657 | " \n",
1658 | " | 205538 | \n",
1659 | " Jamaal Franklin | \n",
1660 | " SG | \n",
1661 | " 23 | \n",
1662 | " 203479 | \n",
1663 | " Denver Nuggets | \n",
1664 | " 19 | \n",
1665 | " 2015-04-15 | \n",
1666 | " 21401229 | \n",
1667 | " 563 | \n",
1668 | " 2014 | \n",
1669 | " 4 | \n",
1670 | " 2 | \n",
1671 | " 8 | \n",
1672 | " 0 | \n",
1673 | " Pullup Jump shot | \n",
1674 | " 3 | \n",
1675 | " 26 | \n",
1676 | " -72 | \n",
1677 | " 252 | \n",
1678 | " 1 | \n",
1679 | " 1.9 | \n",
1680 | " Golden State Warriors | \n",
1681 | " 15 | \n",
1682 | " Rush, Brandon | \n",
1683 | " 4.2 | \n",
1684 | " 11.8 | \n",
1685 | " Above Break 3 | \n",
1686 | " C | \n",
1687 | " 0.3415 | \n",
1688 | " GSW | \n",
1689 | " DEN | \n",
1690 | " 0 | \n",
1691 | " 1 | \n",
1692 | " 0 | \n",
1693 | " 0 | \n",
1694 | "
\n",
1695 | " \n",
1696 | "
\n",
1697 | "
"
1698 | ],
1699 | "text/plain": [
1700 | " name pos age player_id team_name team_id \\\n",
1701 | "205534 Vander Blue SG 22 203505 Los Angeles Lakers 22 \n",
1702 | "205535 Vander Blue SG 22 203505 Los Angeles Lakers 22 \n",
1703 | "205536 Vander Blue SG 22 203505 Los Angeles Lakers 22 \n",
1704 | "205537 Jamaal Franklin SG 23 203479 Denver Nuggets 19 \n",
1705 | "205538 Jamaal Franklin SG 23 203479 Denver Nuggets 19 \n",
1706 | "\n",
1707 | " game_date game_id game_event_id season period minutes_remaining \\\n",
1708 | "205534 2015-04-15 21401230 508 2014 4 5 \n",
1709 | "205535 2015-04-15 21401230 521 2014 4 4 \n",
1710 | "205536 2015-04-15 21401230 565 2014 4 1 \n",
1711 | "205537 2015-04-15 21401229 500 2014 4 5 \n",
1712 | "205538 2015-04-15 21401229 563 2014 4 2 \n",
1713 | "\n",
1714 | " seconds_remaining shot_made_flag action_type shot_type \\\n",
1715 | "205534 25 1 Turnaround Jump Shot 2 \n",
1716 | "205535 4 0 Jump Shot 2 \n",
1717 | "205536 8 0 Running Jump Shot 2 \n",
1718 | "205537 33 1 Pullup Jump shot 3 \n",
1719 | "205538 8 0 Pullup Jump shot 3 \n",
1720 | "\n",
1721 | " shot_distance x y dribbles touch_time opponent \\\n",
1722 | "205534 20 125 165 0 1.1 Sacramento Kings \n",
1723 | "205535 16 109 126 10 9.3 Sacramento Kings \n",
1724 | "205536 16 51 154 7 7.9 Sacramento Kings \n",
1725 | "205537 26 59 257 1 2.7 Golden State Warriors \n",
1726 | "205538 26 -72 252 1 1.9 Golden State Warriors \n",
1727 | "\n",
1728 | " opp_id defender_name defender_distance shot_clock \\\n",
1729 | "205534 11 Stockton, David 9.6 8.7 \n",
1730 | "205535 11 Stockton, David 3.1 12.7 \n",
1731 | "205536 11 Stockton, David 1.4 14.2 \n",
1732 | "205537 15 Livingston, Shaun 3.5 14.0 \n",
1733 | "205538 15 Rush, Brandon 4.2 11.8 \n",
1734 | "\n",
1735 | " shot_zone shot_area lg_avg htm vtm is_home prev_shot_made \\\n",
1736 | "205534 Mid Range L 0.3925 LAL SAC 1 0 \n",
1737 | "205535 Mid Range L 0.3925 LAL SAC 1 1 \n",
1738 | "205536 Mid Range C 0.3994 LAL SAC 1 0 \n",
1739 | "205537 Above Break 3 C 0.3415 GSW DEN 0 0 \n",
1740 | "205538 Above Break 3 C 0.3415 GSW DEN 0 1 \n",
1741 | "\n",
1742 | " prev_2_made prev_3_made \n",
1743 | "205534 0 0 \n",
1744 | "205535 0 0 \n",
1745 | "205536 0 0 \n",
1746 | "205537 0 0 \n",
1747 | "205538 0 0 "
1748 | ]
1749 | },
1750 | "execution_count": 58,
1751 | "metadata": {},
1752 | "output_type": "execute_result"
1753 | }
1754 | ],
1755 | "source": [
1756 | "sorted_df.tail()"
1757 | ]
1758 | },
1759 | {
1760 | "cell_type": "markdown",
1761 | "metadata": {},
1762 | "source": [
1763 | "## Final cleaning and export"
1764 | ]
1765 | },
1766 | {
1767 | "cell_type": "code",
1768 | "execution_count": 75,
1769 | "metadata": {},
1770 | "outputs": [],
1771 | "source": [
1772 | "#clean positions down to 5 standard positions (no combos)\n",
1773 | "sorted_df.pos[sorted_df.name=='Giannis Antetokounmpo'] = 'SF'\n",
1774 | "\n",
1775 | "sorted_df.pos[sorted_df.pos=='PG-SG']='SG'\n",
1776 | "sorted_df.pos[sorted_df.pos=='SF-SG'] = 'SF'\n",
1777 | "sorted_df.pos[sorted_df.pos=='SG-PG'] = 'PG'\n",
1778 | "sorted_df.pos[sorted_df.pos=='PF-SF'] = 'SF'\n",
1779 | "sorted_df.pos[sorted_df.pos=='SF-PF'] = 'PF'\n",
1780 | "sorted_df.pos[sorted_df.pos=='SG-SF'] = 'SF'\n",
1781 | "\n"
1782 | ]
1783 | },
1784 | {
1785 | "cell_type": "code",
1786 | "execution_count": null,
1787 | "metadata": {},
1788 | "outputs": [],
1789 | "source": []
1790 | },
1791 | {
1792 | "cell_type": "code",
1793 | "execution_count": 493,
1794 | "metadata": {},
1795 | "outputs": [],
1796 | "source": [
1797 | "# players \n",
1798 | " # name | team | \n",
1799 | "# shots \n",
1800 | " # |player_id| zone name| area| made? \n",
1801 | " # \n",
1802 | "# def player_shots() \n",
1803 | " # shots[shots[player_id] == id]\n",
1804 | " \n",
1805 | "# def shots_by_zone(shots):\n",
1806 | "# \"\"\" first zone\"\"\"\n",
1807 | "# returns {'2' = [[], , {}]}\n",
1808 | "\n",
1809 | "# shots = player_shots('bob koozie')\n",
1810 | "# shots_by_zone(shots)\n",
1811 | "\n",
1812 | "# iterate through every player \n",
1813 | "# retrieve each player's shots\n",
1814 | "# for each zone\n",
1815 | "# retrieve shots taken\n",
1816 | "# retrieve shots scored \n",
1817 | "# \n",
1818 | "\n",
1819 | "def get_fg_pct_by_player_for_each_zone(df):\n",
1820 | " start = time.time()\n",
1821 | " player_names = list(df.name.unique())\n",
1822 | " df_list = []\n",
1823 | "\n",
1824 | " for c, player in enumerate(player_names):\n",
1825 | " df_ = df[df.name==player].reset_index(drop=True)\n",
1826 | " shot_arr = np.zeros((len(df_),26))\n",
1827 | "\n",
1828 | " if (c+1)%100==0:\n",
1829 | " print('Runtime: {} seconds. {} of {} players completed.'.format(round(time.time()-start,2), c+1, len(player_names)))\n",
1830 | " for index, row in df_.iterrows():\n",
1831 | " if index != 0:\n",
1832 | " shot_arr[index,:] = shot_arr[index-1,:]\n",
1833 | " if row.shot_zone=='Mid Range':\n",
1834 | " if row.shot_area=='R':\n",
1835 | " if row.shot_made_flag==1:\n",
1836 | " shot_arr[index,0:2]+=[1,1]\n",
1837 | " else:\n",
1838 | " shot_arr[index,0:2]+=[0,1]\n",
1839 | " elif row.shot_area=='C':\n",
1840 | " if row.shot_made_flag==1:\n",
1841 | " shot_arr[index,2:4]+=[1,1]\n",
1842 | " else:\n",
1843 | " shot_arr[index,2:4]+=[0,1]\n",
1844 | " else:\n",
1845 | " if row.shot_made_flag==1:\n",
1846 | " shot_arr[index,4:6]+=[1,1]\n",
1847 | " else:\n",
1848 | " shot_arr[index,4:6]+=[0,1]\n",
1849 | " elif row.shot_zone=='Restricted Area':\n",
1850 | " if row.shot_made_flag==1:\n",
1851 | " shot_arr[index,6:8]+=[1,1]\n",
1852 | " else:\n",
1853 | " shot_arr[index,6:8]+=[0,1]\n",
1854 | " elif row.shot_zone=='Heave':\n",
1855 | " if row.shot_made_flag==1:\n",
1856 | " shot_arr[index,8:10]+=[1,1]\n",
1857 | " else:\n",
1858 | " shot_arr[index,8:10]+=[0,1]\n",
1859 | " elif row.shot_zone=='Above Break 3':\n",
1860 | " if row.shot_area=='R':\n",
1861 | " if row.shot_made_flag==1:\n",
1862 | " shot_arr[index,10:12]+=[1,1]\n",
1863 | " else:\n",
1864 | " shot_arr[index,10:12]+=[0,1]\n",
1865 | " elif row.shot_area=='C':\n",
1866 | " if row.shot_made_flag==1:\n",
1867 | " shot_arr[index,12:14]+=[1,1]\n",
1868 | " else:\n",
1869 | " shot_arr[index,12:14]+=[0,1]\n",
1870 | " else:\n",
1871 | " if row.shot_made_flag==1:\n",
1872 | " shot_arr[index,14:16]+=[1,1]\n",
1873 | " else:\n",
1874 | " shot_arr[index,14:16]+=[0,1]\n",
1875 | " elif row.shot_zone=='Paint':\n",
1876 | " if row.shot_area=='R':\n",
1877 | " if row.shot_made_flag==1:\n",
1878 | " shot_arr[index,16:18]+=[1,1]\n",
1879 | " else:\n",
1880 | " shot_arr[index,16:18]+=[0,1]\n",
1881 | " elif row.shot_area=='C':\n",
1882 | " if row.shot_made_flag==1:\n",
1883 | " shot_arr[index,18:20]+=[1,1]\n",
1884 | " else:\n",
1885 | " shot_arr[index,18:20]+=[0,1]\n",
1886 | " else:\n",
1887 | " if row.shot_made_flag==1:\n",
1888 | " shot_arr[index,20:22]+=[1,1]\n",
1889 | " else:\n",
1890 | " shot_arr[index,20:22]+=[0,1]\n",
1891 | " elif row.shot_zone=='Corner 3':\n",
1892 | " if row.shot_area=='R':\n",
1893 | " if row.shot_made_flag==1:\n",
1894 | " shot_arr[index,22:24]+=[1,1]\n",
1895 | " else:\n",
1896 | " shot_arr[index,22:24]+=[0,1]\n",
1897 | " else:\n",
1898 | " if row.shot_made_flag==1:\n",
1899 | " shot_arr[index,24:26]+=[1,1]\n",
1900 | " else:\n",
1901 | " shot_arr[index,24:26]+=[0,1]\n",
1902 | "\n",
1903 | " df_list.append(pd.DataFrame(shot_arr,index=df_.name))\n",
1904 | "\n",
1905 | " print('Total Runtime: {} seconds.'.format(round(time.time()-start,2),\n",
1906 | " c, len(player_names)))\n",
1907 | " return df_list"
1908 | ]
1909 | },
1910 | {
1911 | "cell_type": "code",
1912 | "execution_count": 574,
1913 | "metadata": {},
1914 | "outputs": [],
1915 | "source": [
1916 | "def add_zone_fg_pct_to_df(df):\n",
1917 | " df_list = get_fg_pct_by_player_for_each_zone(df)\n",
1918 | " zone_df = pd.concat([df_ for df_ in df_list])\n",
1919 | " \n",
1920 | " column_names = ['mid_R_pct', 'mid_C_pct', 'mid_L_pct', 'restricted_pct', 'heave_pct', 'ab_3_R_pct', 'ab_3_C_pct',\n",
1921 | " 'ab_3_L_pct', 'paint_R_pct', 'paint_C_pct', 'paint_L_pct', 'corner_3_R_pct', 'corner_3_L_pct',] \n",
1922 | "\n",
1923 | " counter = 0\n",
1924 | " for col in column_names:\n",
1925 | " zone_df[col] = np.round(zone_df[counter]/zone_df[counter+1],4)\n",
1926 | " counter+=2\n",
1927 | " \n",
1928 | " zone_df = zone_df.drop(columns=list(range(0,26))).reset_index().rename(columns={\n",
1929 | " 'name':'player_name'})\n",
1930 | " zone_fg_df = pd.concat((sorted_df,zone_df),axis=1)\n",
1931 | " \n",
1932 | " return zone_fg_df.drop(columns=['player_name'])\n"
1933 | ]
1934 | },
1935 | {
1936 | "cell_type": "code",
1937 | "execution_count": 575,
1938 | "metadata": {
1939 | "scrolled": true
1940 | },
1941 | "outputs": [
1942 | {
1943 | "name": "stdout",
1944 | "output_type": "stream",
1945 | "text": [
1946 | "Runtime: 13.34 seconds. 100 of 490 players completed.\n",
1947 | "Runtime: 27.03 seconds. 200 of 490 players completed.\n",
1948 | "Runtime: 39.58 seconds. 300 of 490 players completed.\n",
1949 | "Runtime: 47.93 seconds. 400 of 490 players completed.\n",
1950 | "Total Runtime: 51.96 seconds.\n"
1951 | ]
1952 | }
1953 | ],
1954 | "source": [
1955 | "zone_fg_df = add_zone_fg_pct_to_df(sorted_df)"
1956 | ]
1957 | },
1958 | {
1959 | "cell_type": "code",
1960 | "execution_count": 581,
1961 | "metadata": {},
1962 | "outputs": [
1963 | {
1964 | "data": {
1965 | "text/plain": [
1966 | "name 0\n",
1967 | "pos 0\n",
1968 | "age 0\n",
1969 | "player_id 0\n",
1970 | "team_name 0\n",
1971 | "team_id 0\n",
1972 | "game_date 0\n",
1973 | "game_id 0\n",
1974 | "game_event_id 0\n",
1975 | "season 0\n",
1976 | "period 0\n",
1977 | "minutes_remaining 0\n",
1978 | "seconds_remaining 0\n",
1979 | "shot_made_flag 0\n",
1980 | "action_type 0\n",
1981 | "shot_type 0\n",
1982 | "shot_distance 0\n",
1983 | "x 0\n",
1984 | "y 0\n",
1985 | "dribbles 0\n",
1986 | "touch_time 0\n",
1987 | "opponent 0\n",
1988 | "opp_id 0\n",
1989 | "defender_name 0\n",
1990 | "defender_distance 0\n",
1991 | "shot_clock 0\n",
1992 | "shot_zone 0\n",
1993 | "shot_area 0\n",
1994 | "lg_avg 0\n",
1995 | "htm 0\n",
1996 | "vtm 0\n",
1997 | "is_home 0\n",
1998 | "prev_shot_made 0\n",
1999 | "prev_2_made 0\n",
2000 | "prev_3_made 0\n",
2001 | "mid_R_pct 0\n",
2002 | "mid_C_pct 0\n",
2003 | "mid_L_pct 0\n",
2004 | "restricted_pct 0\n",
2005 | "heave_pct 0\n",
2006 | "ab_3_R_pct 0\n",
2007 | "ab_3_C_pct 0\n",
2008 | "ab_3_L_pct 0\n",
2009 | "paint_R_pct 0\n",
2010 | "paint_C_pct 0\n",
2011 | "paint_L_pct 0\n",
2012 | "corner_3_R_pct 0\n",
2013 | "corner_3_L_pct 0\n",
2014 | "dtype: int64"
2015 | ]
2016 | },
2017 | "execution_count": 581,
2018 | "metadata": {},
2019 | "output_type": "execute_result"
2020 | }
2021 | ],
2022 | "source": [
2023 | "#fill null values with 0\n",
2024 | "zone_fg_df = zone_fg_df.fillna(value=0)"
2025 | ]
2026 | },
2027 | {
2028 | "cell_type": "code",
2029 | "execution_count": 582,
2030 | "metadata": {},
2031 | "outputs": [],
2032 | "source": [
2033 | "#export as csv\n",
2034 | "zone_fg_df.to_csv('./data/sorted_df_14_15.csv')"
2035 | ]
2036 | },
2037 | {
2038 | "cell_type": "code",
2039 | "execution_count": null,
2040 | "metadata": {},
2041 | "outputs": [],
2042 | "source": []
2043 | }
2044 | ],
2045 | "metadata": {
2046 | "extensions": {
2047 | "jupyter_dashboards": {
2048 | "activeView": "grid_default",
2049 | "version": 1,
2050 | "views": {
2051 | "grid_default": {
2052 | "cellMargin": 10,
2053 | "defaultCellHeight": 20,
2054 | "maxColumns": 12,
2055 | "name": "grid",
2056 | "type": "grid"
2057 | },
2058 | "report_default": {
2059 | "name": "report",
2060 | "type": "report"
2061 | }
2062 | }
2063 | }
2064 | },
2065 | "kernelspec": {
2066 | "display_name": "Python 3",
2067 | "language": "python",
2068 | "name": "python3"
2069 | },
2070 | "language_info": {
2071 | "codemirror_mode": {
2072 | "name": "ipython",
2073 | "version": 3
2074 | },
2075 | "file_extension": ".py",
2076 | "mimetype": "text/x-python",
2077 | "name": "python",
2078 | "nbconvert_exporter": "python",
2079 | "pygments_lexer": "ipython3",
2080 | "version": "3.6.5"
2081 | }
2082 | },
2083 | "nbformat": 4,
2084 | "nbformat_minor": 2
2085 | }
2086 |
--------------------------------------------------------------------------------