├── images
    ├── gb cm.png
    ├── nn cm.png
    ├── rockets.png
    ├── cm logreg.png
    ├── fg by zone.png
    ├── gb feats.png
    ├── shot_dist.png
    ├── shot_zones.png
    ├── harden recs.png
    ├── team heatmap.png
    ├── all_roc_curves.png
    ├── harden heatmap.png
    ├── model results.png
    └── sc_shot_chart.png
├── nba_shots_scraper.py
├── neural_net.py
├── README.md
├── plotly_viz.py
├── shot_chart_viz.py
├── shallow_ML_models.py
├── new_ETL.py
├── presentation.py
└── Data-Exploration.ipynb


/images/gb cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/gb cm.png


--------------------------------------------------------------------------------
/images/nn cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/nn cm.png


--------------------------------------------------------------------------------
/images/rockets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/rockets.png


--------------------------------------------------------------------------------
/images/cm logreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/cm logreg.png


--------------------------------------------------------------------------------
/images/fg by zone.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/fg by zone.png


--------------------------------------------------------------------------------
/images/gb feats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/gb feats.png


--------------------------------------------------------------------------------
/images/shot_dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/shot_dist.png


--------------------------------------------------------------------------------
/images/shot_zones.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/shot_zones.png


--------------------------------------------------------------------------------
/images/harden recs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/harden recs.png


--------------------------------------------------------------------------------
/images/team heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/team heatmap.png


--------------------------------------------------------------------------------
/images/all_roc_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/all_roc_curves.png


--------------------------------------------------------------------------------
/images/harden heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/harden heatmap.png


--------------------------------------------------------------------------------
/images/model results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/model results.png


--------------------------------------------------------------------------------
/images/sc_shot_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/slieb74/NBA-Shot-Analysis/HEAD/images/sc_shot_chart.png


--------------------------------------------------------------------------------
/nba_shots_scraper.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import pandas as pd
 3 | import numpy as np
 4 | from data.all_players_list import players_list
 5 | import time
 6 | from court import court_shapes
 7 | 
# Keep the first five fields of every player whose rookie year (index 3) is
# after 1990 and whose last season played (index 4) is after 2014.
vets = [player[0:5] for player in players_list if (player[3] >1990) & (player[4] >2014)]

# Label the five fields, then keep only the ID and Name columns.
vets_df = pd.DataFrame(vets, columns=['ID', 'Name', 'Active', 'RookieYear', 'LastSeasonPlayed'])
vets_df = vets_df.drop(columns=['Active', 'RookieYear', 'LastSeasonPlayed'])

# One ID per retained player; this list drives the scraping loop below.
player_ids = [player[0] for player in vets]

#MULTIPLE YEARS
# The shotchartdetail URL is split into four fragments so the season and the
# player ID can be spliced in between them: the season goes after sc_url_1
# (CFPARAMS=) and after sc_url_3 (Season=), the player ID after sc_url_2
# (PlayerID=).
sc_url_1 = 'https://stats.nba.com/stats/shotchartdetail?AheadBehind=&CFID=33&CFPARAMS='
sc_url_2 = '&ClutchTime=&Conference=&ContextFilter=&ContextMeasure=FGA&DateFrom=&DateTo=&Division=&EndPeriod=10&EndRange=28800&GROUP_ID=&GameEventID=&GameID=&GameSegment=&GroupID=&GroupMode=&GroupQuantity=5&LastNGames=0&LeagueID=00&Location=&Month=0&OnOff=&OpponentTeamID=0&Outcome=&PORound=0&Period=0&PlayerID='
sc_url_3 = '&PlayerID1=&PlayerID2=&PlayerID3=&PlayerID4=&PlayerID5=&PlayerPosition=&PointDiff=&Position=&RangeType=0&RookieYear=&Season='
sc_url_4 = '&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StartPeriod=1&StartRange=0&StarterBench=&TeamID=0&VsConference=&VsDivision=&VsPlayerID1=&VsPlayerID2=&VsPlayerID3=&VsPlayerID4=&VsPlayerID5=&VsTeamID='

# Start from the default requests headers and override the user agent with a
# desktop browser string.
# NOTE(review): presumably needed because the endpoint rejects the default
# python-requests user agent — confirm.
headers = requests.utils.default_headers()
headers.update({
    "user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    })
25 | 
def get_all_players_shot_data(player_ids, year, timeout=30):
    """Download the raw shot-chart JSON for each player in a single season.

    Args:
        player_ids: Iterable of player IDs; one request is issued per ID.
        year: Season in yyyy-yy format (i.e. '2017-18').
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a single stalled connection would hang the
            whole scrape indefinitely.

    Returns:
        A list with one decoded JSON payload per player, in the order of
        ``player_ids``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status code.
        requests.exceptions.Timeout: If a request exceeds ``timeout``.
    """
    all_shots = []
    start = time.time()
    for completed, player_id in enumerate(player_ids, start=1):
        full_url = (sc_url_1 + str(year) + sc_url_2 + str(player_id)
                    + sc_url_3 + str(year) + sc_url_4)
        response = requests.get(full_url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to decode an error page.
        response.raise_for_status()
        all_shots.append(response.json())
        # Pause between requests to avoid hammering the endpoint.
        time.sleep(1)
        if completed % 50 == 0:
            print('Runtime: {} seconds. {} players completed'.format(time.time()-start, completed))
    return all_shots
40 | 
def convert_dict_to_df(all_shot_data):
    """Flatten a list of shot-chart payloads into two DataFrames.

    Each payload is expected to expose ``payload['resultSets'][k]`` with
    ``'headers'`` and ``'rowSet'`` entries. Result set 0 holds the shots and
    result set 1 holds the league averages.

    Args:
        all_shot_data: Non-empty list of payloads, one per player.

    Returns:
        ``(df, league_avgs_df)``: the shots of all players stacked under a
        fresh 0..n-1 index, and the league averages taken from the first
        payload only.
    """
    started_at = time.time()

    # League averages are read from the first payload only.
    league_result_set = all_shot_data[0]['resultSets'][1]
    league_rows = league_result_set['rowSet']
    league_avgs_df = pd.DataFrame.from_records(
        league_rows, columns=league_result_set['headers'])

    # The shot column names are also taken from the first payload and reused
    # for every player.
    shot_columns = all_shot_data[0]['resultSets'][0]['headers']

    per_player_frames = [
        pd.DataFrame.from_records(
            payload['resultSets'][0]['rowSet'], columns=shot_columns)
        for payload in all_shot_data
    ]

    df = pd.concat(per_player_frames, ignore_index=True)
    print('Total Runtime: {} seconds.'.format(time.time()-started_at))

    return df, league_avgs_df
61 | 
# Script driver: scrape the 2014-15 season for every retained player, convert
# the raw payloads to DataFrames, and write the shots to disk.
# NOTE: this runs at import time and issues one HTTP request per player.
all_shots_1415 = get_all_players_shot_data(player_ids, '2014-15')
shots_1415_df, lg_avgs_1415 = convert_dict_to_df(all_shots_1415)

# The league averages frame (lg_avgs_1415) is kept in memory but not saved here.
shots_1415_df.to_csv('data/shots_1415.csv')
66 | 


--------------------------------------------------------------------------------
/neural_net.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import os, itertools
  5 | 
  6 | from sklearn.preprocessing import MinMaxScaler
  7 | from sklearn.model_selection import train_test_split
  8 | from sklearn.metrics import confusion_matrix, classification_report
  9 | 
 10 | import keras
 11 | from keras.layers import Dense, Dropout, LSTM
 12 | from keras.models import Sequential, load_model
 13 | from keras.callbacks import EarlyStopping, TensorBoard
 14 | 
#####LOAD DATA#####
# Disabled preprocessing step: flip to True to rebuild the cached arrays from
# the CSV. It drops identifier/context columns (and the target) from the
# features, scales what remains to [0, 1], and saves X and y as .npy files.
if False:
    df = pd.read_csv('data/final_df.csv', index_col=0)

    X = df.drop(columns=['name', 'pos', 'age', 'player_id', 'team_name', 'team_id', 'game_date', 'game_id', 'game_event_id', 'season', 'minutes_remaining', 'seconds_remaining', 'action_type', 'shot_type', 'opponent','opp_id',
    'defender_name', 'htm', 'vtm', 'defender_id', 'prev_shot_made', 'prev_2_made', 'prev_3_made',  'Heave', 'dribbles','shot_distance', 'shot_made_flag'])
    # Target: the shot_made_flag column.
    y = np.array(df.shot_made_flag)

    # NOTE: the scaler is fit on the full data set, before the train/test split.
    minmax_scale = MinMaxScaler()
    X = minmax_scale.fit_transform(X)

    np.save('X_y_arrays/X_', X)
    np.save('X_y_arrays/y_', y)
#####SPLIT DATA INTO TRAIN/TEST SETS#####
# Active step: load the cached arrays and hold out 20% as the test set with a
# fixed seed so the split is reproducible.
if True:
    X = np.load('X_y_arrays/X_.npy')
    y = np.load('X_y_arrays/y_.npy')

    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=23,test_size=.2)
 34 | 
 35 | #####HELPER FUNCTION TO PLOT CM#####
 36 | def plot_confusion_matrix(cm, name, cmap=plt.cm.Blues):
 37 |     #Create the basic matrix.
 38 |     fig = plt.figure(figsize=(6, 6))
 39 |     plt.imshow(cm, cmap)
 40 | 
 41 |     #Add title and Axis Labels
 42 |     plt.title(name + ' - ' 'Confusion Matrix')
 43 |     plt.xlabel('Predicted')
 44 |     plt.ylabel('Actual')
 45 | 
 46 |     #Add appropriate Axis Scales
 47 |     tick_marks = np.arange(0,2)
 48 |     plt.xticks(tick_marks, ['Miss', 'Make'])
 49 |     plt.yticks(tick_marks, ['Miss', 'Make'])
 50 | 
 51 |     #Add Labels to Each Cell
 52 |     thresh = 0.75 * cm.max()
 53 | 
 54 |     #Add a Side Bar Legend Showing Colors
 55 |     plt.colorbar()
 56 |     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
 57 |         plt.text(j, i, cm[i, j],
 58 |                  horizontalalignment="center",
 59 |                  color="black" if cm[i, j] <= thresh else "white")
 60 | 
 61 |     plt.tight_layout()
 62 |     fig.savefig('./models/nn/cm/' + name + '.png', bbox_inches='tight', dpi=480)
 63 |     plt.show()
 64 | 
 65 | def plot_val_loss_acc(model, name):
 66 |     model_val_dict = model.history.history
 67 |     loss_values = model_val_dict['loss']
 68 |     val_loss_values = model_val_dict['val_loss']
 69 |     acc_values = model_val_dict['acc']
 70 |     val_acc_values = model_val_dict['val_acc']
 71 | 
 72 |     epochs_ = range(1, len(loss_values) + 1)
 73 |     plt.plot(epochs_, loss_values, 'g', label='Training loss')
 74 |     plt.plot(epochs_, val_loss_values, 'g.', label='Validation loss')
 75 |     plt.plot(epochs_, acc_values, 'r', label='Training acc')
 76 |     plt.plot(epochs_, val_acc_values, 'r.', label='Validation acc')
 77 | 
 78 |     plt.title(name + ' - Training & validation loss / accuracy')
 79 |     plt.xlabel('Epochs')
 80 |     plt.ylabel('Loss')
 81 |     plt.legend()
 82 |     plt.savefig('models/nn/val_loss_acc/' + name + '.png', bbox_inches='tight')
 83 |     plt.show()
 84 | 
 85 | #####NEURAL NETWORK GENERATOR#####
 86 | def build_nn__(X_train, X_test, y_train, y_test, activation, epochs, batch_size, name, nodes, dropout):
 87 | 
 88 |     adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
 89 | 
 90 |     nn_ = Sequential()
 91 | 
 92 |     #First layer
 93 |     nn_.add(Dense(X_train.shape[1], input_shape=(X_train.shape[1],), activation=activation))
 94 |     #Iterate through number of nodes and add hidden layers
 95 |     for i, node in enumerate(nodes):
 96 |         nn_.add(Dense(node, activation=activation))
 97 |         if dropout[i]==True:
 98 |             nn_.add(Dropout(0.2))
 99 |     #Output layer, use 'sigmoid' activation for binary classfication
100 |     nn_.add(Dense(1, activation='sigmoid'))
101 | 
102 |     #Show NN summary
103 |     nn_.summary()
104 |     #Compile model
105 |     nn_.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])
106 | 
107 |     #Add early stopping and tensorboard callbacks
108 |     early_stopping = EarlyStopping(monitor='val_loss', min_delta = 0.001, patience = 15, verbose=1, mode='auto', baseline=None)
109 |     tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=batch_size, write_graph=True, write_grads=False, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, embeddings_data=None)
110 | 
111 |     #Fit model
112 |     nn_.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, verbose = 1, validation_split=0.1, callbacks = [early_stopping, tensorboard])
113 | 
114 |     plot_val_loss_acc(nn_, activation + '_' + name)
115 | 
116 |     nn_.save('./models/nn/' + name + '_' + activation +'_' + str(epochs) + '_' + str(batch_size) + '_' + str(len(nodes)) + '_' + '_'.join([str(i) for i in nodes]) + '.h5')
117 | 
118 |     print(nn_.evaluate(X_test, y_test))
119 | 
120 |     cm = confusion_matrix(nn_.predict_classes(X_test), y_test)
121 |     print(cm)
122 |     plot_confusion_matrix(cm, activation + '_' + name)
123 | 
124 |     print('Test Set Classification Report')
125 |     print(classification_report(nn_.predict_classes(X_test), y_test, target_names=['Miss','Make']))
126 |     return nn_
127 | 
# Script driver: train the configured run at import time — eight hidden layers
# (128,128,64,64,32,32,16,8), relu activation, no dropout, up to 50 epochs.
nn = build_nn__(X_train, X_test, y_train, y_test, activation='relu', epochs=50, batch_size=32, name='16th_run_101', nodes=[128,128,64,64,32,32,16,8], dropout=[False, False, False, False, False, False, False, False])
129 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # NBA Shot Analysis
  2 | 
  3 | ## Goal
  4 | Build a classification model to predict whether an NBA shot will go in or not, and create visualizations to help general managers, coaches, and players identify shooting patterns, eliminate bad shots, and optimize their strategy to increase shooting efficiency.
  5 | 
  6 | ## ETL
  7 | I gathered my data from three sources:
  8 |  - Shot location data scraped from stats.nba.com (see my <a href="https://towardsdatascience.com/using-python-pandas-and-plotly-to-generate-nba-shot-charts-e28f873a99cb">blog post</a> for more detail)
  9 |  - Player tracking data from <a href="http://nbasavant.com/index.php">nbasavant.com</a> 
 10 |  - Defensive stats from <a href="https://www.basketball-reference.com/">basketball-reference</a> 
 11 |  
 12 | Since the NBA stopped providing tracking data, such as the number of dribbles and defender distance, in the middle of the 2016 season, I focused my project on the 2014-15 season. I gathered data on over 200,000 shots, with features including, but not limited to:
 13 |  - Shot distance, (x,y) coordinates, and shot zone
 14 |  - Touch time and number of dribbles
 15 |  - Name and distance of the closest defender
 16 |  - Game context stats such as shot clock remaining, period, game clock
 17 |  - Shot type (jump shot, dunk, etc.)
 18 | 
 19 | I wanted to add more context to each shot, so I added advanced defensive stats for each defender (Block %, Defensive Win Shares/48, Defensive Box Score Plus Minus) and team (Defensive Rating). 
 20 | 
 21 | The data I gathered had two different zone breakdowns, one which detailed the directional area of the court (left, right or center) and the other which detailed a more precise location (paint, corner 3, etc.). I combined these into 15 zones, as seen below, and for every player I calculated their Field Goal % (FG%) in each zone so that my model would have a better understanding of the quality of the shot. 
 22 | 
 23 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/shot_zones.png">
 24 | 
 25 | I have never been a fan of the argument that momentum impacts basketball games, and have often argued against the concept of a "hot hand" which posits that a player is more likely to hit a shot if they have hit consecutive prior shots. In an attempt to disprove this hypothesis, I engineered new features that detailed whether the shooter has scored their previous 1, 2, and 3 shots. My models found that hitting prior shots did not have a significant impact on whether a player will score their next shot.
 26 | 
 27 | ## Visualizations
 28 | I wanted to create a wide range of visualizations that would show the frequency and efficiency of players' and teams' shots.
 29 | 
 30 | #### Binned Shot Chart
 31 | The first visualization I made is a binned shot chart that breaks the court down into equally sized hexes and groups nearby shots into bubbles, with the size determined by frequency and color by FG%. The color scale differed for two's and three's to account for the point value of each shot. I also added the player's image and some additional stats to the chart. In my dashboard, there is a dropdown where you can select any player, and there is also an option to change the bubble size depending on if you want to see a more precise or broad shot chart. 
 32 | 
 33 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/sc_shot_chart.png">
 34 | 
 35 | I made similar charts for each team, where you can get a strong sense of their shooting efficiency and frequency distribution.
 36 | 
 37 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/rockets.png">
 38 | 
 39 | #### Frequency Shot Heatmap
 40 | In order to get a better sense of where players and teams are shooting from, disregarding efficiency, I designed a heatmap to show the locations where they most frequently shoot from, complete with a dropdown that allows you to select any player or team.
 41 | 
 42 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/harden%20heatmap.png">
 43 | 
 44 | #### FG Frequency Bar Plot
 45 | To visualize how the league distributes its shots, I added an interactive bar plot to my dashboard that shows FG% and the number of shots for a given feature that can be selected from a dropdown.
 46 | 
 47 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/shot_dist.png">
 48 | 
 49 | #### FG Percentage Bar Plot
 50 | To visualize FG% without focusing on frequency, I built an interactive bar plot that shows leaguewide FG% and the number of shots for a range of features that can be selected from a dropdown.
 51 | 
 52 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/fg%20by%20zone.png">
 53 | 
 54 | #### Team Points Per Shot Heatmap Matrix
 55 | I wanted to compare how teams perform in different contexts, so I created a heatmap matrix that helps visualize which teams under- and overperform in certain aspects. The color of each box is determined by the team's points per shot (PPS) given the selected feature/context. This gives teams a better sense of where they need to improve and how they stack up against the rest of the league.
 56 | 
 57 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/team%20heatmap.png">
 58 | 
 59 | ## Machine Learning Models
 60 | I trained 6 different machine learning classification models to predict whether a given shot would go in. The models I used were the following:
 61 |  - Logistic Regression
 62 |  - Random Forest
 63 |  - Gradient Boosting
 64 |  - AdaBoost
 65 |  - XGBoost
 66 |  - Neural Network
 67 |  
 68 | For each model, I went through a cross-validation process to help narrow down my feature set into only the most important ones that did not show signs of multicollinearity with other included features. I ultimately narrowed down my initial set of over 20 features to the following 6:
 69 |  - Shot Distance
 70 |  - Zone FG%
 71 |  - Defensive Win Shares per 48 Minutes
 72 |  - Defender Distance
 73 |  - Touch Time
 74 |  - Shot Clock Remaining
 75 | 
 76 | ###### Feature Importances (Gradient Boosting Classifier)
 77 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/gb%20feats.png">
 78 | 
 79 | Due to the inconsistency in scale of my numeric features (FG% is a decimal but shot distance is measured in feet), I used Scikit-Learn's MinMaxScaler to normalize and vectorize my data. My cross-validation process included hyperparameter tuning for each of my models by running a grid search with Stratified Kfold splits to ensure that the class balance remained consistent across all splits. 
 80 | For the Neural Network, I used one hidden layer that contained 50 nodes, 'relu' activation due to the lack of negative values, and the 'adam' optimizer to obtain my best results.
 81 | 
 82 | ###### ROC curves
 83 | <p align="center">
 84 |   <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/all_roc_curves.png" height="500" width="600">
 85 | </p>
 86 | 
 87 | ###### Confusion Matrix Comparisons (left: Logistic Regression, center: Gradient Boosting, right: Neural Network)
 88 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/cm%20logreg.png" height="250" width="270"/> <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/gb%20cm.png" height="250" width="270"/> <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/nn%20cm.png" height="250" width="270"/>
 89 | 
 90 | My best performing model depends on how a team values the bias/variance tradeoff and whether they would prefer to minimize false negatives (predicting a miss when it's actually a make) or false positives (predicting a make when it's in fact a miss). A more aggressive team would prefer the Neural Network, which only recommended not to shoot when it was extremely confident the shot would miss, but often recommended the player should shoot, albeit with less than a 40% accuracy. An aggressive team would be fine with this model because it limited false negatives and gave the team more chances to score.
 91 | 
 92 | On the other hand, a more conservative team might prefer the Gradient Boosting model, which correctly classified makes with a much higher accuracy, yet only recommended a shot ~30% of the time. It would likely lead to a higher FG%, but limits the potential scoring opportunities by recommending a team take fewer shots. The Logistic Regression model is far more balanced, sacrificing a lower overall accuracy for better precision and recall.
 93 | 
 94 | ###### Model Results
 95 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/model%20results.png" height="300" width="750">
 96 | 
 97 | In addition to my individual models, I built a stacked ensemble model that trained the XGBoost, Random Forest, and AdaBoost classifiers, and then trained a Gradient Boosting model on their outputs. This would, in theory, give less biased predictions by weighing multiple models; however, its results were unfortunately worse than my single layer models, so I discarded it.
 98 | 
 99 | ## Shot Recommender
100 | For each player, I built a recommender system that outputs certain zones where the player should shoot more or less frequently from. The concept is based on the player's PPS relative to the league average in each zone. A player who has a high expected PPS relative to the league average in a zone would be recommended to shoot there more frequently. Conversely, a player who shoots poorly in a zone would be recommended to shoot less. In the future, I want to tune this recommender by accounting for the player's frequency of shots in each zone, so that it does not recommend a player shoot more in a zone that already contains a high percentage of their total shots.
101 | ###### Recommender Output
102 | <img src="https://github.com/slieb74/NBA-Shot-Analysis/blob/master/images/harden%20recs.png">
103 | 
104 | ## Next Steps 
105 | - Adjust the color scale of binned plots to display efficiency relative to the league average, either in terms of FG% or PPS
106 | - Tune the shot recommender to provide ideal shot distributions
107 | - Classify 2s and 3s differently in my models to see if certain models predict one shot type with higher accuracy than others
108 | - Cluster similarly skilled shooters and recommend an optimal shooting lineup that covers each shot zone
109 | - Host the project online using Dash and Flask instead of the Jupyter Notebook dashboard
110 | 
111 | ## Credits
112 | * <a href="https://grantland.com/contributors/kirk-goldsberry/">Kirk Goldsberry</a> for inspiring me to work on this project
113 | * <a href="http://savvastjortjoglou.com/nba-shot-sharts.html">Savvas Tjortjoglou</a> for his court dimensions 
114 | 


--------------------------------------------------------------------------------
/plotly_viz.py:
--------------------------------------------------------------------------------
  1 | ############################### IMPORTS ###############################
  2 | if True:
  3 |     import plotly
  4 |     import plotly.plotly as py
  5 |     import plotly.graph_objs as go
  6 |     plotly.offline.init_notebook_mode(connected=True)
  7 | 
  8 |     import matplotlib
  9 |     import matplotlib.pyplot as plt
 10 |     import seaborn as sns
 11 |     import numpy as np
 12 |     import pandas as pd
 13 |     pd.set_option('display.max_columns',100)
 14 | 
 15 |     from court import court_shapes
 16 | 
 17 |     import warnings
 18 |     warnings.filterwarnings('ignore')
 19 | 
 20 |     import itertools, math, time, re, pickle
 21 | 
 22 | ############################## LOAD DATA ##############################
# Module-level data used by the plotting functions below; the first CSV column
# becomes the index in each frame. `df` holds the cleaned 2014-15 shots.
df = pd.read_csv('data/clean_df_1415.csv',index_col=0)
zone_ids = pd.read_csv('data/zone_ids.csv',index_col=0)
zone_fg_pct = pd.read_csv('data/zone_fg_pct.csv',index_col=0)
 26 | 
 27 | ############################## CLEANING DATA ############################
 28 | def basic_cleaning(df):
 29 |     df.period[df.period>5]=5
 30 |     df.touch_time[df.touch_time<0]=0
 31 |     df.touch_time[df.touch_time>24]=24
 32 |     #df.touch_time=round(df.touch_time*4)/4
 33 |     df.defender_distance[df.defender_distance>10]=10
 34 |     #df.shot_clock[df.shot_clock>3] = round(df.shot_clock[df.shot_clock>3]*4)/4
 35 |     df.shot_distance[df.shot_distance>40]=40
 36 |     df.blk_pct[df.blk_pct>10]=10
 37 |     df.dbpm[df.dbpm>5.5]=5.5
 38 |     df['pps'] = df.shot_type*df.shot_made_flag
 39 | #basic_cleaning(df)
 40 | 
 41 | ######################################################################
 42 | ######################################################################
 43 | ###########################--SHOT CHARTS--############################
 44 | ######################################################################
 45 | ######################################################################
 46 | 
 47 | ######################--DRAW PLAYER SHOT CHART--######################
 48 | def draw_shot_chart(name):
 49 |     player = df[df.name==name]
 50 | 
 51 |     missed_shot_trace = go.Scattergl(
 52 |         x = player[player.shot_made_flag == 0]['x'],
 53 |         y = player[player.shot_made_flag == 0]['y'],
 54 |         mode = 'markers',
 55 |         name = 'Make',
 56 |         marker= dict(color='blue', symbol='x', size=8, line={'width':1}, opacity=0.7),
 57 |         text = [str(sd) for sd in player[player.shot_made_flag == 0]['action_type']],
 58 |         hoverinfo = 'text'
 59 |     )
 60 |     made_shot_trace = go.Scattergl(
 61 |         x = player[player.shot_made_flag == 1]['x'],
 62 |         y = player[player.shot_made_flag == 1]['y'],
 63 |         mode = 'markers',
 64 |         name='Make',
 65 |         marker= dict(color='red', symbol='circle', size=8, line={'width':1}, opacity=0.7),
 66 |         text = [str(sd) for sd in player[player.shot_made_flag == 1]['action_type']],
 67 |         hoverinfo = 'text'
 68 |     )
 69 | 
 70 |     data = [missed_shot_trace, made_shot_trace]
 71 |     layout = go.Layout(
 72 |         title= name + ' Shot Chart 2014-2015',
 73 |         showlegend =True,
 74 |         xaxis={'showgrid':False, 'range':[-250,250]},
 75 |         yaxis={'showgrid':False, 'range':[-47.5,500]},
 76 |         height = 600,
 77 |         width = 650,
 78 |         shapes=court_shapes)
 79 | 
 80 |     fig = go.Figure(data=data, layout=layout)
 81 |     plotly.offline.iplot(fig, filename = name + ' Shot Chart')
 82 | 
 83 | ########################--GROUPED SHOT CHART--########################
 84 | def grouped_plot(feature):
 85 |     groups = df.groupby(feature)
 86 |     colors = np.linspace(0,1,len(groups))
 87 | 
 88 |     color_list = ['aliceblue', 'aqua', 'steelblue','violet', 'blue',
 89 |               'blueviolet', 'brown', 'cadetblue',
 90 |               'chartreuse', 'darkgreen', 'darkmagenta', 'tomato',
 91 |              'gold', 'red', 'slategray']
 92 |     counter=0
 93 |     data = []
 94 |     for g, c in zip(groups, colors):
 95 |         data.append(go.Scattergl(
 96 |             x = g[1].x,
 97 |             y = g[1].y,
 98 |             mode = 'markers',
 99 |             name = g[0],
100 |             marker= dict(symbol='circle', size=7,
101 |                          line={'width':1}, opacity=0.7, color=color_list[counter]),
102 |             text = g[0],
103 |             hoverinfo = 'text')
104 |         )
105 |         counter+=1
106 | 
107 |     layout = go.Layout(
108 |         title='Shot Distribution by ' + feature.title(),
109 |         showlegend =True,
110 |         xaxis={'showgrid':False, 'range':[-250,250]},
111 |         yaxis={'showgrid':False, 'range':[-47.5,500]},
112 |         height = 600,
113 |         width = 750,
114 |         shapes=court_shapes)
115 | 
116 |     fig = go.Figure(data=data, layout=layout)
117 |     plotly.offline.iplot(fig, filename = 'Shot Zone Breakdown')
118 | 
119 | ########################--FREQUENCY BAR PLOT--########################
def freq_bar_plots(df, feature, round_=False):
    """Draw a stacked bar chart of makes and misses for each value of a feature.

    Each bar segment is annotated with its share of the shots at that value,
    and a thin black line traces the number of makes. The input frame is
    copied, so the caller's data is left untouched.

    Args:
        df: Shot DataFrame containing ``feature`` and ``shot_made_flag``.
        feature: Column whose values form the x axis.
        round_: When equal to True, round the feature to whole numbers first.
    """
    def as_percent_label(fraction):
        # Fraction in [0, 1] -> text such as '45.2%'.
        return str(round(fraction*100,1)) + '%'

    def stacked_bar(counts, label, annotations, font_size, fill):
        # One segment of the stacked bar, with its percentage text inside.
        return go.Bar(
            x=tab.index,
            y=counts,
            name=label,
            text=annotations,
            textposition='inside',
            textfont=dict(family='sans serif', size=font_size, color='white'),
            marker=dict(color=fill),
            opacity=0.75
        )

    working = df.copy()
    if round_ == True:
        working[feature] = working[feature].round()

    # Makes/misses per feature value; margins add an 'All' row and column.
    counts_table = pd.crosstab(working[feature], working.shot_made_flag, margins=True)
    counts_table['fg_pct'] = (counts_table[1]/counts_table['All']).round(3)

    # Discard the 'All' column and the trailing totals row before plotting.
    tab = counts_table.drop(columns='All')[:-1]
    make_text = [as_percent_label(pct) for pct in tab.fg_pct]
    miss_text = [as_percent_label(1-pct) for pct in tab.fg_pct]

    makes_bar = stacked_bar(tab[1], 'Makes', make_text, 12, 'red')
    misses_bar = stacked_bar(tab[0], 'Misses', miss_text, 10, 'blue')

    # Overlay following the number of makes; hover is disabled for it.
    makes_line = go.Scatter(
        x=tab.index,
        y=tab[1],
        mode='markers+lines',
        name='# Makes',
        hoverinfo='skip',
        line=dict(color='black', width=.75)
    )

    x_axis = dict(
        automargin=True,
        autorange=True,
        ticks='',
        showticklabels=True,
        #tickangle=25,
        title=feature.replace('_',' ').title()
    )
    y_axis = dict(
        automargin=True,
        ticks='',
        showticklabels=True,
        title='# of Shots'
    )
    layout = go.Layout(
        barmode='stack',
        title='FG% by ' + feature.title().replace('_',' '),
        showlegend=True,
        xaxis=x_axis,
        yaxis=y_axis
    )

    fig = go.Figure(data=[makes_bar, misses_bar, makes_line], layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
190 | 
191 | ########################--PERCENTAGE BAR CHART--########################
def pct_bar_plots(feature, dataframe, round_=False, player=None, team=None):
    """Plot stacked make/miss percentages for every value of `feature`.

    The crosstab is built with margins, so the chart also contains an 'All'
    bar showing the overall percentages of the selected shots.

    Args:
        feature: name of the column to group the shots by.
        dataframe: shot-level DataFrame with `feature`, `shot_made_flag`
            (1 = make, 0 = miss) and, when filtering, `name` / `team_name`.
        round_: when truthy, feature values are rounded to whole numbers
            before grouping (done on a copy of `dataframe`).
        player: optional player name; restricts the chart to that player.
        team: optional team name, used only when `player` is not given.

    Raises:
        ValueError: if `player` / `team` matches no rows.
    """
    if round_:
        df_ = dataframe.copy()
        df_[feature] = round(df_[feature])
    else:
        df_ = dataframe

    feature_label = feature.title().replace('_',' ')

    def _rows_for(column, wanted):
        # Compare case-insensitively instead of via str.title(): title-casing
        # corrupts real names ('LeBron James' -> 'Lebron James',
        # 'Philadelphia 76ers' -> 'Philadelphia 76Ers'), which silently
        # produced an empty selection for those players/teams.
        rows = df_[df_[column].str.lower() == wanted.lower()]
        if rows.empty:
            raise ValueError("No shots found for {!r} in column '{}'".format(wanted, column))
        # Use the spelling stored in the data for the chart title
        return rows, rows[column].iloc[0]

    if player:
        df, label = _rows_for('name', player)
        title = label + ' - FG% by ' + feature_label
    elif team:
        df, label = _rows_for('team_name', team)
        title = label + ' - FG% by ' + feature_label
    else:
        df = df_
        title = 'FG% by ' + feature_label

    test = pd.crosstab(df[feature], df.shot_made_flag, margins=True)
    test['pct_made'] = test[1]/test.All
    test['pct_missed'] = 1-test.pct_made

    made_text = [str(round(t*100,1)) + '%' for t in test.pct_made]
    missed_text = [str(round(t*100,1)) + '%' for t in test.pct_missed]

    trace1 = go.Bar(
        x=test.index,
        y=test.pct_made,
        name='Makes',
        text=made_text,
        textposition='auto',
        textfont=dict(
            family='sans serif',
            size=12, color='white'),
        marker=dict(
            color='red'),
        opacity=0.75
    )
    trace2 = go.Bar(
        x=test.index,
        y=test.pct_missed,
        name='Misses',
        text=missed_text,
        textposition='auto',
        textfont=dict(
            family='sans serif',
            size=12, color='white'),
        marker=dict(
            color='blue'),
        opacity=0.75,
    )

    data = [trace1, trace2]
    layout = go.Layout(
        barmode='stack',
        title=title,
        showlegend=True,
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
253 | 
254 | ############################--PPS HEATMAP--#############################
255 | #FIX FUNCTION - CHANGE ZONE TO FEATURE
def pps_heatmap(df, feature):
    """Show a team-by-`feature` heatmap of mean points per shot.

    Args:
        df: shot-level DataFrame with `team_name`, `pps` and `feature` columns.
        feature: name of the column whose values form the heatmap's x axis.
    """
    # Mean pps for every (team, feature value) pair; pairs with no shots get 0
    pps_tab = pd.crosstab(df.team_name, df[feature], values=df.pps,
                          aggfunc='mean', margins=False).fillna(0)

    team_heatmap = go.Heatmap(
        # One row per team, one column per feature value. The table is already
        # the 2-D matrix; slicing it row by row with a boolean mask produced a
        # nested list of (1, n_cols) arrays and rescanned the index per team.
        z=pps_tab.values,
        x=pps_tab.columns,
        # Label each row with the last word of the team name
        y=[team.split(' ')[-1] for team in pps_tab.index]
    )

    layout = go.Layout(
        title='Points Per Shot Heatmap',
        xaxis=dict(ticks='', nticks=len(pps_tab.columns)),
        yaxis=dict(ticks='', nticks=len(pps_tab.index)),
    )

    fig = go.Figure(data=[team_heatmap], layout=layout)
    plotly.offline.iplot(fig, filename='labelled-heatmap')
272 | 
273 | #############################--PIE CHART--#############################
def feature_pie_charts(feature):
    """Show a pie chart of how often each value of `feature` occurs.

    Reads the module-level shot DataFrame `df`.

    Args:
        feature: name of the column whose value frequencies are plotted.
    """
    # Take labels and values from the same value_counts() result so they stay
    # aligned: unique() returns values in order of appearance, while
    # value_counts() sorts by frequency, so pairing the two mislabels slices.
    counts = df[feature].value_counts()
    labels = counts.index.tolist()
    values = counts.values.tolist()
    colors = ['#FEBFB3', '#E1396C', '#005eff', '#D0F9B1']

    trace = go.Pie(labels=labels, values=values,
                   hoverinfo='label+percent', textinfo='value+percent',
                   textfont=dict(size=20),
                   marker=dict(colors=colors,
                               line=dict(color='#000000', width=1)))

    plotly.offline.iplot([trace], filename='styled_pie_chart')
286 | 
287 | ##########################--SHOT FREQ HEATMAP--#########################
def shot_freq_heatmap(name):
    """Show a player's shot-density heatmap with makes and misses overlaid.

    Reads the module-level shot DataFrame `df` and draws the court outline
    from the module-level `court_shapes`.

    Args:
        name: value of the `name` column identifying the player.
    """
    player_shots = df[df.name==name]
    made_shots = player_shots[player_shots.shot_made_flag == 1]
    missed_shots = player_shots[player_shots.shot_made_flag == 0]

    def shot_markers(shots, label, symbol, color):
        # Scatter layer for one outcome (make or miss)
        return go.Scatter(
            x=shots['x'],
            y=shots['y'],
            mode='markers',
            name=label,
            showlegend=True,
            marker=dict(symbol=symbol, opacity=0.7, color=color, size=4,
                        line=dict(width=1)),
        )

    make_layer = shot_markers(made_shots, 'Make', 'circle', 'green')
    miss_layer = shot_markers(missed_shots, 'Miss', 'x', 'yellow')

    # The density layer is computed over every shot, makes first then misses
    density_layer = go.Histogram2d(
        x=np.concatenate([made_shots['x'], missed_shots['x']]),
        y=np.concatenate([made_shots['y'], missed_shots['y']]),
        zmax=40,
        zmin=0,
        zsmooth='best',
        autobinx=True,
        autobiny=True,
        reversescale=False,
        opacity=.75,
    )

    axis_style = dict(ticks='', showgrid=False, zeroline=False, nticks=20)
    layout = go.Layout(
        xaxis=dict(range=[-250,250], **axis_style),
        yaxis=dict(range=[-47.5,450], **axis_style),
        autosize=False,
        height=600,
        width=750,
        hovermode='closest',
        shapes=court_shapes,
        title=name + ' - Shot Frequency',
        showlegend=True,
        legend=dict(x=1.2, y=1),
    )

    fig = go.Figure(data=[density_layer, make_layer, miss_layer], layout=layout)
    plotly.offline.iplot(fig)
360 | 


--------------------------------------------------------------------------------
/shot_chart_viz.py:
--------------------------------------------------------------------------------
  1 | ############################### IMPORTS ###############################
  2 | if True:
  3 |     import requests, time, itertools, math, shutil, matplotlib
  4 |     import pandas as pd
  5 |     import matplotlib.pyplot as plt
  6 |     # %matplotlib inline
  7 |     import seaborn as sns
  8 |     import numpy as np
  9 | 
 10 |     from court import court_shapes
 11 | 
 12 |     pd.set_option('display.max_columns',40)
 13 |     import warnings
 14 |     warnings.filterwarnings('ignore')
 15 | 
 16 |     import ipywidgets as widgets
 17 |     from ipywidgets import interact
 18 | 
 19 |     import plotly
 20 |     import plotly.plotly as py
 21 |     import plotly.graph_objs as go
 22 |     plotly.offline.init_notebook_mode(connected=True)
 23 | 
 24 | #####READ DATAFRAME#####
# Module-level shot DataFrame used by the functions in this file; the CSV's
# first column becomes the index.
df = pd.read_csv('final_df_1415.csv',index_col=0)
 26 | 
 27 | #####DRAW PLAYER SHOT CHART (PLOTLY)#####
 28 | def draw_shot_chart(name):
 29 |     player = df[df.name==name]
 30 | 
 31 |     missed_shot_trace = go.Scattergl(
 32 |         x = player[player.shot_made_flag == 0]['x'],
 33 |         y = player[player.shot_made_flag == 0]['y'],
 34 |         mode = 'markers',
 35 |         name = 'Miss',
 36 |         marker={'color':'blue', 'size':5}
 37 |     )
 38 |     made_shot_trace = go.Scattergl(
 39 |         x = player[player.shot_made_flag == 1]['x'],
 40 |         y = player[player.shot_made_flag == 1]['y'],
 41 |         mode = 'markers',
 42 |         name='Make',
 43 |         marker={'color':'red', 'size':5}
 44 |     )
 45 | 
 46 |     data = [missed_shot_trace, made_shot_trace]
 47 |     layout = go.Layout(
 48 |         title= name + ' Shot Chart 2014-2015',
 49 |         showlegend =True,
 50 |         xaxis={'showgrid':False, 'range':[-300,300]},
 51 |         yaxis={'showgrid':False, 'range':[-100,500]},
 52 |         height = 600,
 53 |         width = 650,
 54 |         shapes=court_shapes)
 55 | 
 56 |     fig = go.Figure(data=data, layout=layout)
 57 |     plotly.offline.iplot(fig, filename = name + ' Shot Chart')
 58 | 
 59 | #####DRAW TEAM SHOT CHART (PLOTLY)#####
 60 | def draw_team_sc(team):
 61 |     team_df = df[df.team_name==team]
 62 | 
 63 |     missed_shot_trace = go.Scattergl(
 64 |         x = team_df[team_df['shot_made_flag'] == 0]['x'],
 65 |         y = team_df[team_df['shot_made_flag'] == 0]['y'],
 66 |         mode = 'markers',
 67 |         name = 'Miss',
 68 |         marker={'color':'blue', 'size':5}
 69 |     )
 70 |     made_shot_trace = go.Scattergl(
 71 |         x = team_df[team_df['shot_made_flag'] == 1]['x'],
 72 |         y = team_df[team_df['shot_made_flag'] == 1]['y'],
 73 |         mode = 'markers',
 74 |         name='Make',
 75 |         marker={'color':'red', 'size':5}
 76 |     )
 77 | 
 78 |     data = [missed_shot_trace, made_shot_trace]
 79 |     layout = go.Layout(
 80 |         title= team + ' Shot Chart 2014-2015',
 81 |         showlegend =True,
 82 |         xaxis={'showgrid':False, 'range':[-300,300]},
 83 |         yaxis={'showgrid':False, 'range':[-100,500]},
 84 |         height = 600,
 85 |         width = 650,
 86 |         shapes=court_shapes)
 87 | 
 88 |     fig = go.Figure(data=data, layout=layout)
 89 |     plotly.offline.iplot(fig, filename = team + ' Shot Chart')
 90 | 
 91 | #####DROPDOWNS#####
# Interactive notebook controls. The whole block is switched off by the
# `if False:` guard, so none of it runs on import.
if False:
    # Disabled team selector wired to draw_team_sc:
    # team_dropdown = widgets.Dropdown(
    #     options = sorted(list(set(df.team_name))),
    #     value='New York Knicks',
    #     description='Team:',
    #     disabled=False,
    # )
    #
    # interact(draw_team_sc, team=team_dropdown);

    # Player selector listing every distinct name in df, alphabetically
    player_dropdown = widgets.Dropdown(
    options = sorted(list(set(df.name))),
    value='James Harden',
    description='Player:',
    disabled=False
    )

    # Slider passed to freq_shooting_plot as gridNum (5 to 60 in steps of 5)
    grid_slider = widgets.IntSlider(
    value=15,
    min=5, max=60,
    step=5,
    description='Bubble Size:',
    disabled=False,
    )

    # NOTE(review): freq_shooting_plot is defined further down in this file,
    # so enabling this block in place would raise NameError when run as a script.
    interact(freq_shooting_plot, player_name=player_dropdown, gridNum=grid_slider);
118 | 
119 | #####DRAW COURT MATPLOTLIB#####
def draw_court(ax=None, color='black', lw=2, outer_lines=False):
    """Draw a half-court outline with matplotlib patches.

    Args:
        ax: axes to draw on; the current axes are used when None.
        color: colour of every court line.
        lw: line width of every court line.
        outer_lines: when truthy, the half-court boundary rectangle is
            drawn as well.

    Returns:
        The axes the court was drawn on, with ticks and tick labels removed.
    """
    from matplotlib.patches import Circle, Rectangle, Arc

    if ax is None:
        ax = plt.gca()

    # Styling shared by every patch
    line = dict(linewidth=lw, color=color)

    patches = [
        # hoop
        Circle((0, 0), radius=7.5, fill=False, **line),
        # backboard
        Rectangle((-30, -7.5), 60, -1, **line),
        # paint: outer and inner boxes
        Rectangle((-80, -47.5), 160, 190, fill=False, **line),
        Rectangle((-60, -47.5), 120, 190, fill=False, **line),
        # free-throw circle: solid top half, dashed bottom half
        Arc((0, 142.5), 120, 120, theta1=0, theta2=180, fill=False, **line),
        Arc((0, 142.5), 120, 120, theta1=180, theta2=0, linestyle='dashed',
            **line),
        # restricted area
        Arc((0, 0), 80, 80, theta1=0, theta2=180, **line),
        # three-point line: two corner segments plus the arc
        Rectangle((-220, -47.5), 0, 140, **line),
        Rectangle((220, -47.5), 0, 140, **line),
        Arc((0, 0), 475, 475, theta1=22, theta2=158, **line),
        # centre-court arcs
        Arc((0, 422.5), 120, 120, theta1=180, theta2=0, **line),
        Arc((0, 422.5), 40, 40, theta1=180, theta2=0, **line),
    ]

    if outer_lines:
        boundary = Rectangle((-250, -47.5), 500, 470, fill=False, **line)
        patches.append(boundary)

    for patch in patches:
        ax.add_patch(patch)

    # Hide ticks and their labels
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_xticks([])
    ax.set_yticks([])
    return ax
162 | 
163 | #####FIND PLAYER FG% FOR EACH HEX#####
def find_shootingPcts(shot_df, gridNum):
    """Compute the field-goal fraction for every hexbin on the court.

    Only shots with y below 425.1 are considered.

    Args:
        shot_df: shot-level DataFrame with `x`, `y` and `shot_made_flag`.
        gridNum: `gridsize` passed to `plt.hexbin`.

    Returns:
        Tuple `(pct_by_bin, attempts_hex)`: the makes/attempts ratio for each
        hexbin (bins where the ratio is NaN are set to 0) and the hexbin
        collection built from all attempts.
    """
    on_court = shot_df['y']<425.1
    made_on_court = (shot_df['shot_made_flag']==1) & on_court
    court_extent = (-250,250,425,-50)

    # Bin the attempts and the makes on the same grid; the throwaway figures
    # created by hexbin are closed straight away
    attempts_hex = plt.hexbin(shot_df.x[on_court], shot_df.y[on_court],
                              gridsize=gridNum, extent=court_extent)
    plt.close()
    makes_hex = plt.hexbin(shot_df.x[made_on_court], shot_df.y[made_on_court],
                           gridsize=gridNum, extent=court_extent,
                           cmap=plt.cm.Reds)
    plt.close()

    pct_by_bin = makes_hex.get_array() / attempts_hex.get_array()
    pct_by_bin[np.isnan(pct_by_bin)] = 0  # 0/0 bins count as 0
    return (pct_by_bin, attempts_hex)
181 | 
182 | #####SCRAPE PLAYER IMAGE#####
def acquire_playerPic(player_id, zoom, offset=(-165,400), timeout=30):
    """Download a player's headshot and wrap it as a matplotlib annotation.

    The image is saved under 'scraped_images/player_images/<id>.png' and then
    read back from that file.

    Args:
        player_id: object with a `.unique()` method (e.g. a pandas Series);
            the first unique value is used as the player id.
        zoom: zoom factor passed to `OffsetImage`.
        offset: data coordinates at which the image is anchored.
        timeout: seconds to wait for the server before giving up.

    Returns:
        An `AnnotationBbox` holding the image, ready for `ax.add_artist`.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import os
    from matplotlib import offsetbox as osb

    ID = str(player_id.unique()[0])

    url = "http://stats.nba.com/media/players/230x185/"+ ID +".png"
    image_dir = 'scraped_images/player_images/'
    image_path = image_dir + ID + '.png'
    os.makedirs(image_dir, exist_ok=True)

    # A timeout keeps a stalled server from hanging the plot indefinitely
    pic = requests.get(url, stream=True, timeout=timeout)
    try:
        # Fail here rather than writing an error page to disk as a '.png'
        pic.raise_for_status()
        # Have the raw stream undo any gzip/deflate transfer encoding
        pic.raw.decode_content = True
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(pic.raw, out_file)
    finally:
        pic.close()

    player_pic = plt.imread(image_path)
    img = osb.OffsetImage(player_pic, zoom)
    img = osb.AnnotationBbox(img, offset,xycoords='data',pad=0.0, box_alignment=(1,0), frameon=False)

    return img
198 | 
199 | #####SCRAPE TEAM LOGO#####
def get_team_logo(team_acronym, zoom, offset=(-185,400), timeout=30):
    """Download a team's logo and wrap it as a matplotlib annotation.

    The image is saved under 'scraped_images/team_images/<acronym>.png' and
    then read back from that file.

    Args:
        team_acronym: team abbreviation (a string) used in the logo URL.
        zoom: zoom factor passed to `OffsetImage`.
        offset: data coordinates at which the image is anchored.
        timeout: seconds to wait for the server before giving up.

    Returns:
        An `AnnotationBbox` holding the image, ready for `ax.add_artist`.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    import os
    from matplotlib import offsetbox as osb

    URL = 'https://www.nba.com/assets/logos/teams/primary/web/' + team_acronym + '.png'
    image_dir = 'scraped_images/team_images/'
    image_path = image_dir + str(team_acronym) + '.png'
    os.makedirs(image_dir, exist_ok=True)

    # A timeout keeps a stalled server from hanging the plot indefinitely
    pic = requests.get(URL, stream=True, timeout=timeout)
    try:
        # Fail here rather than writing an error page to disk as a '.png'
        pic.raise_for_status()
        # Have the raw stream undo any gzip/deflate transfer encoding
        pic.raw.decode_content = True
        with open(image_path, 'wb') as out_file:
            shutil.copyfileobj(pic.raw, out_file)
    finally:
        pic.close()

    team_pic = plt.imread(image_path)
    img = osb.OffsetImage(team_pic, zoom)
    img = osb.AnnotationBbox(img, offset,xycoords='data',pad=0.0, box_alignment=(1,0), frameon=False)

    return img
215 | 
216 | #####COLOR MAP DICTIONARY#####
# Segment data for a LinearSegmentedColormap: for each colour channel, a list
# of (position, value, value) anchors at positions 0, 0.25, 0.5, 0.75 and 1.
cdict = {
    'blue': [(0.0, 0.6313725709915161, 0.6313725709915161), (0.25, 0.4470588266849518, 0.4470588266849518), (0.5, 0.29019609093666077, 0.29019609093666077), (0.75, 0.11372549086809158, 0.11372549086809158), (1.0, 0.05098039284348488, 0.05098039284348488)],
    'green': [(0.0, 0.7333333492279053, 0.7333333492279053), (0.25, 0.572549045085907, 0.572549045085907), (0.5, 0.4156862795352936, 0.4156862795352936), (0.75, 0.0941176488995552, 0.0941176488995552), (1.0, 0.0, 0.0)],
    'red': [(0.0, 0.9882352948188782, 0.9882352948188782), (0.25, 0.9882352948188782, 0.9882352948188782), (0.5, 0.9843137264251709, 0.9843137264251709), (0.75, 0.7960784435272217, 0.7960784435272217), (1.0, 0.40392157435417175, 0.40392157435417175)]}
mymap = matplotlib.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)
# mymap is rebound here to a map built from three colour stops: red at 0,
# yellow at 0.45, green at 1.
# NOTE(review): this looks like it discards the cdict-based map created on the
# previous line, leaving cdict effectively unused — confirm before removing it.
mymap = mymap.from_list('Color Map',[(0,'#ff0000'),(.45,'#ffff00'),(1,'#00ff00')])
223 | 
224 | ####################CALCULATE SEASON STATS TO ADD TO CHART####################
def get_season_stats(player_name):
    """Compute a player's shooting summary and a text block for the chart.

    Reads the module-level shot DataFrame `df`.

    Args:
        player_name: value of the `name` column identifying the player.

    Returns:
        Tuple `(stats, printout)`: a dict with the keys NUM_GAMES, FG_PCT,
        THREE_PT_PCT, EFFECTIVE_FG_PCT, POINTS_PER_SHOT and AVG_SHOT_DISTANCE,
        and the same figures formatted as a multi-line string.

    Raises:
        ValueError: if `df` holds no shots for `player_name`.
    """
    player = df[df.name==player_name]
    if player.empty:
        raise ValueError('No shots found for player {!r}'.format(player_name))

    made = player.shot_made_flag
    # shot_type is compared against the integers 2 and 3, as the original
    # three-point filter did
    is_three = player.shot_type==3
    is_two = player.shot_type==2

    stats = {}

    stats['NUM_GAMES'] = len(player.game_date.unique())
    # Aggregate only shot_made_flag: averaging the whole frame fails on
    # non-numeric columns in current pandas. As before, the per-season
    # averages are summed.
    stats['FG_PCT'] = float(player.groupby('season').shot_made_flag.mean().sum())
    stats['THREE_PT_PCT'] = float(player[is_three].groupby('season').shot_made_flag.mean().sum())

    # eFG% = (made twos + 1.5 * made threes) / attempts. Select each shot type
    # explicitly instead of by group position, which broke (IndexError or
    # swapped values) for players missing one of the two shot types.
    twos = made[is_two].sum()
    threes = made[is_three].sum() * 1.5
    stats['EFFECTIVE_FG_PCT'] = (twos+threes)/player.shape[0]

    stats['POINTS_PER_SHOT'] = round(player.pps.mean(),3)
    stats['AVG_SHOT_DISTANCE'] = round(player.shot_distance.mean())

    # The values are consumed in insertion order, matching the placeholders
    printout = """Games: {}\nFG: {:4.1%}\n3PT: {:4.1%}\nEFG: {:4.1%}\nPoints per Shot: {}\nAvg Shot Dist.: {} ft.""".format(*stats.values())

    return stats, printout
244 | 
245 | ##################CALCULATE TEAM STATS TO ADD TO CHART########################
def get_team_stats(team):
    """Compute a team's shooting summary and a text block for the chart.

    Reads the module-level shot DataFrame `df`.

    Args:
        team: value of the `team_name` column identifying the team.

    Returns:
        Tuple `(stats, printout)`: a dict with the keys FG_PCT, THREE_PT_PCT,
        EFFECTIVE_FG_PCT, POINTS_PER_SHOT and AVG_SHOT_DISTANCE, and the same
        figures formatted as a multi-line string.

    Raises:
        ValueError: if `df` holds no shots for `team`.
    """
    team_df = df[df.team_name==team]
    if team_df.empty:
        raise ValueError('No shots found for team {!r}'.format(team))

    made = team_df.shot_made_flag
    # shot_type is compared against the integers 2 and 3, as the original
    # three-point filter did
    is_three = team_df.shot_type==3
    is_two = team_df.shot_type==2

    stats = {}

    # Aggregate only shot_made_flag: averaging the whole frame fails on
    # non-numeric columns in current pandas. As before, the per-season
    # averages are summed.
    stats['FG_PCT'] = float(team_df.groupby('season').shot_made_flag.mean().sum())
    stats['THREE_PT_PCT'] = float(team_df[is_three].groupby('season').shot_made_flag.mean().sum())

    # eFG% = (made twos + 1.5 * made threes) / attempts. Select each shot type
    # explicitly instead of by group position, which broke when one of the two
    # shot types was absent.
    twos = made[is_two].sum()
    threes = made[is_three].sum() * 1.5
    stats['EFFECTIVE_FG_PCT'] = (twos+threes)/team_df.shape[0]

    stats['POINTS_PER_SHOT'] = round(team_df.pps.mean(),3)
    stats['AVG_SHOT_DISTANCE'] = round(team_df.shot_distance.mean())

    # The values are consumed in insertion order, matching the placeholders
    printout = """FG: {:4.1%}\n3PT: {:4.1%}\nEFG: {:4.1%}\nPoints per Shot: {}\nAvg Shot Dist.: {} ft.""".format(*stats.values())

    return stats, printout
263 | 
264 | #################PLOT PLAYER FREQUENCY SHOT CHART (MATPLOTLIB)################
def freq_shooting_plot(player_name,gridNum=25):
    """Draw a player's bubble shot chart: size = attempts, colour = FG%.

    Reads the module-level shot DataFrame `df`, downloads the player's
    headshot and shows the figure.

    Args:
        player_name: value of the `name` column identifying the player.
        gridNum: hexbin grid size; it also caps the bubble radius at
            240/gridNum.

    Returns:
        The matplotlib axes holding the court and the bubbles.
    """
    from matplotlib.patches import Circle

    plot_size=(12,8)
    shot_df = df[df.name==player_name]

    #compute shooting percentage and # of shots
    (ShootingPctLocs, shotNumber) = find_shootingPcts(shot_df, gridNum)

    #draw figure and court
    fig = plt.figure(figsize=plot_size)#(12,7)
    cmap = mymap #my modified colormap
    ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
    draw_court(outer_lines=False)
    plt.xlim(-250,250)
    plt.ylim(400, -25)

    #draw player image
    # Built-in float replaces np.float, an alias removed from NumPy 1.24 that
    # raises AttributeError there. The zoom scales with the figure width.
    zoom = float(plot_size[0])/(12.0*2)
    img = acquire_playerPic(shot_df.player_id, zoom)
    ax.add_artist(img)

    #draw circles
    max_radius = 240/gridNum
    centers = shotNumber.get_offsets()
    attempts = shotNumber.get_array()
    for i, shots in enumerate(ShootingPctLocs):
        # radius is the attempt count for the bin, capped at max_radius
        restricted = Circle(centers[i], radius=attempts[i],
                            color=cmap(shots),alpha=1, fill=True)
        if restricted.radius > max_radius: restricted.radius=max_radius
        ax.add_patch(restricted)

    #draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2,cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %')
    cb.set_ticks([0.0, 0.25, 0.5, 0.75, 1.0])
    cb.set_ticklabels(['0%','25%', '50%','75%', '100%'])

    ax.set_title(shot_df.name.unique()[0] +' - Shot Chart 2014-15')
    #plot season stats
    ax.text(135,395,get_season_stats(player_name)[1])
    plt.show()
    return ax
308 | 
309 | #################PLOT TEAM FREQUENCY SHOT CHART (MATPLOTLIB)#################
def team_freq_plot(team, gridNum=25):
    """Draw a team's bubble shot chart: size = attempts, colour = FG%.

    Reads the module-level shot DataFrame `df`, downloads the team logo and
    shows the figure.

    Args:
        team: value of the `team_name` column identifying the team.
        gridNum: hexbin grid size; it also caps the bubble radius at
            240/gridNum.
    """
    from matplotlib.patches import Circle

    figure_size = (12,8)
    team_shots = df[df.team_name==team]

    # FG fraction per hexbin, plus the hexbin collection of all attempts
    (pct_by_bin, attempts_hex) = find_shootingPcts(team_shots, gridNum)

    # Figure, main axes and court outline
    fig = plt.figure(figsize=figure_size)
    court_ax = plt.axes([0.1, 0.1, 0.8, 0.8])
    draw_court(outer_lines=False)
    plt.xlim(-250,250)
    plt.ylim(400, -25)

    # The team's acronym is read from `htm` on rows flagged as home games
    played_at_home = team_shots.is_home==1
    acronym = team_shots.htm[played_at_home].unique()[0]
    logo = get_team_logo(acronym, 1)
    court_ax.add_artist(logo)

    # One bubble per hexbin, radius = attempts (capped), colour = FG fraction
    radius_cap = 240/gridNum
    bin_centers = attempts_hex.get_offsets()
    bin_attempts = attempts_hex.get_array()
    for idx, pct in enumerate(pct_by_bin):
        bubble = Circle(bin_centers[idx], radius=bin_attempts[idx],
                        color=mymap(pct), alpha=.95, fill=True)
        if bubble.radius > radius_cap:
            bubble.radius = radius_cap
        court_ax.add_patch(bubble)

    # Colour bar on the right-hand side
    bar_ax = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    color_bar = matplotlib.colorbar.ColorbarBase(bar_ax, cmap=mymap, orientation='vertical')
    color_bar.set_label('Field Goal %')
    color_bar.set_ticks([0.0, 0.25, 0.5, 0.75, 1.0])
    color_bar.set_ticklabels(['0%','25%', '50%','75%', '100%'])

    court_ax.set_title(team_shots.team_name.unique()[0] +' - Shot Chart 2014-15')
    # Team summary text
    _, summary = get_team_stats(team)
    court_ax.text(150,395,summary)
    plt.show()
351 | 


--------------------------------------------------------------------------------
/shallow_ML_models.py:
--------------------------------------------------------------------------------
  1 | ############################### IMPORTS ###############################
  2 | if True:
  3 |     import pandas as pd
  4 |     import numpy as np
  5 |     import matplotlib
  6 |     import matplotlib.pyplot as plt
  7 |     import seaborn as sns
  8 |     import itertools, math, time, re, pickle
  9 | 
 10 |     import warnings
 11 |     warnings.filterwarnings('ignore')
 12 | 
 13 |     import plotly
 14 |     import plotly.plotly as py
 15 |     import plotly.graph_objs as go
 16 |     plotly.offline.init_notebook_mode(connected=True)
 17 | 
 18 |     from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, ShuffleSplit
 19 |     from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
 20 |     from sklearn.linear_model import LogisticRegression
 21 |     from sklearn.metrics import accuracy_score, auc, confusion_matrix, precision_score, recall_score, roc_curve, f1_score
 22 |     from sklearn.preprocessing import MinMaxScaler
 23 | 
 24 |     from xgboost import XGBClassifier
 25 | 
 26 |     from pactools.grid_search import GridSearchCVProgressBar
 27 | 
 28 | ############################## LOAD DATA ##############################
# Legacy data-preparation path, switched off by the `if False:` guard.
# It reads 'data/final_df.csv', drops identifier/context columns and the
# target, min-max scales what is left and writes X, y and the column names
# under ./X_y_arrays/.
if False:
    df = pd.read_csv('data/final_df.csv', index_col=0)

    # Feature matrix: everything except the columns listed here
    X = df.drop(columns=['name', 'age', 'pos','player_id','team_id', 'opp_id', 'team_name', 'game_date', 'opponent', 'defender_name', 'game_id', 'action_type', 'season', 'htm', 'vtm', 'game_event_id',  'minutes_remaining', 'seconds_remaining',
    'defender_id', 'shot_type', 'Heave', 'heave_pct', 'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'Above Break 3', 'Corner 3', 'Mid Range', 'Paint', 'Restricted Area', 'C', 'L', 'R', 'dribbles', 'shot_distance', 'shot_made_flag'])
    # Target: the make/miss flag
    y = np.array(df.shot_made_flag)

    # Keep the column names; X becomes a plain array after scaling
    X_col_names = X.columns
    with open('./X_y_arrays/X_column_names', 'wb') as x_col:
        pickle.dump(X_col_names, x_col)

    minmax_scale = MinMaxScaler()
    X = minmax_scale.fit_transform(X)

    np.save('./X_y_arrays/X_shallow', X)
    np.save('./X_y_arrays/y_shallow', y)
 45 | 
 46 | #new data
# Active data-preparation path: runs on every import/execution of this file.
# It reads 'final_df_1415.csv', builds X and y, min-max scales X and writes
# X, y and the column names under ./X_y_arrays/.
if True:
    df = pd.read_csv('final_df_1415.csv', index_col=0)
    # NOTE(review): 'zone_id' and 'period' are both in the drop list below, so
    # this cast does not affect X.
    df[['zone_id', 'period']] = df[['zone_id', 'period']].astype('category')

    # Feature matrix: everything except the columns listed here
    X = df.drop(columns=['name', 'team_name', 'game_date', 'season', 'team_id','minutes_remaining', 'seconds_remaining', 'shot_made_flag', 'shot_type', 'opponent', 'x', 'y', 'defender_name', 'opp_id', 'game_id', 'game_event_id',
    'player_id', 'shot_zone_basic', 'shot_zone_area', 'shot_zone_range', 'htm', 'vtm', 'pos', 'age', 'defender_id', 'zone', 'pps', 'zone_id', 'zone_minus_lg_avg', 'lg_zone_avg',
    'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'dribbles', 'period', 'action_type', 'ts%', 'dbpm', '3par', 'usg%', 'blk_pct', 'def_rating'])
    # Target: the make/miss flag
    y = np.array(df.shot_made_flag)

    # Keep the column names; X becomes a plain array after scaling
    X_col_names = X.columns
    with open('./X_y_arrays/X_column_names', 'wb') as x_col:
        pickle.dump(X_col_names, x_col)

    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split performed later in this file, so test rows influence the scaling.
    minmax_scale = MinMaxScaler()
    X = minmax_scale.fit_transform(X)

    np.save('./X_y_arrays/X_shallow', X)
    np.save('./X_y_arrays/y_shallow', y)
 65 | ################### SPLIT DATA INTO TRAIN/TEST SETS ###################
# Reload the saved column names and arrays from ./X_y_arrays/, then hold out
# 20% of the rows as a test set (fixed seed 23).
if True:
    with open ('./X_y_arrays/X_column_names', 'rb') as fp:
        X_col_names = pickle.load(fp)

    X = np.load('./X_y_arrays/X_shallow.npy')
    y = np.load('./X_y_arrays/y_shallow.npy')

    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=23,test_size=.2)
 74 | 
 75 | ########################## HELPER FUNCTIONS ##########################
 76 | def build_model(model, path, X_train, X_test, y_train, y_test, decision_function=True):
 77 |     start = time.time()
 78 | 
 79 |     clf = model
 80 |     clf.fit(X_train,y_train)
 81 |     y_hat_test = clf.predict(X_test)
 82 | 
 83 |     if decision_function==True:
 84 |         y_score = clf.decision_function(X_test)
 85 |     else:
 86 |         y_score = clf.predict_proba(X_test)[:, 1]
 87 | 
 88 |     fpr, tpr, thresholds = roc_curve(y_test, y_score)
 89 | 
 90 |     #Save model
 91 |     with open('./models/'+ path + '/' + str(path) + '_' + time.asctime().replace(' ', '_'), 'wb') as f:
 92 |         pickle.dump(clf, f)
 93 | 
 94 |     print('Total Runtime: {} seconds'.format(time.time()-start))
 95 |     return clf, y_hat_test, y_score, fpr, tpr
 96 | 
 97 | def plot_feature_importances(model, path):
 98 |     matplotlib.style.use('fivethirtyeight')
 99 |     n_features = X.shape[1]
100 |     plt.figure(figsize=(10,6))
101 |     plt.barh(range(n_features), model.feature_importances_, align='center')
102 |     plt.yticks(np.arange(n_features), X_col_names)
103 |     plt.xlabel("Feature importance")
104 |     plt.ylabel("Features")
105 |     #Save output
106 |     plt.savefig('./models/'+ path + '/feature_importances/' + time.asctime().replace(' ', '_') + '.png')
107 |     plt.show()
108 | 
def plot_confusion_matrix(cm, path, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw, save and show an annotated confusion matrix.

    Class labels for the axes come from the module-level target array `y`.

    Args:
        cm: square confusion matrix (rows = actual, columns = predicted).
        path: sub-directory of ./models/ whose cm/ folder receives the PNG.
        title: figure title.
        cmap: colormap used for the cells.
    """
    #Create the basic matrix.
    plt.imshow(cm, cmap)

    #Add title and Axis Labels
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    #Add appropriate Axis Scales
    # Sorted unique labels: this is the row/column order sklearn's
    # confusion_matrix uses by default, whereas a plain set() has no
    # guaranteed order and is not a sequence.
    class_names = np.unique(y)
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)

    #Add a Side Bar Legend Showing Colors
    plt.colorbar()

    #Add Labels to Each Cell (white text on the darkest cells)
    thresh = cm.max()*.75
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    #Save output
    plt.savefig('./models/'+ path + '/cm/' + time.asctime().replace(' ', '_') + '.png', bbox_inches='tight', dpi=480)
    plt.show()
135 | 
def print_model_metrics(y_pred, y_score, path):
    """Plot the confusion matrix, then print and save the test-set metrics.

    Everything is evaluated against the module-level `y_test`.

    Args:
        y_pred: predicted labels for the test set.
        y_score: continuous scores for the test set, used for the ROC AUC.
        path: sub-directory of ./models/ that receives the outputs.
    """
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), path,
                          title='Confusion matrix', cmap=plt.cm.Blues)

    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    false_pos, true_pos, _ = roc_curve(y_test, y_score)
    auc_ = auc(false_pos, true_pos)

    report = [
        ('Accuracy:   {}', accuracy),
        ('Precision:  {}', precision),
        ('Recall:     {}', recall),
        ('F1          {}', f1),
        ('AUC:        {}', auc_),
    ]
    for template, value in report:
        print(template.format(round(value,4)))

    # Save the unrounded metrics under a timestamped file name
    metrics = np.array([value for _, value in report])
    np.save('./models/'+ path + '/metrics/' + time.asctime().replace(' ', '_'), metrics)
156 | 
def plot_roc_curve(fpr, tpr, path):
    """Plot, save and show a ROC curve next to the chance diagonal.

    Args:
        fpr: false-positive rates.
        tpr: true-positive rates matching `fpr`.
        path: sub-directory of ./models/ whose roc_curves/ folder receives
            the PNG.
    """
    sns.set_style("darkgrid", {"axes.facecolor": ".9"})

    line_width = 2
    tick_positions = [i/20.0 for i in range(21)]  # 0.00, 0.05, ..., 1.00

    plt.figure(figsize=(10,6))
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label='ROC curve')
    # Reference line for a random classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks(tick_positions)
    plt.xticks(tick_positions)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    # Save under a timestamped file name, then display
    image_file = './models/'+ path + '/roc_curves/' + time.asctime().replace(' ', '_') + '.png'
    plt.savefig(image_file, bbox_inches='tight', dpi=480)
    plt.show()
177 | ######################################################################
178 | 
179 | ############################# GRID SEARCH ############################
def run_grid_search(model, path, param_grid, X, y, cv=3):
    """Run a cross-validated grid search scored by ROC AUC and save the results.

    Args:
        model: estimator to tune.
        path: prefix of the results file written under ./grid_search_results/.
        param_grid: parameter grid passed to the search.
        X: feature matrix the search is fitted on.
        y: target array.
        cv: cross-validation setting passed to the search.

    Returns:
        Tuple `(search_results, best_score, best_params)`: the `cv_results_`
        as a DataFrame, the best mean cross-validated ROC AUC and the
        parameters that achieved it.
    """
    start = time.time()

    search = GridSearchCVProgressBar(model, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
    search.fit(X,y)

    # Fixed-point format: '{:.4}' falls back to scientific notation once the
    # runtime needs more than four significant digits.
    print("Total Runtime for Grid Search: {:.2f} seconds".format(time.time() - start))

    best_score = search.best_score_
    best_params = search.best_params_

    # The search is scored with 'roc_auc', so best_score_ is the best mean
    # cross-validated ROC AUC, not a test-set accuracy.
    print("Best Mean CV ROC AUC: {:.4}%".format(best_score * 100))
    print("\nOptimal Parameters: {}".format(best_params))

    search_results = pd.DataFrame.from_dict(search.cv_results_)

    search_results.to_csv('./grid_search_results/'+ path + '_' + str(round(best_score,4)).replace('.','') + '_' + time.asctime().replace(' ', '_'))

    return search_results, best_score, best_params
199 | ######################################################################
200 | 
201 | ########################## PARAMETER GRIDS ###########################
if True:
    # Candidate hyper-parameter values, one grid per model family.

    # Logistic regression: penalty type and ten C values log-spaced over 1e0..1e4.
    log_reg_param_grid = dict(
        penalty=['l1', 'l2'],
        C=np.logspace(0, 4, 10),
    )

    # Random forest.
    rf_param_grid = dict(
        n_estimators=[100, 250],
        criterion=['gini', 'entropy'],
        min_samples_leaf=[2, 5, 10],
        min_samples_split=[2, 5, 10],
        n_jobs=[-1],
    )

    # Gradient boosting.
    gb_param_grid = dict(
        n_estimators=[50, 100, 250],
        learning_rate=[0.01, 0.05, 0.1, 1],
        min_samples_leaf=[2, 5, 10],
        min_samples_split=[2, 5, 10],
        max_depth=[2, 5, 10],
    )

    # XGBoost.
    xgb_param_grid = dict(
        learning_rate=[0.01, 0.05, 0.1, 1],
        n_estimators=[100, 250],
        max_depth=[2, 5, 10],
        min_child_weight=[1, 5, 10],
        gamma=[0.5, 1, 2],
    )
227 | 
228 | ######################################################################
229 | 
230 | 
231 | ######################## LOGISTIC REGRESSION #########################
if True:
    # Toggle block (currently enabled): logistic-regression experiments.
    #
    # Disabled single fit on the train/test split, kept for reference:
    # log_reg, log_y_preds, log_y_score, log_fpr, log_tpr = build_model(LogisticRegression(C=1, class_weight='balanced'),
    # 'logreg', X_train, X_test, y_train, y_test)
    #
    # print_model_metrics(log_y_preds, log_y_score, 'logreg')
    # plot_roc_curve(log_fpr, log_tpr, 'logreg')

    # 10-fold grid search over log_reg_param_grid, run on the full X / y.
    # NOTE(review): the grid includes penalty='l1'; confirm the solver used by
    # the installed LogisticRegression supports it.
    log_reg_search_results, log_reg_best_score, log_reg_best_params = run_grid_search(LogisticRegression(random_state=23),'logreg', log_reg_param_grid, X, y, cv=10)
240 | ######################################################################
241 | 
242 | 
243 | ###################### RANDOM FOREST CLASSIFIER ######################
if False:
    # Toggle block (currently disabled): fit one random forest on the
    # train/test split, then print metrics and plot ROC curve and feature
    # importances under the 'rf' tag.
    rf, rf_y_preds, rf_y_score, rf_fpr, rf_tpr = build_model(RandomForestClassifier(n_estimators=500, criterion='gini', min_samples_leaf=10, min_samples_split=10, verbose=.5, class_weight='balanced', n_jobs=-1, random_state=23),
    'rf', X_train, X_test, y_train, y_test, decision_function=False)

    print_model_metrics(rf_y_preds, rf_y_score, 'rf')
    plot_roc_curve(rf_fpr, rf_tpr, 'rf')
    plot_feature_importances(rf, 'rf')

    # Disabled grid search over rf_param_grid:
    # rf_search_results, rf_best_score, rf_best_params = run_grid_search(RandomForestClassifier(random_state=23),'rf', rf_param_grid, X, y, cv=3)

    # Recorded output of an earlier grid-search run. The "Testing Accuracy"
    # figure comes from run_grid_search, which scores with scoring='roc_auc'.
    # [ParallelProgressBar(n_jobs=-1)]: Done 108 out of 108 | elapsed: 67.1min finished
    # Total Runtime for Grid Search: 4.095e+03 seconds
    # Testing Accuracy: 70.82%
    #
    # Optimal Parameters: {'criterion': 'gini', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 250, 'n_jobs': -1}
259 | ######################################################################
260 | 
261 | 
262 | #################### GRADIENT BOOSTING CLASSIFIER ####################
if False:
    # Toggle block (currently disabled): fit one gradient-boosting classifier
    # on the train/test split, then print metrics and plot ROC curve and
    # feature importances under the 'gb' tag.
    gb, gb_y_preds, gb_y_score, gb_fpr, gb_tpr = build_model(GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=5, min_samples_leaf=7, min_samples_split=7, verbose=1, random_state=23),
    'gb', X_train, X_test, y_train, y_test)

    print_model_metrics(gb_y_preds, gb_y_score, 'gb')
    plot_roc_curve(gb_fpr, gb_tpr, 'gb')
    plot_feature_importances(gb, 'gb')
270 | ######################################################################
271 | 
272 | 
273 | ######################### ADABOOST CLASSIFIER #########################
if False:
    # Toggle block (currently disabled): fit one AdaBoost classifier on the
    # train/test split, then print metrics and plot ROC curve and feature
    # importances under the 'ada' tag.
    ada, ada_y_preds, ada_y_score, ada_fpr, ada_tpr = build_model(AdaBoostClassifier(learning_rate=.01, n_estimators=500, algorithm='SAMME.R', random_state=23),
    'ada', X_train, X_test, y_train, y_test)

    print_model_metrics(ada_y_preds, ada_y_score, 'ada')
    plot_roc_curve(ada_fpr, ada_tpr, 'ada')
    plot_feature_importances(ada, 'ada')
281 | ######################################################################
282 | 
283 | 
284 | ######################### XGBOOST CLASSIFIER #########################
if False:
    # Toggle block (currently disabled): fit one XGBoost classifier on the
    # train/test split, then print metrics and plot ROC curve and feature
    # importances under the 'xgb' tag.
    # NOTE(review): algorithm='SAMME.R' is the same value this file passes to
    # AdaBoostClassifier; confirm XGBClassifier actually uses it.
    xgb, xgb_y_preds, xgb_y_score, xgb_fpr, xgb_tpr = build_model(XGBClassifier(learning_rate=0.1, n_estimators=500, max_depth=5, min_child_weight=1, gamma=1, algorithm='SAMME.R', objective='binary:logistic', reg_alpha=0, reg_lambda=0, n_jobs=-1, random_state=23),
    'xgb', X_train, X_test, y_train, y_test, decision_function=False)

    print_model_metrics(xgb_y_preds, xgb_y_score, 'xgb')
    plot_roc_curve(xgb_fpr, xgb_tpr, 'xgb')
    plot_feature_importances(xgb, 'xgb')

    # Disabled grid search over xgb_param_grid (training split, default cv):
    # xgb_search_results, xgb_best_score, xgb_best_params = run_grid_search(XGBClassifier(random_state=23),'xgb', xgb_param_grid, X_train, y_train)

    # Recorded output of an earlier grid-search run. The "Testing Accuracy"
    # figure comes from run_grid_search, which scores with scoring='roc_auc'.
    # Testing Accuracy: 72.23%
    #
    # Optimal Parameters: {'gamma': 1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 250}
298 | ######################################################################
299 | 
300 | 
301 | ######################## STACKED ENSEMBLE MODEL ######################
def create_ensemble_model(X,y):
    """Fit a stacked ensemble and predict the held-out split.

    Splits ``X``/``y`` 80/20 (``random_state=23``), fits a random forest, an
    XGBoost classifier and an AdaBoost classifier on the training split, and
    feeds their ``predict_proba`` outputs to a logistic-regression
    meta-model.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        Base-model class probabilities for the train / test split. Columns
        are prefixed with the producing model's tag, e.g. 'rf_0', 'rf_1',
        'xgb_0', ...
    y_preds
        Meta-model class predictions for the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23, test_size=.2)

    # NOTE(review): algorithm='SAMME.R' on XGBClassifier is the same value
    # passed to AdaBoostClassifier; confirm XGBoost actually uses it.
    base_models = [
        ('rf', RandomForestClassifier(n_estimators=500, criterion='gini',  max_features='sqrt', min_samples_leaf=10, min_samples_split=2, verbose=1, class_weight='balanced', n_jobs=-1, random_state=23)),
        ('xgb', XGBClassifier(learning_rate=0.1, n_estimators=250, max_depth=5, min_child_weight=1, gamma=1, algorithm='SAMME.R', objective='binary:logistic', n_jobs=-1, random_state=23)),
        ('ada', AdaBoostClassifier(learning_rate=.75, n_estimators=500, algorithm='SAMME.R', random_state=23)),
    ]

    train_parts = []
    test_parts = []
    for tag, clf in base_models:
        clf.fit(X_train, y_train)
        # Prefix the probability columns with the model tag. The previous
        # pd.concat(..., names=[...]) call had no effect without `keys`, so
        # the stacked frame ended up with duplicate column labels 0,1,0,1,0,1.
        train_parts.append(pd.DataFrame(clf.predict_proba(X_train)).add_prefix(tag + '_'))
        test_parts.append(pd.DataFrame(clf.predict_proba(X_test)).add_prefix(tag + '_'))

    train_df = pd.concat(train_parts, axis=1)
    test_df = pd.concat(test_parts, axis=1)

    # NOTE(review): the meta-model is trained on the base models' predictions
    # for the same rows they were fitted on; consider out-of-fold predictions.
    model = LogisticRegression(random_state=1)
    model.fit(train_df,y_train)
    y_preds = model.predict(test_df)

    return train_df, test_df, y_preds
332 | ######################################################################
333 | 


--------------------------------------------------------------------------------
/new_ETL.py:
--------------------------------------------------------------------------------
  1 | import matplotlib
  2 | import matplotlib.pyplot as plt
  3 | import seaborn as sns
  4 | import numpy as np
  5 | import pandas as pd
  6 | pd.set_option('display.max_columns',100)
  7 | 
  8 | import warnings
  9 | warnings.filterwarnings('ignore')
 10 | 
 11 | import itertools, math, time, re
 12 | 
 13 | ############################--LOAD DATA--#############################
 14 | def load_data_to_df():
 15 |     oct_nov_ = pd.read_csv('./data/nba_savant/oct-nov-14-15.csv')
 16 |     dec_ = pd.read_csv('./data/nba_savant/dec-14-15.csv')
 17 |     jan_ = pd.read_csv('./data/nba_savant/jan-14-15.csv')
 18 |     feb_ = pd.read_csv('./data/nba_savant/feb-14-15.csv')
 19 |     mar_ = pd.read_csv('./data/nba_savant/mar-14-15.csv')
 20 |     apr_ = pd.read_csv('./data/nba_savant/apr-14-15.csv')
 21 | 
 22 |     df = pd.concat([oct_nov_,dec_,jan_,feb_,mar_,apr_])
 23 |     #reverse x values to plot correctly
 24 |     df.x = -df.x
 25 |     df.game_date = pd.to_datetime(df.game_date)
 26 |     df = df.reset_index(drop=True)
 27 |     return df
 28 | df = load_data_to_df()
 29 | ######################################################################
 30 | 
 31 | ###########################--BASIC CLEANING--#########################
 32 | df.shot_type = np.where(df.shot_type=='2PT Field Goal', 2, 3)
 33 | df.period[df.period>5]=5
 34 | df['pps'] = df.shot_type*df.shot_made_flag
 35 | df.touch_time[df.touch_time<0]=0
 36 | df.touch_time[df.touch_time>24]=24
 37 | 
 38 | def create_team_ids(df):
 39 |     team_id_dict = {}
 40 |     for id_, team in enumerate(list(set(df.team_name))):
 41 |         team_id_dict[team]=id_+1
 42 | 
 43 |     df['opp_id']=0
 44 |     #get team ids from 1-30
 45 |     for k,v in team_id_dict.items():
 46 |         df['team_id'] = np.where(df.team_name==k, v, df['team_id'])
 47 |         df['opp_id'] = np.where(df.opponent==k, v, df['opp_id'])
 48 | create_team_ids(df)
 49 | ######################################################################
 50 | 
 51 | 
 52 | ####################--LOAD NBA SCRAPED DATA--######################
# Scraped NBA shot log; the CSV's first column is used as the index.
nba_shots = pd.read_csv('./data/shots_1415.csv',index_col=0)
# Cast GAME_DATE to str so add_dashes can slice it before datetime conversion.
nba_shots.GAME_DATE = nba_shots.GAME_DATE.astype('str')
 55 | 
 56 | #Adds dashes to date string so it can be converted to datetime format
 57 | def add_dashes(string):
 58 |     date = string[:4] + '-' + string[4:6] + '-' + string[-2:]
 59 |     return date
 60 | 
 61 | def clean_scraped_nba_data():
 62 |     nba_shots.GAME_DATE = nba_shots.GAME_DATE.apply(lambda x: add_dashes(x))
 63 |     nba_shots.GAME_DATE = pd.to_datetime(nba_shots.GAME_DATE)
 64 |     nba_shots.LOC_X = -nba_shots.LOC_X
 65 | clean_scraped_nba_data()
 66 | ######################################################################
 67 | 
 68 | ########################--MERGE NBA AND SAVANT--######################
 69 | def merge_nba_and_savant_data(df,nba_shots):
 70 |     merged_df = df.merge(nba_shots, left_on=['team_name','game_date','period', 'minutes_remaining','seconds_remaining','x','y'], right_on=['TEAM_NAME','GAME_DATE','PERIOD','MINUTES_REMAINING', 'SECONDS_REMAINING','LOC_X','LOC_Y'])
 71 | 
 72 |     merged_df = merged_df.drop(columns=['GRID_TYPE','PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING', 'SECONDS_REMAINING','SHOT_DISTANCE','LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE', 'espn_player_id', 'espn_game_id', 'EVENT_TYPE', 'SHOT_TYPE', 'ACTION_TYPE'])
 73 | 
 74 |     return merged_df
 75 | merged_df = merge_nba_and_savant_data(df,nba_shots)
 76 | ######################################################################
 77 | 
 78 | ########################--FEATURE ENGINEERING--######################
 79 | #helper function to get dictionary matching team names to home and away team acronyms
 80 | def create_home_acronym_dict():
 81 |     team_acronyms = sorted(list(merged_df.HTM.unique()))
 82 |     team_names = sorted(list(merged_df.team_name.unique()))
 83 | 
 84 |     team_name_ac_dict = dict(zip(team_names,team_acronyms))
 85 |     team_name_ac_dict['Boston Celtics'] = 'BOS'
 86 |     team_name_ac_dict['Brooklyn Nets'] = 'BKN'
 87 |     return team_name_ac_dict
 88 | 
#Function to determine if the shooter is playing at home
 90 | def get_home_team():
 91 |     start = time.time()
 92 |     is_home_arr = []
 93 | 
 94 |     team_name_ac_dict=create_home_acronym_dict()
 95 | 
 96 |     for index, row in merged_df.iterrows():
 97 |         if team_name_ac_dict[row.team_name]==row.HTM:
 98 |             is_home_arr.append(1)
 99 |         else:
100 |             is_home_arr.append(0)
101 |         if index%100000==0:
102 |             print('Runtime: {} seconds. {} iterations to go.'.format(round(time.time()-start,2), len(merged_df)-index))
103 |     return is_home_arr
104 | merged_df['is_home'] = get_home_team()
105 | 
106 | #sort the dataframe by date, game_id, player_name, and game_event_id
107 | sorted_df = merged_df.copy().sort_values(by=['game_date','GAME_ID','name','GAME_EVENT_ID']).reset_index(drop=True)
108 | 
109 | #Function to calculate whether player is hot, i.e. whether they have hit 1, 2, or 3 previous shots
def is_player_hot(df):
    """Flag whether the shooter made their previous 1, 2 and 3 shots.

    *df* must be ordered so that one player's shots within one game sit on
    consecutive rows; only directly preceding rows are inspected.

    Parameters
    ----------
    df : pandas.DataFrame with columns ``name``, ``GAME_ID`` and
        ``shot_made_flag``.

    Returns
    -------
    numpy.ndarray of shape (len(df), 3), float
        Column k (0-based) is 1.0 when each of the k+1 directly preceding
        rows has the same player and game as the current row and a made
        shot, else 0.0. A streak of 3 implies 2 and 1.

    Fixes over the previous row-by-row version:
    * the third-shot check compared the player name at ``index-2`` twice
      and never at ``index-3``;
    * ``GAME_ID`` was compared only against ``index-1``;
    * branches requiring an earlier shot to be a miss dropped a valid
      shorter streak when that earlier made shot belonged to someone else.
    The per-row progress printing was removed with the Python-level loop.
    """
    names = df['name'].to_numpy()
    games = df['GAME_ID'].to_numpy()
    made = df['shot_made_flag'].to_numpy() == 1

    n_rows = len(df)
    heat_check_array = np.zeros((n_rows, 3))

    # streak[i] stays True while every look-back so far has matched.
    streak = np.ones(n_rows, dtype=bool)
    for back in (1, 2, 3):
        matches = np.zeros(n_rows, dtype=bool)
        # Rows 0..back-1 have no row `back` positions earlier; they stay False.
        matches[back:] = (
            (names[back:] == names[:-back])
            & (games[back:] == games[:-back])
            & made[:-back]
        )
        streak &= matches
        heat_check_array[:, back - 1] = streak

    return heat_check_array
147 | 
def add_heat_check_to_df(df):
    """Attach the three ``is_player_hot`` columns to *df* in place.

    Column 0 of the streak array becomes ``prev_shot_made``, column 1
    ``prev_2_made`` and column 2 ``prev_3_made``.
    """
    streaks = is_player_hot(df)
    streak_columns = ('prev_shot_made', 'prev_2_made', 'prev_3_made')
    for position, column_name in enumerate(streak_columns):
        df[column_name] = streaks[:, position]
153 | add_heat_check_to_df(sorted_df)
154 | ######################################################################
155 | 
156 | 
157 | ####################--LOAD ADVANCED STATS--######################
# Advanced stats workbook; the sheet's first column is used as the index.
stats = pd.read_excel('./data/adv-stats-14-15.xlsx',index_col=0)
# DWS scaled by MP to a per-48 rate (DWS / MP * 48), rounded to 3 decimals.
stats['DWS/48'] = round(stats.DWS/stats.MP*48,3)
160 | 
161 | # Clean up name discrepancies between two dfs
def clean_name_discrepancies(df,stats):
    """Align player-name spellings between *df* and *stats*, in place.

    * ``stats.Player``: punctuation and underscores are stripped, then three
      names are mapped to the spelling used in *df*.
    * ``df.name``: five names are mapped to the spelling used in *stats*.
    * ``df.defender_name``: missing values become the string 'None'.

    Columns are reassigned on the frames themselves rather than through
    chained ``df.col[mask] = value`` indexing, which may write to a
    temporary copy. Returns None.
    """
    stats['Player'] = stats.Player.apply(lambda x: re.sub(r'([^\s\w]|_)+', '', x))

    df['name'] = df['name'].replace({
        'Jose Juan Barea': 'JJ Barea',
        'Tim Hardaway Jr': 'Tim Hardaway',
        'Charles Hayes': 'Chuck Hayes',
        'Glen Rice Jr': 'Glen Rice',
        'Louis Williams': 'Lou Williams',
    })

    stats['Player'] = stats['Player'].replace({
        'Nene Hilario': 'Nene',
        'Jeffery Taylor': 'Jeff Taylor',
        'Luigi Datome': 'Gigi Datome',
    })

    # 'None' is the placeholder for shots without a recorded defender.
    df['defender_name'] = df['defender_name'].fillna('None')
176 | clean_name_discrepancies(sorted_df, stats)
177 | 
178 | #convert defender names from last,first to first,last
def convert_defender_names(player):
    """Turn 'Last, First' into 'First Last' with punctuation removed.

    The placeholder 'None' and the single-word name 'Nene' are returned
    unchanged. Any other value is split on ', ' and must contain at least
    two parts.
    """
    if player in ('None', 'Nene'):
        return player

    parts = player.split(', ')
    first_last = parts[1] + ' ' + parts[0]
    return re.sub(r'([^\s\w]|_)+', '', first_last)
188 | sorted_df.defender_name = sorted_df.defender_name.apply(convert_defender_names)
189 | 
190 | # Clean up name discrepancies between two dfs
def clean_defender_names(df):
    """Map five defender-name spellings to their canonical form, in place.

    The column is reassigned on *df* rather than through chained
    ``df.defender_name[mask] = value`` indexing, which may write to a
    temporary copy. Returns None.
    """
    df['defender_name'] = df['defender_name'].replace({
        'Jose Juan Barea': 'JJ Barea',
        'Tim Hardaway Jr': 'Tim Hardaway',
        'Charles Hayes': 'Chuck Hayes',
        'Glen Rice Jr': 'Glen Rice',
        'Louis Williams': 'Lou Williams',
    })
197 | clean_defender_names(sorted_df)
198 | 
199 | ############# OFFENSE ###########
def merge_off_stats(df,stats):
    """Attach each shooter's offensive stats and lower-case all column names.

    Joins *df* (on ``name``) to the Pos, Age, TS%, 3PAr and USG% columns of
    *stats* (on ``Player``) with the default inner join, drops the duplicate
    ``Player`` key and returns the merged frame.
    """
    wanted = ['Player','Pos','Age','TS%','3PAr','USG%']
    merged = pd.merge(df, stats[wanted], left_on='name', right_on='Player')
    merged = merged.drop(columns=['Player'])
    merged.columns = [str.lower(label) for label in merged.columns]
    return merged
205 | sorted_df = merge_off_stats(sorted_df,stats)
206 | 
207 | ############ DEFENSE ###########
208 | #map player ids to new df column matching to defender name
def add_defender_ids(df):
    """Add a ``defender_id`` column by looking defenders up among shooters.

    A lookup of name -> largest ``player_id`` is built from the shooter
    columns of *df*, extended with the placeholder 'None' -> 0, and joined
    to *df* on ``defender_name``. The join is pandas' default inner join, so
    rows whose defender name is not in the lookup are dropped.
    """
    shooter_ids = df[['name','player_id']].rename(
        columns={'name': 'defender_name', 'player_id': 'defender_id'})
    id_lookup = shooter_ids.groupby('defender_name').max()

    # placeholder entry for shots with no defender
    no_defender = pd.DataFrame(data=[('None',0)],
                               columns=['defender_name', 'defender_id']).set_index('defender_name')
    id_lookup = pd.concat((id_lookup, no_defender))

    return df.merge(id_lookup, on='defender_name')
220 | sorted_df = add_defender_ids(sorted_df)
221 | 
def merge_def_stats(df,stats):
    """Attach defender stats and a team defensive rating to *df*.

    BLK%, DWS/48 and DBPM from *stats* are renamed to ``blk_pct``,
    ``dws/48`` and ``dbpm`` and joined on ``defender_name``; the placeholder
    defender 'None' gets zeros for all three. The ratings workbook is then
    joined on ``team_name`` == ``Team``. Both joins are inner joins.
    """
    column_renames = {'Player':'defender_name', 'BLK%':'blk_pct', 'DWS/48':'dws/48', 'DBPM':'dbpm'}
    def_stats = stats[list(column_renames)].rename(columns=column_renames)

    #dummy stats for the no-defender placeholder, appended to the defense stats
    none_stats = pd.DataFrame(data = [('None', 0, 0, 0)], columns=list(column_renames.values()))
    def_stats = pd.concat((def_stats, none_stats)).reset_index(drop= True)

    #add player advanced def stats
    with_defender = df.merge(def_stats, on='defender_name')

    #add team defensive rating
    # NOTE(review): the rating is joined on team_name, not on the opponent
    # column; confirm that is the intended team.
    d_rating_14 = pd.read_excel('./data/drating_2014.xlsx')
    with_rating = with_defender.merge(d_rating_14, left_on='team_name', right_on='Team')

    return with_rating.drop(columns='Team')
237 | sorted_df = merge_def_stats(sorted_df,stats)
238 | 
239 | ######################################################################
240 | 
241 | 
242 | ########################--ADDITIONAL CLEANING--#######################
def clean_positions(df):
    """Reduce ``df.pos`` to single positions, in place.

    Giannis Antetokounmpo is set to 'SF'; each hyphenated dual position is
    mapped to one of its two components as listed below. Uses ``.loc`` and
    column reassignment rather than chained ``df.pos[mask] = value``
    indexing, which may write to a temporary copy. Returns None.
    """
    df.loc[df['name'] == 'Giannis Antetokounmpo', 'pos'] = 'SF'
    df['pos'] = df['pos'].replace({
        'PG-SG': 'SG',
        'SF-SG': 'SF',
        'SG-PG': 'PG',
        'PF-SF': 'SF',
        'SF-PF': 'PF',
        'SG-SF': 'SF',
    })
251 | clean_positions(sorted_df)
252 | 
def clean_shot_zones(df):
    """Tidy ``df.shot_zone_basic`` in place.

    * 'In The Paint (Non-RA)' is shortened to 'Paint'.
    * Rows whose area is 'Back Court(BC)' but whose basic zone is
      'Above the Break 3' are relabelled 'Backcourt'.

    Uses ``.loc`` rather than chained ``df.col[mask] = value`` indexing,
    which may write to a temporary copy. Returns None.
    """
    df.loc[df.shot_zone_basic == 'In The Paint (Non-RA)', 'shot_zone_basic'] = 'Paint'

    #change shots misclassified as above_break_3 to backcourt
    misclassified = (df.shot_zone_area == 'Back Court(BC)') & (df.shot_zone_basic == 'Above the Break 3')
    df.loc[misclassified, 'shot_zone_basic'] = 'Backcourt'
257 | clean_shot_zones(sorted_df)
258 | 
def reduce_action_types(df):
    """Collapse detailed action types into a handful of categories.

    ``df.action_type`` is lower-cased in place. Each value is then matched
    against the keyword rules below, in order; the first rule with a keyword
    contained in the value decides the category. Values matching no rule are
    kept as their lower-cased text.

    Returns a list with one category per row, in row order.
    """
    df['action_type'] = df.action_type.str.lower()

    # (keywords, category) pairs; order matters, e.g. 'driving dunk shot'
    # is a 'dunk' because that rule comes before 'driving'.
    rules = (
        (('dunk',), 'dunk'),
        (('layup',), 'layup'),
        (('driving', 'running'), 'driving_running'),
        (('pullup',), 'pullup'),
        (('fadeaway', 'turnaround', 'step back'), 'fade_turn_step'),
        (('hook',), 'hook_shot'),
        (('jump',), 'jump_shot'),
    )

    new_action_types = []
    # Iterate the values directly; Series.iteritems() no longer exists in
    # pandas 2.x.
    for action in df.action_type:
        for keywords, category in rules:
            if any(keyword in action for keyword in keywords):
                new_action_types.append(category)
                break
        else:
            new_action_types.append(action)
    return new_action_types
280 | sorted_df.action_type = reduce_action_types(sorted_df)
281 | ######################################################################
282 | 
283 | sorted_df.to_csv('data/mid_etl_checkpoint_df.csv')
284 | 
285 | ########################--GET FG % FOR EACH ZONE--####################
def get_zone_fg_pct(df, date=None, event=None):
    """Compute each player's field-goal percentage in every shot zone.

    Parameters
    ----------
    df : DataFrame with ``name``, ``shot_zone_area``, ``shot_zone_basic``
        and ``shot_made_flag`` columns.
    date, event : currently unused; the filtering that read them is
        commented out below.

    Returns
    -------
    DataFrame with a ``name`` column (from ``reset_index``) plus one column
    per (area, basic zone) pair that has a made-shot column. Values are
    makes / (makes + misses) rounded to 3 decimals, with NaN replaced by 0.
    Column names look like 'BC_Backcourt': the text after '(' in the area,
    then the basic zone, with spaces turned into underscores.
    """
    fg_pct_list = []
    column_names = []

    # if date:
    #     df = df[df.game_date<date]
    #     if date<='2014-10-28':
    #         return 'Invalid Date'
    #     if event:
    #         df = df[df.game_event_id<event]

    #create crosstab with number of makes and misses in each of the 15 zones
    tab = pd.crosstab(df.name, [df.shot_zone_area, df.shot_zone_basic, df.shot_made_flag])

    ## format of col - ('Back Court(BC)', 'Backcourt', 0)
    for col in tab.columns:
        #if it is the shot made column
        if col[2]==1:
            #calculate percentages
            # tab[col[0]][col[1]][0] selects the miss-count column (flag 0)
            # of the same zone.
            # NOTE(review): presumably raises KeyError if a zone has makes
            # but no misses anywhere in the data - confirm.
            pct = round(tab[col]/(tab[col]+tab[col[0]][col[1]][0]),3)
            fg_pct_list.append(pct)
            column_names.append('_'.join(col[:2]).replace(' ','_').replace(')','').split('(')[1])

    #concatenate each player's percentages into one df
    # fillna(0) turns the NaN of a 0/0 (no attempts in the zone) into 0
    zone_pct_df = pd.concat([fg_df for fg_df in fg_pct_list],axis=1).fillna(0)
    #add column names
    zone_pct_df.columns=column_names

    return zone_pct_df.reset_index()
315 | zone_fg_pct = get_zone_fg_pct(sorted_df)
316 | 
def create_zone_ids_df(df):
    """Build a lookup table assigning an integer id to every shot zone.

    A zone is a (``shot_zone_basic``, ``shot_zone_area``) pair that occurs
    on at least one row of *df*. Ids start at 0 and follow the order of
    first appearance of the basic zones, then of the areas.

    Returns a DataFrame with columns ``zone_id``, ``shot_zone_basic`` and
    ``shot_zone_area``.
    """
    basic_zones = df.shot_zone_basic.unique()
    areas = df.shot_zone_area.unique()

    # keep only combinations that exist (e.g. no corner 3 from the center)
    observed = [
        (basic, area)
        for basic in basic_zones
        for area in areas
        if ((df.shot_zone_basic == basic) & (df.shot_zone_area == area)).any()
    ]
    records = [(zone_id, basic, area) for zone_id, (basic, area) in enumerate(observed)]

    return pd.DataFrame.from_records(records, columns=['zone_id', 'shot_zone_basic', 'shot_zone_area'])
331 | zone_ids = create_zone_ids_df(sorted_df)
332 | 
def add_zone_to_zone_ids(zone_ids):
    """Return *zone_ids* with an added short ``zone`` label column.

    The label is '<area>_<basic zone>' with spaces replaced by underscores
    and ')' removed, keeping only the text after the '(' of the area, e.g.
    ('Center(C)', 'Paint') -> 'C_Paint'.
    """
    labelled = []
    for area, basic, zone_id in zip(zone_ids.shot_zone_area,
                                    zone_ids.shot_zone_basic,
                                    zone_ids.zone_id):
        full_label = '_'.join([area, basic]).replace(' ','_').replace(')','')
        labelled.append((full_label.split('(')[1], zone_id))

    label_df = pd.DataFrame(labelled, columns=['zone', 'zone_id'])
    return zone_ids.merge(label_df, on='zone_id')
342 | zone_ids = add_zone_to_zone_ids(zone_ids)
343 | 
344 | #add zone_id, zone to df
345 | sorted_df = sorted_df.merge(zone_ids, on=['shot_zone_basic', 'shot_zone_area'])
346 | 
347 | #get player avg for each zone they are shooting in
def get_zone_avg(df):
    """Look up, for every shot, the shooter's FG% in the zone of that shot.

    Reads the module-level ``zone_fg_pct`` table (one row per player, one
    column per zone label). For each row of *df* the value is the sum of
    the ``zone`` column over the ``zone_fg_pct`` rows whose ``name`` equals
    the shooter; a player absent from the table therefore yields 0.

    Returns a list with one value per row of *df*; prints progress every
    25000 index values.
    """
    start = time.time()
    total_rows = len(df)
    table_names = zone_fg_pct['name']
    zone_avg = []

    # Columns are read by label. The previous row[0] / row[1] relied on
    # positional integer access to a label-indexed Series, which pandas has
    # deprecated.
    for index, player, zone in zip(df.index, df['name'], df['zone']):
        zone_avg.append(zone_fg_pct.loc[table_names == player, zone].sum())

        if index % 25000==0:
            print('Runtime: {} seconds. Iterations remaining: {}.'.format(round(time.time()-start,2), total_rows-index))
    return zone_avg
360 | zone_avgs = get_zone_avg(sorted_df)
361 | sorted_df['zone_avg']=zone_avgs
362 | 
363 | #add league avg for each zone
# Mean of zone_avg over all shots in each zone. Only zone_avg is averaged:
# groupby(...).mean() over the whole frame also tries the non-numeric
# columns, which raises in pandas 2.x.
lg_zone_avg = sorted_df.groupby('zone').zone_avg.mean().reset_index().rename(columns={'zone_avg': 'lg_zone_avg'})
sorted_df = sorted_df.merge(lg_zone_avg, on='zone')
#add fg% relative to lg avg for each zone
sorted_df['zone_minus_lg_avg'] = sorted_df.zone_avg-sorted_df.lg_zone_avg

# Persist the final feature table and the two zone lookup tables.
sorted_df.to_csv('final_df_1415.csv')
zone_fg_pct.to_csv('data/zone_fg_pct.csv')
zone_ids.to_csv('data/zone_ids.csv')
371 | 
372 | ######################################################################
#rearrange columns for better visibility
374 | # clean = sorted_df[['name','pos','age','player_id', 'team_name', 'team_id', 'game_date',
375 | #        'game_id', 'game_event_id','season', 'period',
376 | #        'minutes_remaining', 'seconds_remaining', 'shot_made_flag',
377 | #        'action_type', 'shot_zone_basic', 'shot_zone_area', 'shot_zone_range',
378 | #        'shot_type', 'shot_distance', 'x', 'y', 'dribbles', 'touch_time',
379 | #        'opponent', 'opp_id', 'defender_name', 'defender_distance', 'shot_clock', 'htm', 'vtm',
380 | #        'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made', 'ts%', '3par', 'usg%']]
381 | 


--------------------------------------------------------------------------------
/presentation.py:
--------------------------------------------------------------------------------
  1 | ############################### IMPORTS ###############################
  2 | if True:
  3 |     import itertools, math, time, re, pickle
  4 | 
  5 |     import plotly
  6 |     import plotly.plotly as py
  7 |     import plotly.graph_objs as go
  8 |     plotly.offline.init_notebook_mode(connected=True)
  9 | 
 10 |     import matplotlib
 11 |     import matplotlib.pyplot as plt
 12 |     import seaborn as sns
 13 |     import numpy as np
 14 |     import pandas as pd
 15 |     pd.set_option('display.max_columns',100)
 16 | 
 17 |     import ipywidgets as widgets
 18 |     from ipywidgets import interact
 19 | 
 20 |     import warnings
 21 |     warnings.filterwarnings('ignore')
 22 | 
 23 |     from court import court_shapes
 24 | 
 25 |     from shot_chart_viz import acquire_playerPic, get_team_logo, get_season_stats, get_team_stats, draw_court
 26 | 
 27 |     cdict = {
 28 |         'blue': [(0.0, 0.6313725709915161, 0.6313725709915161), (0.25, 0.4470588266849518, 0.4470588266849518), (0.5, 0.29019609093666077, 0.29019609093666077), (0.75, 0.11372549086809158, 0.11372549086809158), (1.0, 0.05098039284348488, 0.05098039284348488)],
 29 |         'green': [(0.0, 0.7333333492279053, 0.7333333492279053), (0.25, 0.572549045085907, 0.572549045085907), (0.5, 0.4156862795352936, 0.4156862795352936), (0.75, 0.0941176488995552, 0.0941176488995552), (1.0, 0.0, 0.0)],
 30 |         'red': [(0.0, 0.9882352948188782, 0.9882352948188782), (0.25, 0.9882352948188782, 0.9882352948188782), (0.5, 0.9843137264251709, 0.9843137264251709), (0.75, 0.7960784435272217, 0.7960784435272217), (1.0, 0.40392157435417175, 0.40392157435417175)]}
 31 |     mymap = matplotlib.colors.LinearSegmentedColormap('my_colormap', cdict, 1024)
 32 | ############################## LOAD DATA ##############################
 33 | df = pd.read_csv('final_df_1415.csv', index_col=0)
 34 | 
 35 | ######################################################################
 36 | ###########################--SHOT CHARTS--############################
 37 | ######################################################################
 38 | 
 39 | ########################--BUBBLE SHOT CHARTS--########################
 40 | def find_shootingPcts(shot_df, gridNum):
 41 |     x2 = shot_df.x[(shot_df['y']<425.1) & (shot_df.shot_type==2)]
 42 |     y2 = shot_df.y[(shot_df['y']<425.1) & (shot_df.shot_type==2)]
 43 | 
 44 |     x2_made = shot_df.x[(shot_df['shot_made_flag']==1) & (shot_df['y']<425.1) & (shot_df.shot_type==2)]
 45 |     y2_made = shot_df.y[(shot_df['shot_made_flag']==1) & (shot_df['y']<425.1) & (shot_df.shot_type==2)]
 46 | 
 47 |     #compute number of shots made and taken from each hexbin location
 48 |     hb_shot2 = plt.hexbin(x2, y2, gridsize=gridNum, extent=(-250,250,425,-50));
 49 |     plt.close()
 50 |     hb_made2 = plt.hexbin(x2_made, y2_made, gridsize=gridNum, extent=(-250,250,425,-50),cmap=plt.cm.Reds);
 51 |     plt.close()
 52 | 
 53 |     #compute shooting percentage
 54 |     ShootingPctLocs2 = hb_made2.get_array() / hb_shot2.get_array()
 55 |     ShootingPctLocs2[np.isnan(ShootingPctLocs2)] = 0 #makes 0/0s=0
 56 | 
 57 |     #############################################################################################################
 58 |     #############################################################################################################
 59 |     ###########################################  THREE POINTERS  ################################################
 60 |     #############################################################################################################
 61 |     #############################################################################################################
 62 | 
 63 |     x3 = shot_df.x[(shot_df['y']<425.1) & (shot_df.shot_type==3)]
 64 |     y3 = shot_df.y[(shot_df['y']<425.1) & (shot_df.shot_type==3)]
 65 | 
 66 |     x3_made = shot_df.x[(shot_df['shot_made_flag']==1) & (shot_df['y']<425.1) & (shot_df.shot_type==3)]
 67 |     y3_made = shot_df.y[(shot_df['shot_made_flag']==1) & (shot_df['y']<425.1) & (shot_df.shot_type==3)]
 68 | 
 69 |     #compute number of shots made and taken from each hexbin location
 70 |     hb_shot3 = plt.hexbin(x3, y3, gridsize=gridNum, extent=(-250,250,425,-50));
 71 |     plt.close()
 72 |     hb_made3 = plt.hexbin(x3_made, y3_made, gridsize=gridNum, extent=(-250,250,425,-50),cmap=plt.cm.Reds);
 73 |     plt.close()
 74 | 
 75 |     #compute shooting percentage
 76 |     ShootingPctLocs3 = hb_made3.get_array() / hb_shot3.get_array()
 77 |     ShootingPctLocs3[np.isnan(ShootingPctLocs3)] = 0 #makes 0/0s=0
 78 | 
 79 |     return (ShootingPctLocs2, hb_shot2, ShootingPctLocs3, hb_shot3)
 80 | 
 81 | def freq_shooting_plot(player_name, gridNum=25):
 82 |     plot_size=(10,8)
 83 |     shot_df = df[df.name==player_name]
 84 | 
 85 |     from matplotlib.patches import Circle
 86 |     #compute shooting percentage and # of shots
 87 |     (ShootingPctLocs2, shotNumber2) = find_shootingPcts(shot_df, gridNum)[0:2]
 88 |     (ShootingPctLocs3, shotNumber3) = find_shootingPcts(shot_df, gridNum)[2:]
 89 | 
 90 |     #draw figure and court
 91 |     fig = plt.figure(figsize=plot_size)#(12,7)
 92 |     ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
 93 |     draw_court(outer_lines=False)
 94 |     plt.xlim(-250,250)
 95 |     plt.ylim(400, -25)
 96 | 
 97 |     #draw player image
 98 |     zoom = np.float(plot_size[0])/(12.0*2) #how much to zoom the player's pic. I have this hackily dependent on figure size
 99 |     img = acquire_playerPic(shot_df.player_id, zoom)
100 |     ax.add_artist(img)
101 | 
102 |     ############################################  TWO POINTERS  #################################################
103 |     cmap = mymap.from_list('Color Map',[(0,'#ff0000'),(.45,'#ffff00'),(.6,'#00ff00'), (1,'#004d00')])
104 |     #draw circles
105 |     for i, shots in enumerate(ShootingPctLocs2):
106 |         restricted2 = Circle(shotNumber2.get_offsets()[i], radius=shotNumber2.get_array()[i],
107 |                             color=cmap(shots),alpha=1, fill=True)
108 |         if restricted2.radius > 240/gridNum: restricted2.radius=240/gridNum
109 |         ax.add_patch(restricted2)
110 | 
111 |     #draw color bar
112 |     ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
113 |     cb = matplotlib.colorbar.ColorbarBase(ax2,cmap=cmap, orientation='vertical')
114 |     cb.set_label('Field Goal %', labelpad=20)
115 |     cb.set_ticks([0.0, 0.25, .485, 0.75, 1.0])
116 |     cb.set_ticklabels(['0%','25%','48.5%\nLg Avg', '75%', '100%'])
117 | 
118 |     ###########################################  THREE POINTERS  ################################################
119 |     #plotting 3 pointers separately to account for expected lower fg% from deep
120 |     cmap3 = mymap.from_list('Color Map',[(0,'#ff0000'),(.35,'#ffff00'),(.6,'#00ff00'),(1,'#004d00')])
121 |     #draw circles
122 |     for i, shots in enumerate(ShootingPctLocs3):
123 |         restricted3 = Circle(shotNumber3.get_offsets()[i], radius=shotNumber3.get_array()[i],
124 |                             color=cmap3(shots),alpha=1, fill=True)
125 |         if restricted3.radius > 240/gridNum: restricted3.radius=240/gridNum
126 |         ax.add_patch(restricted3)
127 | 
128 |     #draw color bar
129 |     ax3 = fig.add_axes([1.1, 0.1, 0.02, 0.8])
130 |     cb3 = matplotlib.colorbar.ColorbarBase(ax3,cmap=cmap3, orientation='vertical')
131 |     cb3.set_label('Three Point %',labelpad=-8)
132 |     cb3.set_ticks([0.0, 0.25,.35, 0.5, 0.75, 1.0])
133 |     cb3.set_ticklabels(['0%','25%','35% - Lg Avg', '50%','75%', '100%'])
134 | 
135 |     ax.set_title(shot_df.name.unique()[0] +' - Shot Chart 2014-15')
136 |     #plot season stats
137 |     ax.text(135,395,get_season_stats(player_name)[1])
138 | 
139 |     plt.show()
140 |     shot_recommender(player_name)
141 | 
142 | #################PLOT TEAM FREQUENCY SHOT CHART (MATPLOTLIB)#################
def team_freq_plot(team, gridNum=25):
    """Draw a team's shot chart: circle size = shot volume, color = accuracy.

    Every hexbin location returned by ``find_shootingPcts`` becomes a circle
    whose radius is the number of shots taken there (capped at
    ``240 / gridNum``) and whose color encodes the shooting percentage at
    that location. Two- and three-pointers are colored with separate color
    maps, each with its own color bar.

    Args:
        team: value matched against ``df.team_name`` to select the shots.
        gridNum: hexbin grid size forwarded to ``find_shootingPcts``; also
            sets the maximum circle radius.
    """
    from matplotlib.patches import Circle

    plot_size = (10, 8)
    team_df = df[df.team_name == team]

    # find_shootingPcts returns the 2PT and 3PT results together, so a single
    # call is enough (calling it twice recomputed every hexbin).
    (ShootingPctLocs2, shotNumber2,
     ShootingPctLocs3, shotNumber3) = find_shootingPcts(team_df, gridNum)

    #draw figure and court
    fig = plt.figure(figsize=plot_size)
    ax = plt.axes([0.1, 0.1, 0.8, 0.8]) #where to place the plot within the figure
    draw_court(outer_lines=False)
    plt.xlim(-250, 250)
    plt.ylim(400, -25)

    #draw team image
    # The logo is looked up by the `htm` value of the team's home games
    # (presumably the team abbreviation - confirm against get_team_logo).
    team_ac = team_df.htm[team_df.is_home == 1].unique()[0]
    zoom = 1
    img = get_team_logo(team_ac, zoom)
    ax.add_artist(img)

    max_radius = 240 / gridNum

    def add_shot_circles(pct_locs, shot_bins, colormap):
        """Add one circle per hexbin: radius = shot count (capped), color = pct."""
        centers = shot_bins.get_offsets()
        counts = shot_bins.get_array()
        for i, pct in enumerate(pct_locs):
            ax.add_patch(Circle(centers[i], radius=min(counts[i], max_radius),
                                color=colormap(pct), alpha=.9, fill=True))

    ############################################  TWO POINTERS  #################################################
    cmap = mymap.from_list('Color Map',[(0,'#ff0000'),(.45,'#ffff00'),(.6,'#00ff00'), (1,'#004d00')])
    add_shot_circles(ShootingPctLocs2, shotNumber2, cmap)

    #draw color bar
    ax2 = fig.add_axes([0.95, 0.1, 0.02, 0.8])
    cb = matplotlib.colorbar.ColorbarBase(ax2, cmap=cmap, orientation='vertical')
    cb.set_label('Field Goal %', labelpad=20)
    cb.set_ticks([0.0, 0.25, .485, 0.75, 1.0])
    cb.set_ticklabels(['0%','25%','48.5%\nLg Avg', '75%', '100%'])

    ###########################################  THREE POINTERS  ################################################
    #plotting 3 pointers separately to account for expected lower fg% from deep
    cmap3 = mymap.from_list('Color Map',[(0,'#ff0000'),(.35,'#ffff00'),(.6,'#00ff00'),(1,'#004d00')])
    add_shot_circles(ShootingPctLocs3, shotNumber3, cmap3)

    #draw color bar
    ax3 = fig.add_axes([1.1, 0.1, 0.02, 0.8])
    cb3 = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap3, orientation='vertical')
    cb3.set_label('Three Point %', labelpad=-8)
    cb3.set_ticks([0.0, 0.25,.35, 0.5, 0.75, 1.0])
    cb3.set_ticklabels(['0%','25%','35% - Lg Avg', '50%','75%', '100%'])

    ax.set_title(team_df.team_name.unique()[0] +' - Shot Chart 2014-15')
    #plot season stats
    ax.text(150, 395, get_team_stats(team)[1])
    plt.show()
203 | 
204 | ########################--GROUPED SHOT CHART--########################
def grouped_plot(feature):
    """Scatter every shot on the court, colored by its value of `feature`.

    One WebGL scatter trace is drawn per group of ``df.groupby(feature)``,
    on top of the ``court_shapes`` layout.

    Args:
        feature: name of the ``df`` column to group the shots by.
    """
    color_list = ['aliceblue', 'aqua', 'steelblue','violet', 'blue',
              'blueviolet', 'brown', 'cadetblue',
              'chartreuse', 'darkgreen', 'darkmagenta', 'tomato',
             'gold', 'red', 'slategray']

    data = [
        go.Scattergl(
            x = shots.x,
            y = shots.y,
            mode = 'markers',
            name = label,
            # Wrap around the palette so features with more groups than
            # colors reuse colors instead of raising IndexError.
            marker= dict(symbol='circle', size=7,
                         line={'width':1}, opacity=0.7,
                         color=color_list[i % len(color_list)]),
            text = label,
            hoverinfo = 'text')
        for i, (label, shots) in enumerate(df.groupby(feature))
    ]

    layout = go.Layout(
        title='Shot Distribution by ' + feature.title(),
        showlegend =True,
        xaxis={'showgrid':False, 'range':[-250,250]},
        yaxis={'showgrid':False, 'range':[-47.5,500]},
        height = 600,
        width = 750,
        hovermode='closest',
        shapes=court_shapes)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename = 'Shot Zone Breakdown')
240 | 
241 | ##########################--SHOT FREQ HEATMAP--#########################
def shot_freq_heatmap(name, show_shots=False):
    """Plot a 2D-histogram heatmap of shot locations for a player or a team.

    Args:
        name: a value of ``df.name`` (player); anything else is treated as a
            value of ``df.team_name``.
        show_shots: when True, overlay the individual makes (green circles)
            and misses (yellow x's) on top of the heatmap. Defaults to False,
            which draws the heatmap only.
    """
    # Players and teams differ a lot in shot volume, so each gets its own
    # color-scale bounds.
    if name in df.name.unique():
        df_ = df[df.name==name]
        z_max=40
        z_min=0
    else:
        df_ = df[df.team_name==name]
        z_max=250
        z_min=5

    made = df_[df_.shot_made_flag == 1]
    missed = df_[df_.shot_made_flag == 0]

    # Only shots flagged 0 or 1 feed the histogram.
    x = np.concatenate([made['x'], missed['x']])
    y = np.concatenate([made['y'], missed['y']])

    heatmap = go.Histogram2d(
        x=x,
        y=y,
        zmax=z_max,
        zmin=z_min,
        zsmooth='best',
        autobinx=True,
        autobiny=True,
        reversescale=False,
        opacity=.75,
    )
    data = [heatmap]

    if show_shots:
        def shot_trace(shots, label, symbol, color):
            """Build the marker trace for one outcome (make or miss)."""
            return go.Scatter(
                x=shots['x'],
                y=shots['y'],
                mode='markers',
                name=label,
                showlegend=True,
                marker=dict(
                    symbol=symbol,
                    opacity=0.7,
                    color=color,
                    size=4,
                    line=dict(width=1),
                )
            )

        data.append(shot_trace(made, 'Make', 'circle', 'green'))
        data.append(shot_trace(missed, 'Miss', 'x', 'yellow'))

    layout = go.Layout(
        xaxis=dict( ticks='', showgrid=False, zeroline=False, nticks=20, range=[-250,250]),
        yaxis=dict( ticks='', showgrid=False, zeroline=False, nticks=20, range=[-47.5,450]),
        autosize=False,
        height=600,
        width=750,
        hovermode='closest',
        shapes= court_shapes,
        title= name + ' - Shot Frequency Heatmap',
        showlegend=True,
        legend=dict(x=1.2, y=1),
    )

    fig = go.Figure(data=data, layout=layout)

    plotly.offline.iplot(fig)
321 | 
322 | ############################--PPS HEATMAP--#############################
323 | # def pps_heatmap(feature):
324 | #     pps_tab=pd.crosstab(df.team_name, df[feature], values=df.pps, aggfunc='mean',margins=False).fillna(0)
325 | #
326 | #     team_heatmap = go.Heatmap(z=[np.array((pps_tab[pps_tab.index==pps_tab.index[i]])) for i in range(len(pps_tab.index))],
327 | #                        x=pps_tab.columns, y= [team.split(' ')[-1] for team in pps_tab.index]
328 | #                       )
329 | #
330 | #     layout = go.Layout(
331 | #         title='Points Per Shot Heatmap',
332 | #         xaxis = dict(ticks='', nticks=len(pps_tab.columns), automargin=True),
333 | #         yaxis = dict(ticks='', nticks=len(pps_tab.index), automargin=True),
334 | #     )
335 | #
336 | #     fig = go.Figure(data=[team_heatmap], layout=layout)
337 | #     plotly.offline.iplot(fig, filename='pps-heatmap')
def pps_heatmap_sns(feature):
    """Show a seaborn heatmap of the mean `pps` per (feature value, team).

    Rows are the values of ``df[feature]``, columns are ``df.team_name``.

    Args:
        feature: name of the ``df`` column to use for the heatmap rows.
    """
    mean_pps = pd.crosstab(
        index=df[feature],
        columns=df.team_name,
        values=df.pps,
        aggfunc='mean',
        margins=False,
    )
    # Combinations with no data come back as NaN; plot them as 0.
    mean_pps = mean_pps.fillna(0)

    plt.figure(figsize=(15, 6))
    sns.heatmap(mean_pps, annot=False, robust=True)
    plt.show()
344 | 
345 | ########################--FREQUENCY BAR PLOT--########################
def freq_bar_plots(feature, round_=False, show_line=False):
    """Plot stacked bars of made/missed shot counts for each value of `feature`.

    Each bar segment is labelled with the corresponding make / miss
    percentage.

    Args:
        feature: name of the ``df`` column to break the shots down by.
        round_: when True, round the feature values before tabulating.
        show_line: when True, overlay a thin black line tracing the number of
            makes. Defaults to False (bars only).
    """
    # Round just the one column instead of copying the entire DataFrame.
    feature_values = df[feature]
    if round_:
        feature_values = feature_values.round()

    feat_tab = pd.crosstab(feature_values, df.shot_made_flag, margins=True)
    feat_tab['fg_pct'] = (feat_tab[1] / feat_tab['All']).round(3)

    # Drop the 'All' margins (column and last row); only per-value rows are plotted.
    tab = feat_tab.drop(columns='All')[:-1]
    make_text = [str(round(t*100,1)) + '%' for t in tab.fg_pct]
    miss_text = [str(round((1-t)*100,1)) + '%' for t in tab.fg_pct]

    trace1 = go.Bar(
        x=tab.index,
        y=tab[1],
        name='Makes',
        text= make_text ,
        textposition = 'inside',
        textfont=dict(
            family='sans serif', size=12, color='white'),
        marker=dict(
            color='red'),
        opacity=0.75
    )
    trace2 = go.Bar(
        x=tab.index,
        y=tab[0],
        name='Misses',
        text= miss_text,
        textposition = 'inside',
        textfont=dict(
            family='sans serif', size=10, color='white'),
        marker=dict(
            color='blue'),
        opacity=0.75
    )
    data = [trace1, trace2]

    if show_line:
        data.append(go.Scatter(
            x=tab.index,
            y=tab[1],
            mode='markers+lines',
            name='# Makes',
            hoverinfo='skip',
            line=dict(
            color='black', width=.75)
        ))

    axis_title = feature.replace('_',' ').title()
    layout = go.Layout(
        barmode='stack',
        title='FG% by ' + feature.title().replace('_',' '),
        showlegend =True,
        xaxis=dict(
            automargin=True,
            autorange=True,
            ticks='',
            showticklabels=True,
            title=axis_title
        ),
        yaxis=dict(
            automargin=True,
            ticks='',
            showticklabels=True,
            title='# of Shots'
        )
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
416 | 
417 | #########################--PERCENTAGE BAR CHART--##########################
def pct_bar_plots(feature, round_=False, player=None, team=None):
    """Plot stacked bars of make / miss percentage for each value of `feature`.

    The shots can be restricted to one player or one team; `player` takes
    precedence when both are given. The margins row ('All') is plotted as
    well, giving the overall percentage.

    Args:
        feature: name of the ``df`` column to break the shots down by.
        round_: when True, round the feature values before tabulating.
        player: optional player name. Matched against ``df.name`` exactly
            first, then title-cased as a fallback.
        team: optional team name. Matched against ``df.team_name`` exactly
            first, then title-cased as a fallback.
    """
    def select(column, label):
        """Return (matching rows, label actually matched) for `column`."""
        # Try the label verbatim first: str.title() mangles real names such
        # as 'JJ Barea' or 'Philadelphia 76ers', which then match nothing.
        exact = df[df[column] == label]
        if len(exact):
            return exact, label
        titled = label.title()
        return df[df[column] == titled], titled

    feature_label = feature.title().replace('_',' ')
    if player:
        df_, matched = select('name', player)
        title = matched + ' - FG% by ' + feature_label
    elif team:
        df_, matched = select('team_name', team)
        title = matched + ' - FG% by ' + feature_label
    else:
        df_ = df
        title = 'FG% by ' + feature_label

    # Round after the selection so that round_ is honored for every branch
    # (previously the rounded frame was overwritten and round_ had no effect).
    feature_values = df_[feature]
    if round_:
        feature_values = feature_values.round()

    c_tab = pd.crosstab(feature_values, df_.shot_made_flag, margins=True)
    c_tab['pct_made'] = c_tab[1]/c_tab.All
    c_tab['pct_missed'] = 1-c_tab.pct_made

    made_text = [str(round(t*100,1)) + '%' for t in c_tab.pct_made]
    missed_text = [str(round(t*100,1)) + '%' for t in c_tab.pct_missed]

    trace1 = go.Bar(
        x=c_tab.index,
        y=c_tab.pct_made,
        name='Makes',
        text= made_text,
        textposition = 'auto',
        textfont=dict(
            family='sans serif',
            size=12, color='white'),
        marker=dict(
            color='red'),
        opacity=0.75
    )
    trace2 = go.Bar(
        x=c_tab.index,
        y=c_tab.pct_missed,
        name='Misses',
        text= missed_text,
        textposition = 'auto',
        textfont=dict(
            family='sans serif',
            size=12, color='white'),
        marker=dict(
            color='blue'),
        opacity=0.75,
    )

    data = [trace1, trace2]
    layout = go.Layout(
        barmode='stack',
        title= title,
        showlegend =True,
        xaxis=dict(
            automargin=True,
            autorange=True,
            ticks='',
            showticklabels=True,
            title=feature.replace('_',' ').title()
        ),
        yaxis=dict(
            automargin=True,
            ticks='',
            showticklabels=True,
            title='FG %'
        )
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='stacked-bar')
492 | 
493 | 
494 | ###########################--SHOT RECOMMENDER--###########################
def player_pps(name):
    """Summarize a player's mean `pps` and shot share for each zone.

    Args:
        name: value matched against ``df.name``.

    Returns:
        DataFrame indexed by zone with columns ``pps`` (mean of ``df.pps``
        in that zone), ``count_`` (number of shots) and ``freq_pct`` (share
        of the player's shots). Rows are sorted by ``freq_pct`` descending
        and zones holding less than 5% of the player's shots are dropped.
    """
    player = df[df.name == name]

    pps_by_zone = player.groupby('zone')['pps'].mean().fillna(0).rename('pps')
    # Name the counts explicitly: the name value_counts() gives its result
    # differs between pandas versions ('zone' before 2.0, 'count' after).
    shots_by_zone = player.zone.value_counts().rename('count_')

    pps_freq = pd.concat([pps_by_zone, shots_by_zone], axis=1)
    pps_freq['freq_pct'] = pps_freq.count_ / pps_freq.count_.sum()

    # Most frequent zones first; ties are ordered by higher pps.
    pps_freq = pps_freq.sort_values(['freq_pct', 'pps'], ascending=False)

    return pps_freq[pps_freq.freq_pct >= .05]
514 | 
def pps_zone_percentiles(name):
    """Return league-wide percentiles of per-player mean `pps` for each zone.

    Args:
        name: accepted but not used; the table is built from every player
            in ``df``.

    Returns:
        DataFrame indexed by quantile (0.1, 0.3, 0.5, 0.7, 0.9) with one
        column per zone.
    """
    mean_pps = pd.crosstab(index=df.name, columns=df.zone,
                           values=df.pps, aggfunc='mean')
    # A player with no data in a zone counts as 0 there.
    mean_pps = mean_pps.fillna(0)
    return mean_pps.quantile([.1, .3, .5, .7, .9])
519 | 
def shot_recommender(name):
    """Print the zones a player should shoot from more or less often.

    Each zone returned by ``player_pps(name)`` is compared with the
    percentiles from ``pps_zone_percentiles``: above the 0.7 quantile means
    "more frequently", below the 0.5 quantile means "less frequently"
    ('C_Restricted_Area' is never listed as "less"). Nothing is printed for
    an empty list.

    Args:
        name: player name, forwarded to ``player_pps``.
    """
    player_zones = player_pps(name)
    percentiles = pps_zone_percentiles(name)

    shoot_more = []
    shoot_less = []
    for zone in player_zones.index:
        zone_pps = player_zones.loc[zone].pps
        cutoffs = percentiles[zone]
        if zone_pps > cutoffs.loc[.7]:
            shoot_more.append(zone)
        elif zone_pps < cutoffs.loc[.5] and zone != 'C_Restricted_Area':
            shoot_less.append(zone)

    if shoot_more:
        print(name +  ' should shoot in the following zones more frequently:')
        for zone in shoot_more:
            print('  - ' + zone)
    if shoot_less:
        print(name +  ' should shoot in the following zones less frequently:')
        for zone in shoot_less:
            print('  - ' + zone)
541 | 


--------------------------------------------------------------------------------
/Data-Exploration.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 583,
   6 |    "metadata": {
   7 |     "extensions": {
   8 |      "jupyter_dashboards": {
   9 |       "version": 1,
  10 |       "views": {
  11 |        "grid_default": {
  12 |         "hidden": true
  13 |        },
  14 |        "report_default": {
  15 |         "hidden": true
  16 |        }
  17 |       }
  18 |      }
  19 |     },
  20 |     "scrolled": true
  21 |    },
  22 |    "outputs": [
  23 |     {
  24 |      "data": {
  25 |       "text/html": [
  26 |        "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
  27 |       ],
  28 |       "text/vnd.plotly.v1+html": [
  29 |        "<script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window.Plotly) {{require(['plotly'],function(plotly) {window.Plotly=plotly;});}}</script>"
  30 |       ]
  31 |      },
  32 |      "metadata": {},
  33 |      "output_type": "display_data"
  34 |     }
  35 |    ],
  36 |    "source": [
  37 |     "import matplotlib\n",
  38 |     "import matplotlib.pyplot as plt\n",
  39 |     "import seaborn as sns\n",
  40 |     "import numpy as np\n",
  41 |     "import pandas as pd\n",
  42 |     "pd.set_option('display.max_columns',100)\n",
  43 |     "\n",
  44 |     "import plotly\n",
  45 |     "import plotly.plotly as py\n",
  46 |     "import plotly.graph_objs as go\n",
  47 |     "plotly.offline.init_notebook_mode(connected=True)\n",
  48 |     "\n",
  49 |     "import warnings\n",
  50 |     "warnings.filterwarnings('ignore')\n",
  51 |     "from court import court_shapes\n",
  52 |     "\n",
  53 |     "import ipywidgets as widgets\n",
  54 |     "from ipywidgets import interact\n",
  55 |     "\n",
  56 |     "import itertools, math, time"
  57 |    ]
  58 |   },
  59 |   {
  60 |    "cell_type": "code",
  61 |    "execution_count": 589,
  62 |    "metadata": {
  63 |     "extensions": {
  64 |      "jupyter_dashboards": {
  65 |       "version": 1,
  66 |       "views": {
  67 |        "grid_default": {
  68 |         "hidden": true
  69 |        },
  70 |        "report_default": {
  71 |         "hidden": true
  72 |        }
  73 |       }
  74 |      }
  75 |     },
  76 |     "scrolled": true
  77 |    },
  78 |    "outputs": [],
  79 |    "source": [
  80 |     "oct_nov_ = pd.read_csv('./data/nba_savant/oct-nov-14-15.csv')\n",
  81 |     "dec_ = pd.read_csv('./data/nba_savant/dec-14-15.csv')\n",
  82 |     "jan_ = pd.read_csv('./data/nba_savant/jan-14-15.csv')\n",
  83 |     "feb_ = pd.read_csv('./data/nba_savant/feb-14-15.csv')\n",
  84 |     "mar_ = pd.read_csv('./data/nba_savant/mar-14-15.csv')\n",
  85 |     "apr_ = pd.read_csv('./data/nba_savant/apr-14-15.csv')"
  86 |    ]
  87 |   },
  88 |   {
  89 |    "cell_type": "code",
  90 |    "execution_count": 590,
  91 |    "metadata": {
  92 |     "extensions": {
  93 |      "jupyter_dashboards": {
  94 |       "version": 1,
  95 |       "views": {
  96 |        "grid_default": {
  97 |         "hidden": true
  98 |        },
  99 |        "report_default": {
 100 |         "hidden": true
 101 |        }
 102 |       }
 103 |      }
 104 |     },
 105 |     "scrolled": true
 106 |    },
 107 |    "outputs": [],
 108 |    "source": [
 109 |     "df = pd.concat([oct_nov_,dec_,jan_,feb_,mar_,apr_])\n",
 110 |     "#reverse x values to plot correctly\n",
 111 |     "df.x = -df.x\n",
 112 |     "df.game_date = pd.to_datetime(df.game_date)\n",
 113 |     "df = df.reset_index(drop=True)"
 114 |    ]
 115 |   },
 116 |   {
 117 |    "cell_type": "code",
 118 |    "execution_count": 591,
 119 |    "metadata": {
 120 |     "extensions": {
 121 |      "jupyter_dashboards": {
 122 |       "version": 1,
 123 |       "views": {
 124 |        "grid_default": {
 125 |         "hidden": true
 126 |        },
 127 |        "report_default": {
 128 |         "hidden": true
 129 |        }
 130 |       }
 131 |      }
 132 |     },
 133 |     "scrolled": true
 134 |    },
 135 |    "outputs": [],
 136 |    "source": [
 137 |     "stats = pd.read_excel('./data/adv-stats-14-15.xlsx',index_col=0)"
 138 |    ]
 139 |   },
 140 |   {
 141 |    "cell_type": "code",
 142 |    "execution_count": 592,
 143 |    "metadata": {
 144 |     "extensions": {
 145 |      "jupyter_dashboards": {
 146 |       "version": 1,
 147 |       "views": {
 148 |        "grid_default": {
 149 |         "hidden": true
 150 |        },
 151 |        "report_default": {
 152 |         "hidden": true
 153 |        }
 154 |       }
 155 |      }
 156 |     },
 157 |     "scrolled": true
 158 |    },
 159 |    "outputs": [],
 160 |    "source": [
 161 |     "# Clean up name discrepancies between two dfs\n",
 162 |     "import re\n",
 163 |     "stats.Player = stats.Player.apply(lambda x: re.sub(r'([^\\s\\w]|_)+', '', x))\n",
 164 |     "df.name[df.name=='Jose Juan Barea'] = 'JJ Barea'\n",
 165 |     "df.name[df.name=='Tim Hardaway Jr'] = 'Tim Hardaway'\n",
 166 |     "df.name[df.name=='Charles Hayes'] = 'Chuck Hayes'\n",
 167 |     "df.name[df.name=='Glen Rice Jr'] = 'Glen Rice'\n",
 168 |     "df.name[df.name=='Louis Williams'] = 'Lou Williams'\n",
 169 |     "\n",
 170 |     "stats.Player[stats.Player=='Nene Hilario'] = 'Nene'\n",
 171 |     "stats.Player[stats.Player=='Jeffery Taylor'] = 'Jeff Taylor'\n",
 172 |     "stats.Player[stats.Player== 'Luigi Datome'] = 'Gigi Datome'"
 173 |    ]
 174 |   },
 175 |   {
 176 |    "cell_type": "code",
 177 |    "execution_count": 593,
 178 |    "metadata": {},
 179 |    "outputs": [],
 180 |    "source": [
 181 |     "# #convert defender name to first name last name format\n",
 182 |     "# df.defender_name[df.defender_name.isnull()] = 'None'\n",
 183 |     "\n",
 184 |     "# def convert_defender_names(player):\n",
 185 |     "#     if player =='None':\n",
 186 |     "#         return 'None'\n",
 187 |     "#     elif player=='Nene':\n",
 188 |     "#         return 'Nene'\n",
 189 |     "#     else:\n",
 190 |     "#         name = player.split(', ')\n",
 191 |     "#         full_name = ' '.join((name[1],name[0]))\n",
 192 |     "#         return re.sub(r'([^\\s\\w]|_)+', '', full_name)\n",
 193 |     "    \n",
 194 |     "# df.defender_name = df.defender_name.apply(convert_defender_names)\n",
 195 |     "\n",
 196 |     "# # Clean up name discrepancies between two dfs\n",
 197 |     "# df.defender_name[df.defender_name=='Jose Juan Barea'] = 'JJ Barea'\n",
 198 |     "# df.defender_name[df.defender_name=='Tim Hardaway Jr'] = 'Tim Hardaway'\n",
 199 |     "# df.defender_name[df.defender_name=='Charles Hayes'] = 'Chuck Hayes'\n",
 200 |     "# df.defender_name[df.defender_name=='Glen Rice Jr'] = 'Glen Rice'\n",
 201 |     "# df.defender_name[df.defender_name=='Louis Williams'] = 'Lou Williams'"
 202 |    ]
 203 |   },
 204 |   {
 205 |    "cell_type": "code",
 206 |    "execution_count": 594,
 207 |    "metadata": {},
 208 |    "outputs": [],
 209 |    "source": [
 210 |     "# #map player ids to new df column matching to defender name\n",
 211 |     "# player_ids_df = df[['name','player_id']].rename(columns={'name':'defender_name','player_id':'defender_id'})\n",
 212 |     "# player_ids_df = player_ids_df.groupby('defender_name').max()\n",
 213 |     "\n",
 214 |     "# df = df.merge(player_ids_df, on='defender_name')"
 215 |    ]
 216 |   },
 217 |   {
 218 |    "cell_type": "code",
 219 |    "execution_count": 595,
 220 |    "metadata": {
 221 |     "extensions": {
 222 |      "jupyter_dashboards": {
 223 |       "version": 1,
 224 |       "views": {
 225 |        "grid_default": {
 226 |         "hidden": true
 227 |        },
 228 |        "report_default": {
 229 |         "hidden": true
 230 |        }
 231 |       }
 232 |      }
 233 |     },
 234 |     "scrolled": true
 235 |    },
 236 |    "outputs": [],
 237 |    "source": [
 238 |     "df.shot_type = np.where(df.shot_type=='2PT Field Goal', 2, 3)"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "code",
 243 |    "execution_count": 596,
 244 |    "metadata": {
 245 |     "extensions": {
 246 |      "jupyter_dashboards": {
 247 |       "version": 1,
 248 |       "views": {
 249 |        "grid_default": {
 250 |         "hidden": true
 251 |        },
 252 |        "report_default": {}
 253 |       }
 254 |      }
 255 |     },
 256 |     "scrolled": true
 257 |    },
 258 |    "outputs": [],
 259 |    "source": [
 260 |     "def get_shot_distance(x,y):\n",
 261 |     "    x_squared=x**2\n",
 262 |     "    y_squared=y**2\n",
 263 |     "    shot_distance = math.sqrt(x_squared + y_squared) / 10  # unit for distance is off by factor of 10, divide by 10 to convert to feet\n",
 264 |     "    return round(shot_distance, 1)"
 265 |    ]
 266 |   },
 267 |   {
 268 |    "cell_type": "code",
 269 |    "execution_count": 598,
 270 |    "metadata": {
 271 |     "extensions": {
 272 |      "jupyter_dashboards": {
 273 |       "version": 1,
 274 |       "views": {
 275 |        "grid_default": {
 276 |         "hidden": true
 277 |        },
 278 |        "report_default": {}
 279 |       }
 280 |      }
 281 |     },
 282 |     "scrolled": true
 283 |    },
 284 |    "outputs": [],
 285 |    "source": [
 286 |     "def get_shot_zone(row):\n",
 287 |     "    x = row.x\n",
 288 |     "    y = row.y\n",
 289 |     "    \n",
 290 |     "    shot_zone = ''\n",
 291 |     "    shot_area = ''\n",
 292 |     "    \n",
 293 |     "    #restricted area, shots within 4ft of hoop\n",
 294 |     "    if get_shot_distance(x,y)<=4:\n",
 295 |     "        shot_zone = 'Restricted Area'\n",
 296 |     "       \n",
 297 |     "    #above break 3 pointers\n",
 298 |     "    elif (get_shot_distance(x,y)>=23.9) & (y>=92.5):\n",
 299 |     "        shot_zone = 'Above Break 3'\n",
 300 |     "    #corner 3s    \n",
 301 |     "    elif (y<92.5) & ((x<=-220) | (x>=220)):\n",
 302 |     "        shot_zone = 'Corner 3'\n",
 303 |     "    #in the paint shots excluding restricted area  \n",
 304 |     "    elif (-80<=x<=80) & (-47.5<=y<=143.5) & (get_shot_distance(x,y)>4):\n",
 305 |     "        shot_zone = 'Paint'\n",
 306 |     "    #mid range shots, left and right side\n",
 307 |     "    elif (get_shot_distance(x,y)<23.9) & ((-220<x<-80) | (80<x<220)):\n",
 308 |     "        shot_zone = 'Mid Range'\n",
 309 |     "    #mid range shots, center (above foul line)\n",
 310 |     "    else:\n",
 311 |     "        shot_zone = 'Mid Range'\n",
 312 |     "    \n",
 313 |     "    #heaves (defined as shots 35+ feet from basket)\n",
 314 |     "    if get_shot_distance(x,y)>35:\n",
 315 |     "        shot_zone = 'Heave'\n",
 316 |     "    \n",
 317 |     "    #Get area of court (left, right, or center)\n",
 318 |     "    if shot_zone !='Paint':\n",
 319 |     "        if (x <= 80) & (x>=-80):\n",
 320 |     "            shot_area = 'C'\n",
 321 |     "        elif (x>80):\n",
 322 |     "            shot_area = 'L'\n",
 323 |     "        else:\n",
 324 |     "            shot_area = 'R'      \n",
 325 |     "    #for shots in paint, they have special designation for left, right, and center\n",
 326 |     "    else:\n",
 327 |     "        if x>40:\n",
 328 |     "            shot_area = 'L'\n",
 329 |     "        elif x<-40:\n",
 330 |     "            shot_area = 'R'\n",
 331 |     "        else:\n",
 332 |     "            shot_area = 'C'\n",
 333 |     "    return shot_zone, shot_area"
 334 |    ]
 335 |   },
 336 |   {
 337 |    "cell_type": "code",
 338 |    "execution_count": 599,
 339 |    "metadata": {
 340 |     "extensions": {
 341 |      "jupyter_dashboards": {
 342 |       "version": 1,
 343 |       "views": {
 344 |        "grid_default": {
 345 |         "hidden": true
 346 |        },
 347 |        "report_default": {}
 348 |       }
 349 |      }
 350 |     },
 351 |     "scrolled": true
 352 |    },
 353 |    "outputs": [],
 354 |    "source": [
 355 |     "def add_shot_zones_area_to_df(df):\n",
 356 |     "    shot_zones = []\n",
 357 |     "    shot_areas = []\n",
 358 |     "\n",
 359 |     "    for index, row in df.iterrows():\n",
 360 |     "        shot_zones.append(get_shot_zone(row)[0])\n",
 361 |     "        shot_areas.append(get_shot_zone(row)[1])\n",
 362 |     "\n",
 363 |     "    df['shot_zone'] = shot_zones\n",
 364 |     "    df['shot_area'] = shot_areas\n",
 365 |     "\n",
 366 |     "add_shot_zones_area_to_df(df)   "
 367 |    ]
 368 |   },
 369 |   {
 370 |    "cell_type": "code",
 371 |    "execution_count": 600,
 372 |    "metadata": {
 373 |     "extensions": {
 374 |      "jupyter_dashboards": {
 375 |       "version": 1,
 376 |       "views": {
 377 |        "grid_default": {
 378 |         "col": 0,
 379 |         "height": 10,
 380 |         "hidden": false,
 381 |         "row": 61,
 382 |         "width": 12
 383 |        },
 384 |        "report_default": {}
 385 |       }
 386 |      }
 387 |     },
 388 |     "scrolled": true
 389 |    },
 390 |    "outputs": [
 391 |     {
 392 |      "data": {
 393 |       "text/html": [
 394 |        "<div>\n",
 395 |        "<style scoped>\n",
 396 |        "    .dataframe tbody tr th:only-of-type {\n",
 397 |        "        vertical-align: middle;\n",
 398 |        "    }\n",
 399 |        "\n",
 400 |        "    .dataframe tbody tr th {\n",
 401 |        "        vertical-align: top;\n",
 402 |        "    }\n",
 403 |        "\n",
 404 |        "    .dataframe thead th {\n",
 405 |        "        text-align: right;\n",
 406 |        "    }\n",
 407 |        "</style>\n",
 408 |        "<table border=\"1\" class=\"dataframe\">\n",
 409 |        "  <thead>\n",
 410 |        "    <tr style=\"text-align: right;\">\n",
 411 |        "      <th></th>\n",
 412 |        "      <th>name</th>\n",
 413 |        "      <th>team_name</th>\n",
 414 |        "      <th>game_date</th>\n",
 415 |        "      <th>season</th>\n",
 416 |        "      <th>espn_player_id</th>\n",
 417 |        "      <th>team_id</th>\n",
 418 |        "      <th>espn_game_id</th>\n",
 419 |        "      <th>period</th>\n",
 420 |        "      <th>minutes_remaining</th>\n",
 421 |        "      <th>seconds_remaining</th>\n",
 422 |        "      <th>shot_made_flag</th>\n",
 423 |        "      <th>action_type</th>\n",
 424 |        "      <th>shot_type</th>\n",
 425 |        "      <th>shot_distance</th>\n",
 426 |        "      <th>opponent</th>\n",
 427 |        "      <th>x</th>\n",
 428 |        "      <th>y</th>\n",
 429 |        "      <th>dribbles</th>\n",
 430 |        "      <th>touch_time</th>\n",
 431 |        "      <th>defender_name</th>\n",
 432 |        "      <th>defender_distance</th>\n",
 433 |        "      <th>shot_clock</th>\n",
 434 |        "      <th>shot_zone</th>\n",
 435 |        "      <th>shot_area</th>\n",
 436 |        "    </tr>\n",
 437 |        "  </thead>\n",
 438 |        "  <tbody>\n",
 439 |        "    <tr>\n",
 440 |        "      <th>205545</th>\n",
 441 |        "      <td>Evan Turner</td>\n",
 442 |        "      <td>Boston Celtics</td>\n",
 443 |        "      <td>2015-04-08</td>\n",
 444 |        "      <td>2014</td>\n",
 445 |        "      <td>4239.0</td>\n",
 446 |        "      <td>1610612738</td>\n",
 447 |        "      <td>400579456.0</td>\n",
 448 |        "      <td>1</td>\n",
 449 |        "      <td>10</td>\n",
 450 |        "      <td>29</td>\n",
 451 |        "      <td>1</td>\n",
 452 |        "      <td>Turnaround Jump Shot</td>\n",
 453 |        "      <td>2</td>\n",
 454 |        "      <td>13</td>\n",
 455 |        "      <td>Detroit Pistons</td>\n",
 456 |        "      <td>114</td>\n",
 457 |        "      <td>64</td>\n",
 458 |        "      <td>5</td>\n",
 459 |        "      <td>4.3</td>\n",
 460 |        "      <td>Monroe, Greg</td>\n",
 461 |        "      <td>4.9</td>\n",
 462 |        "      <td>8.0</td>\n",
 463 |        "      <td>Mid Range</td>\n",
 464 |        "      <td>L</td>\n",
 465 |        "    </tr>\n",
 466 |        "    <tr>\n",
 467 |        "      <th>205546</th>\n",
 468 |        "      <td>PJ Tucker</td>\n",
 469 |        "      <td>Phoenix Suns</td>\n",
 470 |        "      <td>2015-04-08</td>\n",
 471 |        "      <td>2014</td>\n",
 472 |        "      <td>3033.0</td>\n",
 473 |        "      <td>1610612756</td>\n",
 474 |        "      <td>400579463.0</td>\n",
 475 |        "      <td>1</td>\n",
 476 |        "      <td>9</td>\n",
 477 |        "      <td>23</td>\n",
 478 |        "      <td>0</td>\n",
 479 |        "      <td>Turnaround Jump Shot</td>\n",
 480 |        "      <td>2</td>\n",
 481 |        "      <td>7</td>\n",
 482 |        "      <td>Dallas Mavericks</td>\n",
 483 |        "      <td>-73</td>\n",
 484 |        "      <td>26</td>\n",
 485 |        "      <td>1</td>\n",
 486 |        "      <td>1.6</td>\n",
 487 |        "      <td>Rondo, Rajon</td>\n",
 488 |        "      <td>2.9</td>\n",
 489 |        "      <td>17.7</td>\n",
 490 |        "      <td>Paint</td>\n",
 491 |        "      <td>R</td>\n",
 492 |        "    </tr>\n",
 493 |        "    <tr>\n",
 494 |        "      <th>205547</th>\n",
 495 |        "      <td>Dion Waiters</td>\n",
 496 |        "      <td>Oklahoma City Thunder</td>\n",
 497 |        "      <td>2015-04-01</td>\n",
 498 |        "      <td>2014</td>\n",
 499 |        "      <td>6628.0</td>\n",
 500 |        "      <td>1610612760</td>\n",
 501 |        "      <td>NaN</td>\n",
 502 |        "      <td>1</td>\n",
 503 |        "      <td>10</td>\n",
 504 |        "      <td>37</td>\n",
 505 |        "      <td>0</td>\n",
 506 |        "      <td>Turnaround Jump Shot</td>\n",
 507 |        "      <td>2</td>\n",
 508 |        "      <td>6</td>\n",
 509 |        "      <td>Dallas Mavericks</td>\n",
 510 |        "      <td>-67</td>\n",
 511 |        "      <td>-2</td>\n",
 512 |        "      <td>3</td>\n",
 513 |        "      <td>5.1</td>\n",
 514 |        "      <td>Nowitzki, Dirk</td>\n",
 515 |        "      <td>2.5</td>\n",
 516 |        "      <td>11.2</td>\n",
 517 |        "      <td>Paint</td>\n",
 518 |        "      <td>R</td>\n",
 519 |        "    </tr>\n",
 520 |        "    <tr>\n",
 521 |        "      <th>205548</th>\n",
 522 |        "      <td>Dante Exum</td>\n",
 523 |        "      <td>Utah Jazz</td>\n",
 524 |        "      <td>2015-04-08</td>\n",
 525 |        "      <td>2014</td>\n",
 526 |        "      <td>3102528.0</td>\n",
 527 |        "      <td>1610612762</td>\n",
 528 |        "      <td>400579462.0</td>\n",
 529 |        "      <td>1</td>\n",
 530 |        "      <td>2</td>\n",
 531 |        "      <td>58</td>\n",
 532 |        "      <td>0</td>\n",
 533 |        "      <td>Turnaround Jump Shot</td>\n",
 534 |        "      <td>2</td>\n",
 535 |        "      <td>8</td>\n",
 536 |        "      <td>Sacramento Kings</td>\n",
 537 |        "      <td>71</td>\n",
 538 |        "      <td>48</td>\n",
 539 |        "      <td>4</td>\n",
 540 |        "      <td>5.5</td>\n",
 541 |        "      <td>Landry, Carl</td>\n",
 542 |        "      <td>4.4</td>\n",
 543 |        "      <td>14.3</td>\n",
 544 |        "      <td>Paint</td>\n",
 545 |        "      <td>L</td>\n",
 546 |        "    </tr>\n",
 547 |        "    <tr>\n",
 548 |        "      <th>205549</th>\n",
 549 |        "      <td>Jason Smith</td>\n",
 550 |        "      <td>New York Knicks</td>\n",
 551 |        "      <td>2015-04-08</td>\n",
 552 |        "      <td>2014</td>\n",
 553 |        "      <td>3232.0</td>\n",
 554 |        "      <td>1610612752</td>\n",
 555 |        "      <td>400579457.0</td>\n",
 556 |        "      <td>2</td>\n",
 557 |        "      <td>3</td>\n",
 558 |        "      <td>32</td>\n",
 559 |        "      <td>1</td>\n",
 560 |        "      <td>Turnaround Jump Shot</td>\n",
 561 |        "      <td>2</td>\n",
 562 |        "      <td>7</td>\n",
 563 |        "      <td>Indiana Pacers</td>\n",
 564 |        "      <td>73</td>\n",
 565 |        "      <td>-24</td>\n",
 566 |        "      <td>4</td>\n",
 567 |        "      <td>5.1</td>\n",
 568 |        "      <td>Allen, Lavoy</td>\n",
 569 |        "      <td>4.7</td>\n",
 570 |        "      <td>3.7</td>\n",
 571 |        "      <td>Paint</td>\n",
 572 |        "      <td>L</td>\n",
 573 |        "    </tr>\n",
 574 |        "  </tbody>\n",
 575 |        "</table>\n",
 576 |        "</div>"
 577 |       ],
 578 |       "text/plain": [
 579 |        "                name              team_name  game_date  season  \\\n",
 580 |        "205545   Evan Turner         Boston Celtics 2015-04-08    2014   \n",
 581 |        "205546     PJ Tucker           Phoenix Suns 2015-04-08    2014   \n",
 582 |        "205547  Dion Waiters  Oklahoma City Thunder 2015-04-01    2014   \n",
 583 |        "205548    Dante Exum              Utah Jazz 2015-04-08    2014   \n",
 584 |        "205549   Jason Smith        New York Knicks 2015-04-08    2014   \n",
 585 |        "\n",
 586 |        "        espn_player_id     team_id  espn_game_id  period  minutes_remaining  \\\n",
 587 |        "205545          4239.0  1610612738   400579456.0       1                 10   \n",
 588 |        "205546          3033.0  1610612756   400579463.0       1                  9   \n",
 589 |        "205547          6628.0  1610612760           NaN       1                 10   \n",
 590 |        "205548       3102528.0  1610612762   400579462.0       1                  2   \n",
 591 |        "205549          3232.0  1610612752   400579457.0       2                  3   \n",
 592 |        "\n",
 593 |        "        seconds_remaining  shot_made_flag           action_type  shot_type  \\\n",
 594 |        "205545                 29               1  Turnaround Jump Shot          2   \n",
 595 |        "205546                 23               0  Turnaround Jump Shot          2   \n",
 596 |        "205547                 37               0  Turnaround Jump Shot          2   \n",
 597 |        "205548                 58               0  Turnaround Jump Shot          2   \n",
 598 |        "205549                 32               1  Turnaround Jump Shot          2   \n",
 599 |        "\n",
 600 |        "        shot_distance          opponent    x   y  dribbles  touch_time  \\\n",
 601 |        "205545             13   Detroit Pistons  114  64         5         4.3   \n",
 602 |        "205546              7  Dallas Mavericks  -73  26         1         1.6   \n",
 603 |        "205547              6  Dallas Mavericks  -67  -2         3         5.1   \n",
 604 |        "205548              8  Sacramento Kings   71  48         4         5.5   \n",
 605 |        "205549              7    Indiana Pacers   73 -24         4         5.1   \n",
 606 |        "\n",
 607 |        "         defender_name  defender_distance  shot_clock  shot_zone shot_area  \n",
 608 |        "205545    Monroe, Greg                4.9         8.0  Mid Range         L  \n",
 609 |        "205546    Rondo, Rajon                2.9        17.7      Paint         R  \n",
 610 |        "205547  Nowitzki, Dirk                2.5        11.2      Paint         R  \n",
 611 |        "205548    Landry, Carl                4.4        14.3      Paint         L  \n",
 612 |        "205549    Allen, Lavoy                4.7         3.7      Paint         L  "
 613 |       ]
 614 |      },
 615 |      "execution_count": 600,
 616 |      "metadata": {},
 617 |      "output_type": "execute_result"
 618 |     }
 619 |    ],
 620 |    "source": [
 621 |     "df.tail()"
 622 |    ]
 623 |   },
 624 |   {
 625 |    "cell_type": "code",
 626 |    "execution_count": 601,
 627 |    "metadata": {
 628 |     "extensions": {
 629 |      "jupyter_dashboards": {
 630 |       "version": 1,
 631 |       "views": {
 632 |        "grid_default": {
 633 |         "hidden": true
 634 |        },
 635 |        "report_default": {}
 636 |       }
 637 |      }
 638 |     },
 639 |     "scrolled": true
 640 |    },
 641 |    "outputs": [],
 642 |    "source": [
 643 |     "def get_lg_avgs(shot_zone_area_tup, df):\n",
 644 |     "    #make rate over df for one (shot_zone, shot_area) pair, rounded to 4 decimals; 0 if there are no attempts\n",
 645 |     "    zone, area = shot_zone_area_tup[0], shot_zone_area_tup[1]\n",
 646 |     "    in_region = (df.shot_zone==zone) & (df.shot_area==area)\n",
 647 |     "    attempts = len(df[in_region])\n",
 648 |     "    if attempts == 0:\n",
 649 |     "        return 0\n",
 650 |     "    #makes are the attempts in this region whose shot_made_flag equals 1\n",
 651 |     "    makes = len(df[in_region & (df.shot_made_flag==1)])\n",
 652 |     "    return round((makes / attempts),4)"
 653 |    ]
 654 |   },
 655 |   {
 656 |    "cell_type": "code",
 657 |    "execution_count": 602,
 658 |    "metadata": {
 659 |     "extensions": {
 660 |      "jupyter_dashboards": {
 661 |       "version": 1,
 662 |       "views": {
 663 |        "grid_default": {
 664 |         "hidden": true
 665 |        },
 666 |        "report_default": {}
 667 |       }
 668 |      }
 669 |     },
 670 |     "scrolled": true
 671 |    },
 672 |    "outputs": [],
 673 |    "source": [
 674 |     "#all combinations of the distinct shot zones and shot areas\n",
 675 |     "sz = set(shot_zones)\n",
 676 |     "sa = set(shot_areas)\n",
 677 |     "sza_tups = list(itertools.product(sz,sa))\n",
 678 |     "\n",
 679 |     "#make rate from get_lg_avgs, keyed by (shot_zone, shot_area)\n",
 680 |     "sza_dict = {sza: get_lg_avgs(sza, df) for sza in sza_tups}"
 681 |    ]
 682 |   },
 683 |   {
 684 |    "cell_type": "code",
 685 |    "execution_count": 603,
 686 |    "metadata": {
 687 |     "scrolled": true
 688 |    },
 689 |    "outputs": [],
 690 |    "source": [
 691 |     "def add_lg_avg_to_df(df):\n",
 692 |     "    df['lg_avg']=0\n",
 693 |     "    for (zone, area), avg in sza_dict.items():\n",
 694 |     "        df['lg_avg'] = np.where((df.shot_zone==zone) & (df.shot_area==area), avg, df['lg_avg'])"
 695 |    ]
 696 |   },
 697 |   {
 698 |    "cell_type": "code",
 699 |    "execution_count": 604,
 700 |    "metadata": {
 701 |     "scrolled": true
 702 |    },
 703 |    "outputs": [],
 704 |    "source": [
 705 |     "add_lg_avg_to_df(df)"
 706 |    ]
 707 |   },
 708 |   {
 709 |    "cell_type": "code",
 710 |    "execution_count": 607,
 711 |    "metadata": {
 712 |     "extensions": {
 713 |      "jupyter_dashboards": {
 714 |       "version": 1,
 715 |       "views": {
 716 |        "grid_default": {
 717 |         "hidden": true
 718 |        },
 719 |        "report_default": {}
 720 |       }
 721 |      }
 722 |     },
 723 |     "scrolled": true
 724 |    },
 725 |    "outputs": [],
 726 |    "source": [
 727 |     "def create_team_ids(df):\n",
 728 |     "    #overwrites df.team_id with a small integer id per team_name and adds opp_id for the opponent\n",
 729 |     "    #(opp_id stays 0 when the opponent is not among the team names); returns the name -> id dict.\n",
 730 |     "    #ids follow alphabetical order so they are identical on every run (iterating a set of strings is not)\n",
 731 |     "    team_id_dict = {team: id_+1 for id_, team in enumerate(sorted(df.team_name.dropna().unique()))}\n",
 732 |     "    df['opp_id']=0\n",
 733 |     "    #write the ids (1-30 for a full league) onto the rows of each team / opponent\n",
 734 |     "    for k,v in team_id_dict.items():\n",
 735 |     "        df['team_id'] = np.where(df.team_name==k, v, df['team_id'])\n",
 736 |     "        df['opp_id'] = np.where(df.opponent==k, v, df['opp_id'])\n",
 737 |     "    return team_id_dict\n",
 738 |     "\n",
 739 |     "create_team_ids(df)"
 740 |    ]
 741 |   },
 742 |   {
 743 |    "cell_type": "code",
 744 |    "execution_count": 608,
 745 |    "metadata": {
 746 |     "scrolled": true
 747 |    },
 748 |    "outputs": [],
 749 |    "source": [
 750 |     "#df.groupby(by=['game_date','team_id','opp_id']).mean()"
 751 |    ]
 752 |   },
 753 |   {
 754 |    "cell_type": "code",
 755 |    "execution_count": 609,
 756 |    "metadata": {},
 757 |    "outputs": [],
 758 |    "source": [
 759 |     "nba_shots = pd.read_csv('./data/shots_1415.csv',index_col=0)\n",
 760 |     "\n",
 761 |     "#Adds dashes to a YYYYMMDD date string (-> YYYY-MM-DD) so it can be converted to datetime format\n",
 762 |     "def add_dashes(string):\n",
 763 |     "    return '-'.join([string[:4], string[4:6], string[-2:]])\n",
 764 |     "\n",
 765 |     "#GAME_DATE: cast to str, insert the dashes, then parse as datetime\n",
 766 |     "nba_shots.GAME_DATE = pd.to_datetime(nba_shots.GAME_DATE.astype('str').apply(add_dashes))\n",
 767 |     "\n",
 768 |     "#negate LOC_X; it is joined against df.x in the merge, presumably so both use the same left/right orientation\n",
 769 |     "nba_shots.LOC_X = -nba_shots.LOC_X"
 770 |    ]
 771 |   },
 772 |   {
 773 |    "cell_type": "markdown",
 774 |    "metadata": {},
 775 |    "source": [
 776 |     "### Merge Dataframes"
 777 |    ]
 778 |   },
 779 |   {
 780 |    "cell_type": "code",
 781 |    "execution_count": 610,
 782 |    "metadata": {},
 783 |    "outputs": [],
 784 |    "source": [
 785 |     "#match every df shot to its nba_shots row on team, date, game clock and court location (default inner join),\n",
 786 |     "#then drop the listed columns from the merged result\n",
 787 |     "merged_df = df.merge(nba_shots, left_on=['team_name','game_date','period','minutes_remaining','seconds_remaining','x','y'],\n",
 788 |     "              right_on=['TEAM_NAME','GAME_DATE','PERIOD','MINUTES_REMAINING','SECONDS_REMAINING','LOC_X','LOC_Y']).drop(\n",
 789 |     "    columns=['GRID_TYPE','PLAYER_NAME', 'TEAM_ID', 'TEAM_NAME', 'PERIOD', 'MINUTES_REMAINING', 'SECONDS_REMAINING',\n",
 790 |     "       'SHOT_DISTANCE','LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE', 'espn_player_id',\n",
 791 |     "       'espn_game_id', 'EVENT_TYPE','ACTION_TYPE', 'SHOT_TYPE','SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE'])"
 792 |    ]
 793 |   },
 794 |   {
 795 |    "cell_type": "code",
 796 |    "execution_count": 611,
 797 |    "metadata": {
 798 |     "scrolled": true
 799 |    },
 800 |    "outputs": [],
 801 |    "source": [
 802 |     "#map full team names to the acronyms used in HTM by pairing the two alphabetically sorted lists\n",
 803 |     "def create_home_acronym_dict():\n",
 804 |     "    names_sorted = sorted(merged_df.team_name.unique())\n",
 805 |     "    acronyms_sorted = sorted(merged_df.HTM.unique())\n",
 806 |     "    team_name_ac_dict = {name: ac for name, ac in zip(names_sorted, acronyms_sorted)}\n",
 807 |     "\n",
 808 |     "    #BKN sorts before BOS but Boston sorts before Brooklyn, so these two are set explicitly\n",
 809 |     "    team_name_ac_dict.update({'Boston Celtics': 'BOS', 'Brooklyn Nets': 'BKN'})\n",
 810 |     "    return team_name_ac_dict"
 811 |    ]
 812 |   },
 813 |   {
 814 |    "cell_type": "code",
 815 |    "execution_count": 612,
 816 |    "metadata": {
 817 |     "scrolled": false
 818 |    },
 819 |    "outputs": [
 820 |     {
 821 |      "name": "stdout",
 822 |      "output_type": "stream",
 823 |      "text": [
 824 |       "Runtime: 2.05 seconds. 205539 iterations to go.\n",
 825 |       "Runtime: 8.47 seconds. 155539 iterations to go.\n",
 826 |       "Runtime: 16.45 seconds. 105539 iterations to go.\n",
 827 |       "Runtime: 23.53 seconds. 55539 iterations to go.\n",
 828 |       "Runtime: 31.85 seconds. 5539 iterations to go.\n"
 829 |      ]
 830 |     }
 831 |    ],
 832 |    "source": [
 833 |     "def get_home_team():\n",
 834 |     "    #returns a list with one flag per merged_df row: 1 if the acronym of the row's team_name equals HTM, else 0\n",
 835 |     "    start = time.time()\n",
 836 |     "    team_name_ac_dict=create_home_acronym_dict()\n",
 837 |     "    n_rows = len(merged_df)\n",
 838 |     "\n",
 839 |     "    is_home_arr = []\n",
 840 |     "    for index, row in merged_df.iterrows():\n",
 841 |     "        is_home_arr.append(1 if team_name_ac_dict[row.team_name]==row.HTM else 0)\n",
 842 |     "        #progress report whenever the row label is a multiple of 50,000\n",
 843 |     "        if index%50000==0:\n",
 844 |     "            elapsed = round(time.time()-start,2)\n",
 845 |     "            print('Runtime: {} seconds. {} iterations to go.'.format(elapsed, n_rows-index))\n",
 846 |     "    return is_home_arr\n",
 847 |     "\n",
 848 |     "merged_df['is_home'] = get_home_team()"
 849 |    ]
 850 |   },
 851 |   {
 852 |    "cell_type": "code",
 853 |    "execution_count": 613,
 854 |    "metadata": {
 855 |     "scrolled": true
 856 |    },
 857 |    "outputs": [],
 858 |    "source": [
 859 |     "#sort the dataframe by date, game_id, player_name, and game_event_id\n",
 860 |     "sorted_df = merged_df.copy().sort_values(by=['game_date','GAME_ID','name','GAME_EVENT_ID']).reset_index(drop=True)\n",
 861 |     "\n",
 862 |     "#computes, for every shot, whether the player made his previous 1, 2, or 3 shots in the same game\n",
 863 |     "def is_player_hot(dataframe):\n",
 864 |     "    #dataframe: shots with columns name, GAME_ID and shot_made_flag, ordered so that each player's shots\n",
 865 |     "    #           in a game sit on consecutive rows (sorted by game_date, GAME_ID, name, GAME_EVENT_ID)\n",
 866 |     "    #returns: float array of shape (len(dataframe), 3). Column k (0-based) is 1 when each of the k+1 rows\n",
 867 |     "    #         directly above is a made shot by the same player in the same game, otherwise 0.\n",
 868 |     "    #Example for one player in one game whose shots go made, made, missed, made:\n",
 869 |     "    #    shot 1 -> [0,0,0], shot 2 -> [1,0,0], shot 3 -> [1,1,0], shot 4 -> [0,0,0]\n",
 870 |     "    #Every earlier row is checked for player, game and make on its own, so a make by a different player\n",
 871 |     "    #or from a different game can never extend a streak.\n",
 872 |     "    #Rows are addressed by position, so the index of dataframe does not matter.\n",
 873 |     "    start=time.time()\n",
 874 |     "\n",
 875 |     "    #pull the columns out as numpy arrays once; indexing them in the loop is far cheaper than df.col[label]\n",
 876 |     "    names = dataframe.name.values\n",
 877 |     "    games = dataframe.GAME_ID.values\n",
 878 |     "    made = dataframe.shot_made_flag.values\n",
 879 |     "    n_shots = len(dataframe)\n",
 880 |     "\n",
 881 |     "    #create array that stores whether previous 1, 2, or 3 shots were made, respectively\n",
 882 |     "    heat_check_array=np.zeros((n_shots,3))\n",
 883 |     "\n",
 884 |     "    for index in range(n_shots):\n",
 885 |     "        #walk back over at most 3 earlier rows and stop at the first one that breaks the streak\n",
 886 |     "        #(the first rows simply have fewer earlier rows to look at, hence min(index,3))\n",
 887 |     "        for back in range(1, min(index,3)+1):\n",
 888 |     "            prev = index-back\n",
 889 |     "            #a different player, a different game or a miss ends the streak\n",
 890 |     "            if (names[prev]!=names[index]) or (games[prev]!=games[index]) or (made[prev]!=1):\n",
 891 |     "                break\n",
 892 |     "            heat_check_array[index,back-1]=1\n",
 893 |     "\n",
 894 |     "        #progress report every 50,000 rows\n",
 895 |     "        if index%50000==0:\n",
 896 |     "            print('Runtime: {} seconds. {} iterations remaining.'.format(round(time.time()-start,2),n_shots-index))\n",
 897 |     "\n",
 898 |     "    return heat_check_array"
 899 |    ]
 900 |   },
 901 |   {
 902 |    "cell_type": "code",
 903 |    "execution_count": 614,
 904 |    "metadata": {
 905 |     "scrolled": true
 906 |    },
 907 |    "outputs": [
 908 |     {
 909 |      "name": "stdout",
 910 |      "output_type": "stream",
 911 |      "text": [
 912 |       "Runtime: 2.21 seconds. 205539 iterations remaining.\n",
 913 |       "Runtime: 38.93 seconds. 155539 iterations remaining.\n",
 914 |       "Runtime: 75.29 seconds. 105539 iterations remaining.\n",
 915 |       "Runtime: 117.37 seconds. 55539 iterations remaining.\n",
 916 |       "Runtime: 157.18 seconds. 5539 iterations remaining.\n"
 917 |      ]
 918 |     }
 919 |    ],
 920 |    "source": [
 921 |     "heat_check_array = is_player_hot(sorted_df)"
 922 |    ]
 923 |   },
 924 |   {
 925 |    "cell_type": "code",
 926 |    "execution_count": 632,
 927 |    "metadata": {},
 928 |    "outputs": [
 929 |     {
 930 |      "data": {
 931 |       "text/html": [
 932 |        "<div>\n",
 933 |        "<style scoped>\n",
 934 |        "    .dataframe tbody tr th:only-of-type {\n",
 935 |        "        vertical-align: middle;\n",
 936 |        "    }\n",
 937 |        "\n",
 938 |        "    .dataframe tbody tr th {\n",
 939 |        "        vertical-align: top;\n",
 940 |        "    }\n",
 941 |        "\n",
 942 |        "    .dataframe thead th {\n",
 943 |        "        text-align: right;\n",
 944 |        "    }\n",
 945 |        "</style>\n",
 946 |        "<table border=\"1\" class=\"dataframe\">\n",
 947 |        "  <thead>\n",
 948 |        "    <tr style=\"text-align: right;\">\n",
 949 |        "      <th></th>\n",
 950 |        "      <th>name</th>\n",
 951 |        "      <th>shot_made_flag</th>\n",
 952 |        "      <th>prev_shot_made</th>\n",
 953 |        "      <th>prev_2_made</th>\n",
 954 |        "      <th>prev_3_made</th>\n",
 955 |        "      <th>game_date</th>\n",
 956 |        "      <th>GAME_EVENT_ID</th>\n",
 957 |        "    </tr>\n",
 958 |        "  </thead>\n",
 959 |        "  <tbody>\n",
 960 |        "    <tr>\n",
 961 |        "      <th>210</th>\n",
 962 |        "      <td>Cory Joseph</td>\n",
 963 |        "      <td>1</td>\n",
 964 |        "      <td>0.0</td>\n",
 965 |        "      <td>0.0</td>\n",
 966 |        "      <td>0.0</td>\n",
 967 |        "      <td>2014-10-28</td>\n",
 968 |        "      <td>380</td>\n",
 969 |        "    </tr>\n",
 970 |        "    <tr>\n",
 971 |        "      <th>211</th>\n",
 972 |        "      <td>Cory Joseph</td>\n",
 973 |        "      <td>1</td>\n",
 974 |        "      <td>1.0</td>\n",
 975 |        "      <td>0.0</td>\n",
 976 |        "      <td>0.0</td>\n",
 977 |        "      <td>2014-10-28</td>\n",
 978 |        "      <td>387</td>\n",
 979 |        "    </tr>\n",
 980 |        "    <tr>\n",
 981 |        "      <th>212</th>\n",
 982 |        "      <td>Danny Green</td>\n",
 983 |        "      <td>0</td>\n",
 984 |        "      <td>0.0</td>\n",
 985 |        "      <td>0.0</td>\n",
 986 |        "      <td>0.0</td>\n",
 987 |        "      <td>2014-10-28</td>\n",
 988 |        "      <td>9</td>\n",
 989 |        "    </tr>\n",
 990 |        "    <tr>\n",
 991 |        "      <th>213</th>\n",
 992 |        "      <td>Danny Green</td>\n",
 993 |        "      <td>1</td>\n",
 994 |        "      <td>0.0</td>\n",
 995 |        "      <td>0.0</td>\n",
 996 |        "      <td>0.0</td>\n",
 997 |        "      <td>2014-10-28</td>\n",
 998 |        "      <td>15</td>\n",
 999 |        "    </tr>\n",
1000 |        "    <tr>\n",
1001 |        "      <th>214</th>\n",
1002 |        "      <td>Danny Green</td>\n",
1003 |        "      <td>1</td>\n",
1004 |        "      <td>1.0</td>\n",
1005 |        "      <td>0.0</td>\n",
1006 |        "      <td>0.0</td>\n",
1007 |        "      <td>2014-10-28</td>\n",
1008 |        "      <td>102</td>\n",
1009 |        "    </tr>\n",
1010 |        "    <tr>\n",
1011 |        "      <th>215</th>\n",
1012 |        "      <td>Danny Green</td>\n",
1013 |        "      <td>1</td>\n",
1014 |        "      <td>1.0</td>\n",
1015 |        "      <td>1.0</td>\n",
1016 |        "      <td>0.0</td>\n",
1017 |        "      <td>2014-10-28</td>\n",
1018 |        "      <td>132</td>\n",
1019 |        "    </tr>\n",
1020 |        "    <tr>\n",
1021 |        "      <th>216</th>\n",
1022 |        "      <td>Danny Green</td>\n",
1023 |        "      <td>0</td>\n",
1024 |        "      <td>1.0</td>\n",
1025 |        "      <td>1.0</td>\n",
1026 |        "      <td>1.0</td>\n",
1027 |        "      <td>2014-10-28</td>\n",
1028 |        "      <td>150</td>\n",
1029 |        "    </tr>\n",
1030 |        "    <tr>\n",
1031 |        "      <th>217</th>\n",
1032 |        "      <td>Danny Green</td>\n",
1033 |        "      <td>0</td>\n",
1034 |        "      <td>0.0</td>\n",
1035 |        "      <td>0.0</td>\n",
1036 |        "      <td>0.0</td>\n",
1037 |        "      <td>2014-10-28</td>\n",
1038 |        "      <td>175</td>\n",
1039 |        "    </tr>\n",
1040 |        "    <tr>\n",
1041 |        "      <th>218</th>\n",
1042 |        "      <td>Danny Green</td>\n",
1043 |        "      <td>1</td>\n",
1044 |        "      <td>0.0</td>\n",
1045 |        "      <td>0.0</td>\n",
1046 |        "      <td>0.0</td>\n",
1047 |        "      <td>2014-10-28</td>\n",
1048 |        "      <td>259</td>\n",
1049 |        "    </tr>\n",
1050 |        "    <tr>\n",
1051 |        "      <th>219</th>\n",
1052 |        "      <td>Danny Green</td>\n",
1053 |        "      <td>0</td>\n",
1054 |        "      <td>1.0</td>\n",
1055 |        "      <td>0.0</td>\n",
1056 |        "      <td>0.0</td>\n",
1057 |        "      <td>2014-10-28</td>\n",
1058 |        "      <td>284</td>\n",
1059 |        "    </tr>\n",
1060 |        "  </tbody>\n",
1061 |        "</table>\n",
1062 |        "</div>"
1063 |       ],
1064 |       "text/plain": [
1065 |        "            name  shot_made_flag  prev_shot_made  prev_2_made  prev_3_made  \\\n",
1066 |        "210  Cory Joseph               1             0.0          0.0          0.0   \n",
1067 |        "211  Cory Joseph               1             1.0          0.0          0.0   \n",
1068 |        "212  Danny Green               0             0.0          0.0          0.0   \n",
1069 |        "213  Danny Green               1             0.0          0.0          0.0   \n",
1070 |        "214  Danny Green               1             1.0          0.0          0.0   \n",
1071 |        "215  Danny Green               1             1.0          1.0          0.0   \n",
1072 |        "216  Danny Green               0             1.0          1.0          1.0   \n",
1073 |        "217  Danny Green               0             0.0          0.0          0.0   \n",
1074 |        "218  Danny Green               1             0.0          0.0          0.0   \n",
1075 |        "219  Danny Green               0             1.0          0.0          0.0   \n",
1076 |        "\n",
1077 |        "     game_date  GAME_EVENT_ID  \n",
1078 |        "210 2014-10-28            380  \n",
1079 |        "211 2014-10-28            387  \n",
1080 |        "212 2014-10-28              9  \n",
1081 |        "213 2014-10-28             15  \n",
1082 |        "214 2014-10-28            102  \n",
1083 |        "215 2014-10-28            132  \n",
1084 |        "216 2014-10-28            150  \n",
1085 |        "217 2014-10-28            175  \n",
1086 |        "218 2014-10-28            259  \n",
1087 |        "219 2014-10-28            284  "
1088 |       ]
1089 |      },
1090 |      "execution_count": 632,
1091 |      "metadata": {},
1092 |      "output_type": "execute_result"
1093 |     }
1094 |    ],
1095 |    "source": [
1096 |     "#add heat check stats to dataframe, one new column per column of heat_check_array\n",
1097 |     "sorted_df['prev_shot_made'], sorted_df['prev_2_made'], sorted_df['prev_3_made'] = heat_check_array.T\n",
1098 |     "\n",
1099 |     "#preview rows 210-219 with the new columns next to the shot result\n",
1100 |     "sorted_df.iloc[210:220][['name','shot_made_flag','prev_shot_made','prev_2_made','prev_3_made','game_date','GAME_EVENT_ID']]"
1101 |    ]
1102 |   },
1103 |   {
1104 |    "cell_type": "code",
1105 |    "execution_count": 37,
1106 |    "metadata": {
1107 |     "scrolled": true
1108 |    },
1109 |    "outputs": [
1110 |     {
1111 |      "data": {
1112 |       "text/html": [
1113 |        "<div>\n",
1114 |        "<style scoped>\n",
1115 |        "    .dataframe tbody tr th:only-of-type {\n",
1116 |        "        vertical-align: middle;\n",
1117 |        "    }\n",
1118 |        "\n",
1119 |        "    .dataframe tbody tr th {\n",
1120 |        "        vertical-align: top;\n",
1121 |        "    }\n",
1122 |        "\n",
1123 |        "    .dataframe thead th {\n",
1124 |        "        text-align: right;\n",
1125 |        "    }\n",
1126 |        "</style>\n",
1127 |        "<table border=\"1\" class=\"dataframe\">\n",
1128 |        "  <thead>\n",
1129 |        "    <tr style=\"text-align: right;\">\n",
1130 |        "      <th></th>\n",
1131 |        "      <th>name</th>\n",
1132 |        "      <th>team_name</th>\n",
1133 |        "      <th>game_date</th>\n",
1134 |        "      <th>season</th>\n",
1135 |        "      <th>team_id</th>\n",
1136 |        "      <th>period</th>\n",
1137 |        "      <th>minutes_remaining</th>\n",
1138 |        "      <th>seconds_remaining</th>\n",
1139 |        "      <th>shot_made_flag</th>\n",
1140 |        "      <th>action_type</th>\n",
1141 |        "      <th>shot_type</th>\n",
1142 |        "      <th>shot_distance</th>\n",
1143 |        "      <th>opponent</th>\n",
1144 |        "      <th>x</th>\n",
1145 |        "      <th>y</th>\n",
1146 |        "      <th>dribbles</th>\n",
1147 |        "      <th>touch_time</th>\n",
1148 |        "      <th>defender_name</th>\n",
1149 |        "      <th>defender_distance</th>\n",
1150 |        "      <th>shot_clock</th>\n",
1151 |        "      <th>shot_zone</th>\n",
1152 |        "      <th>shot_area</th>\n",
1153 |        "      <th>lg_avg</th>\n",
1154 |        "      <th>opp_id</th>\n",
1155 |        "      <th>GAME_ID</th>\n",
1156 |        "      <th>GAME_EVENT_ID</th>\n",
1157 |        "      <th>PLAYER_ID</th>\n",
1158 |        "      <th>HTM</th>\n",
1159 |        "      <th>VTM</th>\n",
1160 |        "      <th>is_home</th>\n",
1161 |        "      <th>prev_shot_made</th>\n",
1162 |        "      <th>prev_2_made</th>\n",
1163 |        "      <th>prev_3_made</th>\n",
1164 |        "    </tr>\n",
1165 |        "  </thead>\n",
1166 |        "  <tbody>\n",
1167 |        "    <tr>\n",
1168 |        "      <th>0</th>\n",
1169 |        "      <td>Aaron Gordon</td>\n",
1170 |        "      <td>Orlando Magic</td>\n",
1171 |        "      <td>2014-10-28</td>\n",
1172 |        "      <td>2014</td>\n",
1173 |        "      <td>10</td>\n",
1174 |        "      <td>2</td>\n",
1175 |        "      <td>11</td>\n",
1176 |        "      <td>34</td>\n",
1177 |        "      <td>1</td>\n",
1178 |        "      <td>Jump Shot</td>\n",
1179 |        "      <td>2</td>\n",
1180 |        "      <td>4</td>\n",
1181 |        "      <td>New Orleans Pelicans</td>\n",
1182 |        "      <td>-10</td>\n",
1183 |        "      <td>44</td>\n",
1184 |        "      <td>4</td>\n",
1185 |        "      <td>5.1</td>\n",
1186 |        "      <td>Anderson, Ryan</td>\n",
1187 |        "      <td>3.9</td>\n",
1188 |        "      <td>0.6</td>\n",
1189 |        "      <td>Paint</td>\n",
1190 |        "      <td>C</td>\n",
1191 |        "      <td>0.4011</td>\n",
1192 |        "      <td>13</td>\n",
1193 |        "      <td>21400001</td>\n",
1194 |        "      <td>164</td>\n",
1195 |        "      <td>203932</td>\n",
1196 |        "      <td>NOP</td>\n",
1197 |        "      <td>ORL</td>\n",
1198 |        "      <td>0</td>\n",
1199 |        "      <td>0</td>\n",
1200 |        "      <td>0</td>\n",
1201 |        "      <td>0</td>\n",
1202 |        "    </tr>\n",
1203 |        "    <tr>\n",
1204 |        "      <th>1</th>\n",
1205 |        "      <td>Aaron Gordon</td>\n",
1206 |        "      <td>Orlando Magic</td>\n",
1207 |        "      <td>2014-10-28</td>\n",
1208 |        "      <td>2014</td>\n",
1209 |        "      <td>10</td>\n",
1210 |        "      <td>2</td>\n",
1211 |        "      <td>9</td>\n",
1212 |        "      <td>13</td>\n",
1213 |        "      <td>1</td>\n",
1214 |        "      <td>Jump Shot</td>\n",
1215 |        "      <td>3</td>\n",
1216 |        "      <td>23</td>\n",
1217 |        "      <td>New Orleans Pelicans</td>\n",
1218 |        "      <td>-233</td>\n",
1219 |        "      <td>20</td>\n",
1220 |        "      <td>0</td>\n",
1221 |        "      <td>0.7</td>\n",
1222 |        "      <td>Evans, Tyreke</td>\n",
1223 |        "      <td>4.3</td>\n",
1224 |        "      <td>7.4</td>\n",
1225 |        "      <td>Corner 3</td>\n",
1226 |        "      <td>R</td>\n",
1227 |        "      <td>0.3915</td>\n",
1228 |        "      <td>13</td>\n",
1229 |        "      <td>21400001</td>\n",
1230 |        "      <td>198</td>\n",
1231 |        "      <td>203932</td>\n",
1232 |        "      <td>NOP</td>\n",
1233 |        "      <td>ORL</td>\n",
1234 |        "      <td>0</td>\n",
1235 |        "      <td>1</td>\n",
1236 |        "      <td>0</td>\n",
1237 |        "      <td>0</td>\n",
1238 |        "    </tr>\n",
1239 |        "    <tr>\n",
1240 |        "      <th>2</th>\n",
1241 |        "      <td>Aaron Gordon</td>\n",
1242 |        "      <td>Orlando Magic</td>\n",
1243 |        "      <td>2014-10-28</td>\n",
1244 |        "      <td>2014</td>\n",
1245 |        "      <td>10</td>\n",
1246 |        "      <td>2</td>\n",
1247 |        "      <td>2</td>\n",
1248 |        "      <td>55</td>\n",
1249 |        "      <td>0</td>\n",
1250 |        "      <td>Jump Shot</td>\n",
1251 |        "      <td>3</td>\n",
1252 |        "      <td>23</td>\n",
1253 |        "      <td>New Orleans Pelicans</td>\n",
1254 |        "      <td>-234</td>\n",
1255 |        "      <td>0</td>\n",
1256 |        "      <td>0</td>\n",
1257 |        "      <td>0.9</td>\n",
1258 |        "      <td>Gordon, Eric</td>\n",
1259 |        "      <td>12.5</td>\n",
1260 |        "      <td>14.8</td>\n",
1261 |        "      <td>Corner 3</td>\n",
1262 |        "      <td>R</td>\n",
1263 |        "      <td>0.3915</td>\n",
1264 |        "      <td>13</td>\n",
1265 |        "      <td>21400001</td>\n",
1266 |        "      <td>275</td>\n",
1267 |        "      <td>203932</td>\n",
1268 |        "      <td>NOP</td>\n",
1269 |        "      <td>ORL</td>\n",
1270 |        "      <td>0</td>\n",
1271 |        "      <td>1</td>\n",
1272 |        "      <td>1</td>\n",
1273 |        "      <td>0</td>\n",
1274 |        "    </tr>\n",
1275 |        "    <tr>\n",
1276 |        "      <th>3</th>\n",
1277 |        "      <td>Aaron Gordon</td>\n",
1278 |        "      <td>Orlando Magic</td>\n",
1279 |        "      <td>2014-10-28</td>\n",
1280 |        "      <td>2014</td>\n",
1281 |        "      <td>10</td>\n",
1282 |        "      <td>3</td>\n",
1283 |        "      <td>5</td>\n",
1284 |        "      <td>1</td>\n",
1285 |        "      <td>1</td>\n",
1286 |        "      <td>Jump Shot</td>\n",
1287 |        "      <td>2</td>\n",
1288 |        "      <td>5</td>\n",
1289 |        "      <td>New Orleans Pelicans</td>\n",
1290 |        "      <td>-9</td>\n",
1291 |        "      <td>58</td>\n",
1292 |        "      <td>2</td>\n",
1293 |        "      <td>2.6</td>\n",
1294 |        "      <td>Asik, Omer</td>\n",
1295 |        "      <td>3.5</td>\n",
1296 |        "      <td>8.3</td>\n",
1297 |        "      <td>Paint</td>\n",
1298 |        "      <td>C</td>\n",
1299 |        "      <td>0.4011</td>\n",
1300 |        "      <td>13</td>\n",
1301 |        "      <td>21400001</td>\n",
1302 |        "      <td>381</td>\n",
1303 |        "      <td>203932</td>\n",
1304 |        "      <td>NOP</td>\n",
1305 |        "      <td>ORL</td>\n",
1306 |        "      <td>0</td>\n",
1307 |        "      <td>0</td>\n",
1308 |        "      <td>0</td>\n",
1309 |        "      <td>0</td>\n",
1310 |        "    </tr>\n",
1311 |        "    <tr>\n",
1312 |        "      <th>4</th>\n",
1313 |        "      <td>Aaron Gordon</td>\n",
1314 |        "      <td>Orlando Magic</td>\n",
1315 |        "      <td>2014-10-28</td>\n",
1316 |        "      <td>2014</td>\n",
1317 |        "      <td>10</td>\n",
1318 |        "      <td>4</td>\n",
1319 |        "      <td>5</td>\n",
1320 |        "      <td>58</td>\n",
1321 |        "      <td>0</td>\n",
1322 |        "      <td>Jump Shot</td>\n",
1323 |        "      <td>2</td>\n",
1324 |        "      <td>11</td>\n",
1325 |        "      <td>New Orleans Pelicans</td>\n",
1326 |        "      <td>46</td>\n",
1327 |        "      <td>105</td>\n",
1328 |        "      <td>7</td>\n",
1329 |        "      <td>6.2</td>\n",
1330 |        "      <td>Davis, Anthony</td>\n",
1331 |        "      <td>4.8</td>\n",
1332 |        "      <td>1.5</td>\n",
1333 |        "      <td>Paint</td>\n",
1334 |        "      <td>L</td>\n",
1335 |        "      <td>0.3841</td>\n",
1336 |        "      <td>13</td>\n",
1337 |        "      <td>21400001</td>\n",
1338 |        "      <td>524</td>\n",
1339 |        "      <td>203932</td>\n",
1340 |        "      <td>NOP</td>\n",
1341 |        "      <td>ORL</td>\n",
1342 |        "      <td>0</td>\n",
1343 |        "      <td>1</td>\n",
1344 |        "      <td>0</td>\n",
1345 |        "      <td>0</td>\n",
1346 |        "    </tr>\n",
1347 |        "  </tbody>\n",
1348 |        "</table>\n",
1349 |        "</div>"
1350 |       ],
1351 |       "text/plain": [
1352 |        "           name      team_name  game_date  season  team_id  period  \\\n",
1353 |        "0  Aaron Gordon  Orlando Magic 2014-10-28    2014       10       2   \n",
1354 |        "1  Aaron Gordon  Orlando Magic 2014-10-28    2014       10       2   \n",
1355 |        "2  Aaron Gordon  Orlando Magic 2014-10-28    2014       10       2   \n",
1356 |        "3  Aaron Gordon  Orlando Magic 2014-10-28    2014       10       3   \n",
1357 |        "4  Aaron Gordon  Orlando Magic 2014-10-28    2014       10       4   \n",
1358 |        "\n",
1359 |        "   minutes_remaining  seconds_remaining  shot_made_flag action_type  \\\n",
1360 |        "0                 11                 34               1   Jump Shot   \n",
1361 |        "1                  9                 13               1   Jump Shot   \n",
1362 |        "2                  2                 55               0   Jump Shot   \n",
1363 |        "3                  5                  1               1   Jump Shot   \n",
1364 |        "4                  5                 58               0   Jump Shot   \n",
1365 |        "\n",
1366 |        "   shot_type  shot_distance              opponent    x    y  dribbles  \\\n",
1367 |        "0          2              4  New Orleans Pelicans  -10   44         4   \n",
1368 |        "1          3             23  New Orleans Pelicans -233   20         0   \n",
1369 |        "2          3             23  New Orleans Pelicans -234    0         0   \n",
1370 |        "3          2              5  New Orleans Pelicans   -9   58         2   \n",
1371 |        "4          2             11  New Orleans Pelicans   46  105         7   \n",
1372 |        "\n",
1373 |        "   touch_time   defender_name  defender_distance  shot_clock shot_zone  \\\n",
1374 |        "0         5.1  Anderson, Ryan                3.9         0.6     Paint   \n",
1375 |        "1         0.7   Evans, Tyreke                4.3         7.4  Corner 3   \n",
1376 |        "2         0.9    Gordon, Eric               12.5        14.8  Corner 3   \n",
1377 |        "3         2.6      Asik, Omer                3.5         8.3     Paint   \n",
1378 |        "4         6.2  Davis, Anthony                4.8         1.5     Paint   \n",
1379 |        "\n",
1380 |        "  shot_area  lg_avg  opp_id   GAME_ID  GAME_EVENT_ID  PLAYER_ID  HTM  VTM  \\\n",
1381 |        "0         C  0.4011      13  21400001            164     203932  NOP  ORL   \n",
1382 |        "1         R  0.3915      13  21400001            198     203932  NOP  ORL   \n",
1383 |        "2         R  0.3915      13  21400001            275     203932  NOP  ORL   \n",
1384 |        "3         C  0.4011      13  21400001            381     203932  NOP  ORL   \n",
1385 |        "4         L  0.3841      13  21400001            524     203932  NOP  ORL   \n",
1386 |        "\n",
1387 |        "   is_home  prev_shot_made  prev_2_made  prev_3_made  \n",
1388 |        "0        0               0            0            0  \n",
1389 |        "1        0               1            0            0  \n",
1390 |        "2        0               1            1            0  \n",
1391 |        "3        0               0            0            0  \n",
1392 |        "4        0               1            0            0  "
1393 |       ]
1394 |      },
1395 |      "execution_count": 37,
1396 |      "metadata": {},
1397 |      "output_type": "execute_result"
1398 |     }
1399 |    ],
1400 |    "source": [
1401 |     "sorted_df.head()"
1402 |    ]
1403 |   },
1404 |   {
1405 |    "cell_type": "code",
1406 |    "execution_count": 39,
1407 |    "metadata": {},
1408 |    "outputs": [],
1409 |    "source": [
1410 |     "positions = stats[['Player','Pos','Age']]"
1411 |    ]
1412 |   },
1413 |   {
1414 |    "cell_type": "code",
1415 |    "execution_count": 46,
1416 |    "metadata": {},
1417 |    "outputs": [],
1418 |    "source": [
1419 |     "sorted_df = sorted_df.merge(positions, left_on='name', right_on='Player').drop(columns=['Player'])\n",
1420 |     "sorted_df.columns = map(str.lower, sorted_df.columns)"
1421 |    ]
1422 |   },
1423 |   {
1424 |    "cell_type": "code",
1425 |    "execution_count": 55,
1426 |    "metadata": {},
1427 |    "outputs": [],
1428 |    "source": [
1429 |     "#rearrange columns for better visibility\n",
1430 |     "sorted_df = sorted_df[['name','pos','age','player_id', 'team_name', 'team_id', 'game_date',\n",
1431 |     "       'game_id', 'game_event_id','season', 'period',\n",
1432 |     "       'minutes_remaining', 'seconds_remaining', 'shot_made_flag',\n",
1433 |     "       'action_type', 'shot_type', 'shot_distance', 'x', 'y',\n",
1434 |     "       'dribbles', 'touch_time', 'opponent', 'opp_id', 'defender_name', 'defender_distance',\n",
1435 |     "       'shot_clock', 'shot_zone', 'shot_area', 'lg_avg','htm', 'vtm',\n",
1436 |     "       'is_home', 'prev_shot_made', 'prev_2_made', 'prev_3_made']]"
1437 |    ]
1438 |   },
1439 |   {
1440 |    "cell_type": "code",
1441 |    "execution_count": 58,
1442 |    "metadata": {
1443 |     "scrolled": true
1444 |    },
1445 |    "outputs": [
1446 |     {
1447 |      "data": {
1448 |       "text/html": [
1449 |        "<div>\n",
1450 |        "<style scoped>\n",
1451 |        "    .dataframe tbody tr th:only-of-type {\n",
1452 |        "        vertical-align: middle;\n",
1453 |        "    }\n",
1454 |        "\n",
1455 |        "    .dataframe tbody tr th {\n",
1456 |        "        vertical-align: top;\n",
1457 |        "    }\n",
1458 |        "\n",
1459 |        "    .dataframe thead th {\n",
1460 |        "        text-align: right;\n",
1461 |        "    }\n",
1462 |        "</style>\n",
1463 |        "<table border=\"1\" class=\"dataframe\">\n",
1464 |        "  <thead>\n",
1465 |        "    <tr style=\"text-align: right;\">\n",
1466 |        "      <th></th>\n",
1467 |        "      <th>name</th>\n",
1468 |        "      <th>pos</th>\n",
1469 |        "      <th>age</th>\n",
1470 |        "      <th>player_id</th>\n",
1471 |        "      <th>team_name</th>\n",
1472 |        "      <th>team_id</th>\n",
1473 |        "      <th>game_date</th>\n",
1474 |        "      <th>game_id</th>\n",
1475 |        "      <th>game_event_id</th>\n",
1476 |        "      <th>season</th>\n",
1477 |        "      <th>period</th>\n",
1478 |        "      <th>minutes_remaining</th>\n",
1479 |        "      <th>seconds_remaining</th>\n",
1480 |        "      <th>shot_made_flag</th>\n",
1481 |        "      <th>action_type</th>\n",
1482 |        "      <th>shot_type</th>\n",
1483 |        "      <th>shot_distance</th>\n",
1484 |        "      <th>x</th>\n",
1485 |        "      <th>y</th>\n",
1486 |        "      <th>dribbles</th>\n",
1487 |        "      <th>touch_time</th>\n",
1488 |        "      <th>opponent</th>\n",
1489 |        "      <th>opp_id</th>\n",
1490 |        "      <th>defender_name</th>\n",
1491 |        "      <th>defender_distance</th>\n",
1492 |        "      <th>shot_clock</th>\n",
1493 |        "      <th>shot_zone</th>\n",
1494 |        "      <th>shot_area</th>\n",
1495 |        "      <th>lg_avg</th>\n",
1496 |        "      <th>htm</th>\n",
1497 |        "      <th>vtm</th>\n",
1498 |        "      <th>is_home</th>\n",
1499 |        "      <th>prev_shot_made</th>\n",
1500 |        "      <th>prev_2_made</th>\n",
1501 |        "      <th>prev_3_made</th>\n",
1502 |        "    </tr>\n",
1503 |        "  </thead>\n",
1504 |        "  <tbody>\n",
1505 |        "    <tr>\n",
1506 |        "      <th>205534</th>\n",
1507 |        "      <td>Vander Blue</td>\n",
1508 |        "      <td>SG</td>\n",
1509 |        "      <td>22</td>\n",
1510 |        "      <td>203505</td>\n",
1511 |        "      <td>Los Angeles Lakers</td>\n",
1512 |        "      <td>22</td>\n",
1513 |        "      <td>2015-04-15</td>\n",
1514 |        "      <td>21401230</td>\n",
1515 |        "      <td>508</td>\n",
1516 |        "      <td>2014</td>\n",
1517 |        "      <td>4</td>\n",
1518 |        "      <td>5</td>\n",
1519 |        "      <td>25</td>\n",
1520 |        "      <td>1</td>\n",
1521 |        "      <td>Turnaround Jump Shot</td>\n",
1522 |        "      <td>2</td>\n",
1523 |        "      <td>20</td>\n",
1524 |        "      <td>125</td>\n",
1525 |        "      <td>165</td>\n",
1526 |        "      <td>0</td>\n",
1527 |        "      <td>1.1</td>\n",
1528 |        "      <td>Sacramento Kings</td>\n",
1529 |        "      <td>11</td>\n",
1530 |        "      <td>Stockton, David</td>\n",
1531 |        "      <td>9.6</td>\n",
1532 |        "      <td>8.7</td>\n",
1533 |        "      <td>Mid Range</td>\n",
1534 |        "      <td>L</td>\n",
1535 |        "      <td>0.3925</td>\n",
1536 |        "      <td>LAL</td>\n",
1537 |        "      <td>SAC</td>\n",
1538 |        "      <td>1</td>\n",
1539 |        "      <td>0</td>\n",
1540 |        "      <td>0</td>\n",
1541 |        "      <td>0</td>\n",
1542 |        "    </tr>\n",
1543 |        "    <tr>\n",
1544 |        "      <th>205535</th>\n",
1545 |        "      <td>Vander Blue</td>\n",
1546 |        "      <td>SG</td>\n",
1547 |        "      <td>22</td>\n",
1548 |        "      <td>203505</td>\n",
1549 |        "      <td>Los Angeles Lakers</td>\n",
1550 |        "      <td>22</td>\n",
1551 |        "      <td>2015-04-15</td>\n",
1552 |        "      <td>21401230</td>\n",
1553 |        "      <td>521</td>\n",
1554 |        "      <td>2014</td>\n",
1555 |        "      <td>4</td>\n",
1556 |        "      <td>4</td>\n",
1557 |        "      <td>4</td>\n",
1558 |        "      <td>0</td>\n",
1559 |        "      <td>Jump Shot</td>\n",
1560 |        "      <td>2</td>\n",
1561 |        "      <td>16</td>\n",
1562 |        "      <td>109</td>\n",
1563 |        "      <td>126</td>\n",
1564 |        "      <td>10</td>\n",
1565 |        "      <td>9.3</td>\n",
1566 |        "      <td>Sacramento Kings</td>\n",
1567 |        "      <td>11</td>\n",
1568 |        "      <td>Stockton, David</td>\n",
1569 |        "      <td>3.1</td>\n",
1570 |        "      <td>12.7</td>\n",
1571 |        "      <td>Mid Range</td>\n",
1572 |        "      <td>L</td>\n",
1573 |        "      <td>0.3925</td>\n",
1574 |        "      <td>LAL</td>\n",
1575 |        "      <td>SAC</td>\n",
1576 |        "      <td>1</td>\n",
1577 |        "      <td>1</td>\n",
1578 |        "      <td>0</td>\n",
1579 |        "      <td>0</td>\n",
1580 |        "    </tr>\n",
1581 |        "    <tr>\n",
1582 |        "      <th>205536</th>\n",
1583 |        "      <td>Vander Blue</td>\n",
1584 |        "      <td>SG</td>\n",
1585 |        "      <td>22</td>\n",
1586 |        "      <td>203505</td>\n",
1587 |        "      <td>Los Angeles Lakers</td>\n",
1588 |        "      <td>22</td>\n",
1589 |        "      <td>2015-04-15</td>\n",
1590 |        "      <td>21401230</td>\n",
1591 |        "      <td>565</td>\n",
1592 |        "      <td>2014</td>\n",
1593 |        "      <td>4</td>\n",
1594 |        "      <td>1</td>\n",
1595 |        "      <td>8</td>\n",
1596 |        "      <td>0</td>\n",
1597 |        "      <td>Running Jump Shot</td>\n",
1598 |        "      <td>2</td>\n",
1599 |        "      <td>16</td>\n",
1600 |        "      <td>51</td>\n",
1601 |        "      <td>154</td>\n",
1602 |        "      <td>7</td>\n",
1603 |        "      <td>7.9</td>\n",
1604 |        "      <td>Sacramento Kings</td>\n",
1605 |        "      <td>11</td>\n",
1606 |        "      <td>Stockton, David</td>\n",
1607 |        "      <td>1.4</td>\n",
1608 |        "      <td>14.2</td>\n",
1609 |        "      <td>Mid Range</td>\n",
1610 |        "      <td>C</td>\n",
1611 |        "      <td>0.3994</td>\n",
1612 |        "      <td>LAL</td>\n",
1613 |        "      <td>SAC</td>\n",
1614 |        "      <td>1</td>\n",
1615 |        "      <td>0</td>\n",
1616 |        "      <td>0</td>\n",
1617 |        "      <td>0</td>\n",
1618 |        "    </tr>\n",
1619 |        "    <tr>\n",
1620 |        "      <th>205537</th>\n",
1621 |        "      <td>Jamaal Franklin</td>\n",
1622 |        "      <td>SG</td>\n",
1623 |        "      <td>23</td>\n",
1624 |        "      <td>203479</td>\n",
1625 |        "      <td>Denver Nuggets</td>\n",
1626 |        "      <td>19</td>\n",
1627 |        "      <td>2015-04-15</td>\n",
1628 |        "      <td>21401229</td>\n",
1629 |        "      <td>500</td>\n",
1630 |        "      <td>2014</td>\n",
1631 |        "      <td>4</td>\n",
1632 |        "      <td>5</td>\n",
1633 |        "      <td>33</td>\n",
1634 |        "      <td>1</td>\n",
1635 |        "      <td>Pullup Jump shot</td>\n",
1636 |        "      <td>3</td>\n",
1637 |        "      <td>26</td>\n",
1638 |        "      <td>59</td>\n",
1639 |        "      <td>257</td>\n",
1640 |        "      <td>1</td>\n",
1641 |        "      <td>2.7</td>\n",
1642 |        "      <td>Golden State Warriors</td>\n",
1643 |        "      <td>15</td>\n",
1644 |        "      <td>Livingston, Shaun</td>\n",
1645 |        "      <td>3.5</td>\n",
1646 |        "      <td>14.0</td>\n",
1647 |        "      <td>Above Break 3</td>\n",
1648 |        "      <td>C</td>\n",
1649 |        "      <td>0.3415</td>\n",
1650 |        "      <td>GSW</td>\n",
1651 |        "      <td>DEN</td>\n",
1652 |        "      <td>0</td>\n",
1653 |        "      <td>0</td>\n",
1654 |        "      <td>0</td>\n",
1655 |        "      <td>0</td>\n",
1656 |        "    </tr>\n",
1657 |        "    <tr>\n",
1658 |        "      <th>205538</th>\n",
1659 |        "      <td>Jamaal Franklin</td>\n",
1660 |        "      <td>SG</td>\n",
1661 |        "      <td>23</td>\n",
1662 |        "      <td>203479</td>\n",
1663 |        "      <td>Denver Nuggets</td>\n",
1664 |        "      <td>19</td>\n",
1665 |        "      <td>2015-04-15</td>\n",
1666 |        "      <td>21401229</td>\n",
1667 |        "      <td>563</td>\n",
1668 |        "      <td>2014</td>\n",
1669 |        "      <td>4</td>\n",
1670 |        "      <td>2</td>\n",
1671 |        "      <td>8</td>\n",
1672 |        "      <td>0</td>\n",
1673 |        "      <td>Pullup Jump shot</td>\n",
1674 |        "      <td>3</td>\n",
1675 |        "      <td>26</td>\n",
1676 |        "      <td>-72</td>\n",
1677 |        "      <td>252</td>\n",
1678 |        "      <td>1</td>\n",
1679 |        "      <td>1.9</td>\n",
1680 |        "      <td>Golden State Warriors</td>\n",
1681 |        "      <td>15</td>\n",
1682 |        "      <td>Rush, Brandon</td>\n",
1683 |        "      <td>4.2</td>\n",
1684 |        "      <td>11.8</td>\n",
1685 |        "      <td>Above Break 3</td>\n",
1686 |        "      <td>C</td>\n",
1687 |        "      <td>0.3415</td>\n",
1688 |        "      <td>GSW</td>\n",
1689 |        "      <td>DEN</td>\n",
1690 |        "      <td>0</td>\n",
1691 |        "      <td>1</td>\n",
1692 |        "      <td>0</td>\n",
1693 |        "      <td>0</td>\n",
1694 |        "    </tr>\n",
1695 |        "  </tbody>\n",
1696 |        "</table>\n",
1697 |        "</div>"
1698 |       ],
1699 |       "text/plain": [
1700 |        "                   name pos  age  player_id           team_name  team_id  \\\n",
1701 |        "205534      Vander Blue  SG   22     203505  Los Angeles Lakers       22   \n",
1702 |        "205535      Vander Blue  SG   22     203505  Los Angeles Lakers       22   \n",
1703 |        "205536      Vander Blue  SG   22     203505  Los Angeles Lakers       22   \n",
1704 |        "205537  Jamaal Franklin  SG   23     203479      Denver Nuggets       19   \n",
1705 |        "205538  Jamaal Franklin  SG   23     203479      Denver Nuggets       19   \n",
1706 |        "\n",
1707 |        "        game_date   game_id  game_event_id  season  period  minutes_remaining  \\\n",
1708 |        "205534 2015-04-15  21401230            508    2014       4                  5   \n",
1709 |        "205535 2015-04-15  21401230            521    2014       4                  4   \n",
1710 |        "205536 2015-04-15  21401230            565    2014       4                  1   \n",
1711 |        "205537 2015-04-15  21401229            500    2014       4                  5   \n",
1712 |        "205538 2015-04-15  21401229            563    2014       4                  2   \n",
1713 |        "\n",
1714 |        "        seconds_remaining  shot_made_flag           action_type  shot_type  \\\n",
1715 |        "205534                 25               1  Turnaround Jump Shot          2   \n",
1716 |        "205535                  4               0             Jump Shot          2   \n",
1717 |        "205536                  8               0     Running Jump Shot          2   \n",
1718 |        "205537                 33               1      Pullup Jump shot          3   \n",
1719 |        "205538                  8               0      Pullup Jump shot          3   \n",
1720 |        "\n",
1721 |        "        shot_distance    x    y  dribbles  touch_time               opponent  \\\n",
1722 |        "205534             20  125  165         0         1.1       Sacramento Kings   \n",
1723 |        "205535             16  109  126        10         9.3       Sacramento Kings   \n",
1724 |        "205536             16   51  154         7         7.9       Sacramento Kings   \n",
1725 |        "205537             26   59  257         1         2.7  Golden State Warriors   \n",
1726 |        "205538             26  -72  252         1         1.9  Golden State Warriors   \n",
1727 |        "\n",
1728 |        "        opp_id      defender_name  defender_distance  shot_clock  \\\n",
1729 |        "205534      11    Stockton, David                9.6         8.7   \n",
1730 |        "205535      11    Stockton, David                3.1        12.7   \n",
1731 |        "205536      11    Stockton, David                1.4        14.2   \n",
1732 |        "205537      15  Livingston, Shaun                3.5        14.0   \n",
1733 |        "205538      15      Rush, Brandon                4.2        11.8   \n",
1734 |        "\n",
1735 |        "            shot_zone shot_area  lg_avg  htm  vtm  is_home  prev_shot_made  \\\n",
1736 |        "205534      Mid Range         L  0.3925  LAL  SAC        1               0   \n",
1737 |        "205535      Mid Range         L  0.3925  LAL  SAC        1               1   \n",
1738 |        "205536      Mid Range         C  0.3994  LAL  SAC        1               0   \n",
1739 |        "205537  Above Break 3         C  0.3415  GSW  DEN        0               0   \n",
1740 |        "205538  Above Break 3         C  0.3415  GSW  DEN        0               1   \n",
1741 |        "\n",
1742 |        "        prev_2_made  prev_3_made  \n",
1743 |        "205534            0            0  \n",
1744 |        "205535            0            0  \n",
1745 |        "205536            0            0  \n",
1746 |        "205537            0            0  \n",
1747 |        "205538            0            0  "
1748 |       ]
1749 |      },
1750 |      "execution_count": 58,
1751 |      "metadata": {},
1752 |      "output_type": "execute_result"
1753 |     }
1754 |    ],
1755 |    "source": [
1756 |     "sorted_df.tail()"
1757 |    ]
1758 |   },
1759 |   {
1760 |    "cell_type": "markdown",
1761 |    "metadata": {},
1762 |    "source": [
1763 |     "## Final cleaning and export"
1764 |    ]
1765 |   },
1766 |   {
1767 |    "cell_type": "code",
1768 |    "execution_count": 75,
1769 |    "metadata": {},
1770 |    "outputs": [],
1771 |    "source": [
1772 |     "#clean positions down to 5 standard positions (no combos)\n",
1773 |     "sorted_df.pos[sorted_df.name=='Giannis Antetokounmpo'] = 'SF'\n",
1774 |     "\n",
1775 |     "sorted_df.pos[sorted_df.pos=='PG-SG']='SG'\n",
1776 |     "sorted_df.pos[sorted_df.pos=='SF-SG'] = 'SF'\n",
1777 |     "sorted_df.pos[sorted_df.pos=='SG-PG'] = 'PG'\n",
1778 |     "sorted_df.pos[sorted_df.pos=='PF-SF'] = 'SF'\n",
1779 |     "sorted_df.pos[sorted_df.pos=='SF-PF'] = 'PF'\n",
1780 |     "sorted_df.pos[sorted_df.pos=='SG-SF'] = 'SF'\n",
1781 |     "\n"
1782 |    ]
1783 |   },
1784 |   {
1785 |    "cell_type": "code",
1786 |    "execution_count": null,
1787 |    "metadata": {},
1788 |    "outputs": [],
1789 |    "source": []
1790 |   },
1791 |   {
1792 |    "cell_type": "code",
1793 |    "execution_count": 493,
1794 |    "metadata": {},
1795 |    "outputs": [],
1796 |    "source": [
1797 |     "# players \n",
1798 |     "    # name | team | \n",
1799 |     "# shots \n",
1800 |     "    # |player_id| zone name| area| made? \n",
1801 |     "    # \n",
1802 |     "# def player_shots()    \n",
1803 |     "    # shots[shots[player_id] == id]\n",
1804 |     "    \n",
1805 |     "# def shots_by_zone(shots):\n",
1806 |     "# \"\"\" first zone\"\"\"\n",
1807 |     "#     returns {'2' = [[], , {}]}\n",
1808 |     "\n",
1809 |     "# shots = player_shots('bob koozie')\n",
1810 |     "# shots_by_zone(shots)\n",
1811 |     "\n",
1812 |     "# iterate through every player \n",
1813 |     "# retrieve each player's shots\n",
1814 |     "# for each zone\n",
1815 |     "# retrieve shots taken\n",
1816 |     "# retrieve shots scored \n",
1817 |     "# \n",
1818 |     "\n",
1819 |     "def get_fg_pct_by_player_for_each_zone(df):\n",
1820 |     "    start = time.time()\n",
1821 |     "    player_names = list(df.name.unique())\n",
1822 |     "    df_list = []\n",
1823 |     "\n",
1824 |     "    for c, player in enumerate(player_names):\n",
1825 |     "        df_ = df[df.name==player].reset_index(drop=True)\n",
1826 |     "        shot_arr = np.zeros((len(df_),26))\n",
1827 |     "\n",
1828 |     "        if (c+1)%100==0:\n",
1829 |     "            print('Runtime: {} seconds. {} of {} players completed.'.format(round(time.time()-start,2), c+1, len(player_names)))\n",
1830 |     "        for index, row in df_.iterrows():\n",
1831 |     "            if index != 0:\n",
1832 |     "                shot_arr[index,:] = shot_arr[index-1,:]\n",
1833 |     "            if row.shot_zone=='Mid Range':\n",
1834 |     "                if row.shot_area=='R':\n",
1835 |     "                    if row.shot_made_flag==1:\n",
1836 |     "                        shot_arr[index,0:2]+=[1,1]\n",
1837 |     "                    else:\n",
1838 |     "                        shot_arr[index,0:2]+=[0,1]\n",
1839 |     "                elif row.shot_area=='C':\n",
1840 |     "                    if row.shot_made_flag==1:\n",
1841 |     "                        shot_arr[index,2:4]+=[1,1]\n",
1842 |     "                    else:\n",
1843 |     "                        shot_arr[index,2:4]+=[0,1]\n",
1844 |     "                else:\n",
1845 |     "                    if row.shot_made_flag==1:\n",
1846 |     "                        shot_arr[index,4:6]+=[1,1]\n",
1847 |     "                    else:\n",
1848 |     "                        shot_arr[index,4:6]+=[0,1]\n",
1849 |     "            elif row.shot_zone=='Restricted Area':\n",
1850 |     "                if row.shot_made_flag==1:\n",
1851 |     "                    shot_arr[index,6:8]+=[1,1]\n",
1852 |     "                else:\n",
1853 |     "                    shot_arr[index,6:8]+=[0,1]\n",
1854 |     "            elif row.shot_zone=='Heave':\n",
1855 |     "                if row.shot_made_flag==1:\n",
1856 |     "                    shot_arr[index,8:10]+=[1,1]\n",
1857 |     "                else:\n",
1858 |     "                    shot_arr[index,8:10]+=[0,1]\n",
1859 |     "            elif row.shot_zone=='Above Break 3':\n",
1860 |     "                if row.shot_area=='R':\n",
1861 |     "                    if row.shot_made_flag==1:\n",
1862 |     "                        shot_arr[index,10:12]+=[1,1]\n",
1863 |     "                    else:\n",
1864 |     "                        shot_arr[index,10:12]+=[0,1]\n",
1865 |     "                elif row.shot_area=='C':\n",
1866 |     "                    if row.shot_made_flag==1:\n",
1867 |     "                        shot_arr[index,12:14]+=[1,1]\n",
1868 |     "                    else:\n",
1869 |     "                        shot_arr[index,12:14]+=[0,1]\n",
1870 |     "                else:\n",
1871 |     "                    if row.shot_made_flag==1:\n",
1872 |     "                        shot_arr[index,14:16]+=[1,1]\n",
1873 |     "                    else:\n",
1874 |     "                        shot_arr[index,14:16]+=[0,1]\n",
1875 |     "            elif row.shot_zone=='Paint':\n",
1876 |     "                if row.shot_area=='R':\n",
1877 |     "                    if row.shot_made_flag==1:\n",
1878 |     "                        shot_arr[index,16:18]+=[1,1]\n",
1879 |     "                    else:\n",
1880 |     "                        shot_arr[index,16:18]+=[0,1]\n",
1881 |     "                elif row.shot_area=='C':\n",
1882 |     "                    if row.shot_made_flag==1:\n",
1883 |     "                        shot_arr[index,18:20]+=[1,1]\n",
1884 |     "                    else:\n",
1885 |     "                        shot_arr[index,18:20]+=[0,1]\n",
1886 |     "                else:\n",
1887 |     "                    if row.shot_made_flag==1:\n",
1888 |     "                        shot_arr[index,20:22]+=[1,1]\n",
1889 |     "                    else:\n",
1890 |     "                        shot_arr[index,20:22]+=[0,1]\n",
1891 |     "            elif row.shot_zone=='Corner 3':\n",
1892 |     "                if row.shot_area=='R':\n",
1893 |     "                    if row.shot_made_flag==1:\n",
1894 |     "                        shot_arr[index,22:24]+=[1,1]\n",
1895 |     "                    else:\n",
1896 |     "                        shot_arr[index,22:24]+=[0,1]\n",
1897 |     "                else:\n",
1898 |     "                    if row.shot_made_flag==1:\n",
1899 |     "                        shot_arr[index,24:26]+=[1,1]\n",
1900 |     "                    else:\n",
1901 |     "                        shot_arr[index,24:26]+=[0,1]\n",
1902 |     "\n",
1903 |     "        df_list.append(pd.DataFrame(shot_arr,index=df_.name))\n",
1904 |     "\n",
1905 |     "    print('Total Runtime: {} seconds.'.format(round(time.time()-start,2),\n",
1906 |     "                                              c, len(player_names)))\n",
1907 |     "    return df_list"
1908 |    ]
1909 |   },
1910 |   {
1911 |    "cell_type": "code",
1912 |    "execution_count": 574,
1913 |    "metadata": {},
1914 |    "outputs": [],
1915 |    "source": [
1916 |     "def add_zone_fg_pct_to_df(df):\n",
1917 |     "    \"\"\"Return df with one field-goal-percentage column per shot zone/area appended.\"\"\"\n",
1918 |     "    df_list = get_fg_pct_by_player_for_each_zone(df)\n",
1919 |     "    zone_df = pd.concat(df_list)\n",
1920 |     "    column_names = ['mid_R_pct', 'mid_C_pct', 'mid_L_pct', 'restricted_pct', 'heave_pct', 'ab_3_R_pct', 'ab_3_C_pct',\n",
1921 |     "                'ab_3_L_pct', 'paint_R_pct', 'paint_C_pct', 'paint_L_pct', 'corner_3_R_pct', 'corner_3_L_pct',]\n",
1922 |     "    # integer columns 2*i and 2*i+1 hold the (made, attempted) counts behind column_names[i]\n",
1923 |     "    for i, col in enumerate(column_names):\n",
1924 |     "        zone_df[col] = np.round(zone_df[2*i]/zone_df[2*i+1],4)\n",
1925 |     "\n",
1926 |     "    zone_df = zone_df.drop(columns=list(range(2*len(column_names)))).reset_index().rename(columns={\n",
1927 |     "                                                                    'name':'player_name'})\n",
1928 |     "    # join onto the df argument rather than the notebook-global sorted_df, so the function\n",
1929 |     "    # works for any input; NOTE(review): axis=1 aligns on index, assumes df has a 0..n-1 index\n",
1930 |     "    zone_fg_df = pd.concat((df,zone_df),axis=1)\n",
1931 |     "\n",
1932 |     "    return zone_fg_df.drop(columns=['player_name'])\n"
1933 |    ]
1934 |   },
1935 |   {
1936 |    "cell_type": "code",
1937 |    "execution_count": 575,
1938 |    "metadata": {
1939 |     "scrolled": true
1940 |    },
1941 |    "outputs": [
1942 |     {
1943 |      "name": "stdout",
1944 |      "output_type": "stream",
1945 |      "text": [
1946 |       "Runtime: 13.34 seconds. 100 of 490 players completed.\n",
1947 |       "Runtime: 27.03 seconds. 200 of 490 players completed.\n",
1948 |       "Runtime: 39.58 seconds. 300 of 490 players completed.\n",
1949 |       "Runtime: 47.93 seconds. 400 of 490 players completed.\n",
1950 |       "Total Runtime: 51.96 seconds.\n"
1951 |      ]
1952 |     }
1953 |    ],
1954 |    "source": [
1955 |     "zone_fg_df = add_zone_fg_pct_to_df(sorted_df)"
1956 |    ]
1957 |   },
1958 |   {
1959 |    "cell_type": "code",
1960 |    "execution_count": 581,
1961 |    "metadata": {},
1962 |    "outputs": [
1963 |     {
1964 |      "data": {
1965 |       "text/plain": [
1966 |        "name                 0\n",
1967 |        "pos                  0\n",
1968 |        "age                  0\n",
1969 |        "player_id            0\n",
1970 |        "team_name            0\n",
1971 |        "team_id              0\n",
1972 |        "game_date            0\n",
1973 |        "game_id              0\n",
1974 |        "game_event_id        0\n",
1975 |        "season               0\n",
1976 |        "period               0\n",
1977 |        "minutes_remaining    0\n",
1978 |        "seconds_remaining    0\n",
1979 |        "shot_made_flag       0\n",
1980 |        "action_type          0\n",
1981 |        "shot_type            0\n",
1982 |        "shot_distance        0\n",
1983 |        "x                    0\n",
1984 |        "y                    0\n",
1985 |        "dribbles             0\n",
1986 |        "touch_time           0\n",
1987 |        "opponent             0\n",
1988 |        "opp_id               0\n",
1989 |        "defender_name        0\n",
1990 |        "defender_distance    0\n",
1991 |        "shot_clock           0\n",
1992 |        "shot_zone            0\n",
1993 |        "shot_area            0\n",
1994 |        "lg_avg               0\n",
1995 |        "htm                  0\n",
1996 |        "vtm                  0\n",
1997 |        "is_home              0\n",
1998 |        "prev_shot_made       0\n",
1999 |        "prev_2_made          0\n",
2000 |        "prev_3_made          0\n",
2001 |        "mid_R_pct            0\n",
2002 |        "mid_C_pct            0\n",
2003 |        "mid_L_pct            0\n",
2004 |        "restricted_pct       0\n",
2005 |        "heave_pct            0\n",
2006 |        "ab_3_R_pct           0\n",
2007 |        "ab_3_C_pct           0\n",
2008 |        "ab_3_L_pct           0\n",
2009 |        "paint_R_pct          0\n",
2010 |        "paint_C_pct          0\n",
2011 |        "paint_L_pct          0\n",
2012 |        "corner_3_R_pct       0\n",
2013 |        "corner_3_L_pct       0\n",
2014 |        "dtype: int64"
2015 |       ]
2016 |      },
2017 |      "execution_count": 581,
2018 |      "metadata": {},
2019 |      "output_type": "execute_result"
2020 |     }
2021 |    ],
2022 |    "source": [
2023 |     "#fill null values with 0\n",
2024 |     "zone_fg_df = zone_fg_df.fillna(value=0)"
2025 |    ]
2026 |   },
2027 |   {
2028 |    "cell_type": "code",
2029 |    "execution_count": 582,
2030 |    "metadata": {},
2031 |    "outputs": [],
2032 |    "source": [
2033 |     "#export as csv\n",
2034 |     "zone_fg_df.to_csv('./data/sorted_df_14_15.csv')"
2035 |    ]
2036 |   },
2037 |   {
2038 |    "cell_type": "code",
2039 |    "execution_count": null,
2040 |    "metadata": {},
2041 |    "outputs": [],
2042 |    "source": []
2043 |   }
2044 |  ],
2045 |  "metadata": {
2046 |   "extensions": {
2047 |    "jupyter_dashboards": {
2048 |     "activeView": "grid_default",
2049 |     "version": 1,
2050 |     "views": {
2051 |      "grid_default": {
2052 |       "cellMargin": 10,
2053 |       "defaultCellHeight": 20,
2054 |       "maxColumns": 12,
2055 |       "name": "grid",
2056 |       "type": "grid"
2057 |      },
2058 |      "report_default": {
2059 |       "name": "report",
2060 |       "type": "report"
2061 |      }
2062 |     }
2063 |    }
2064 |   },
2065 |   "kernelspec": {
2066 |    "display_name": "Python 3",
2067 |    "language": "python",
2068 |    "name": "python3"
2069 |   },
2070 |   "language_info": {
2071 |    "codemirror_mode": {
2072 |     "name": "ipython",
2073 |     "version": 3
2074 |    },
2075 |    "file_extension": ".py",
2076 |    "mimetype": "text/x-python",
2077 |    "name": "python",
2078 |    "nbconvert_exporter": "python",
2079 |    "pygments_lexer": "ipython3",
2080 |    "version": "3.6.5"
2081 |   }
2082 |  },
2083 |  "nbformat": 4,
2084 |  "nbformat_minor": 2
2085 | }
2086 | 


--------------------------------------------------------------------------------