├── predict_scores.py
├── README.md
├── train_models.py
├── urls_to_scrape.txt
├── daily_data_transform.py
└── data_modeling_pt1.py

/predict_scores.py:
--------------------------------------------------------------------------------
"""
The goal of this file is to use the trained model from the previous Sunday
to predict the nightly fantasy score of all the players playing that night.
This file is a work in progress...only a shell is written here so far.
"""

### IMPORTS ###
import pandas as pd
import numpy as np
import pickle

# load our pickled model
# NOTE: 'model_final.pkl' is a placeholder filename; train_models.py will need
# to save its fitted model there (or wherever we settle on) every Sunday
with open('model_final.pkl', 'rb') as f:
    model_final = pickle.load(f)

# load our data for the night
# 'data' will be the transformed dataframe from daily_data_transform.py and
# 'features' the same feature list used in train_models.py
X = data[features]
y = data['fanduel_score']

# predict -- predict() takes only the feature matrix; y is kept so the
# predictions can be compared with the actual scores after the games are played
predictions = model_final.predict(X)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Description
This repo is dedicated to my love of fantasy sports! It contains my source code
for predicting player performance. The goal is to provide daily lineups that are
most likely to win money in FanDuel daily fantasy contests. More to come.

### Steps
1. daily_data_transform.py transforms the incoming CSV data file every day.
   The data needs to be transformed before predictions can be made with it.

2. train_models.py trains the model. The model will be retrained
   and re-evaluated weekly.

3. predict_scores.py predicts the FanDuel scores for that day.
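
As a rough sketch of how I expect these pieces to fit together once they are
wired up (the file names, paths, and the simplified `features` list below are
placeholders, not things the scripts define yet):

```python
import pickle
import pandas as pd
from sklearn.svm import SVR

# weekly (Sunday): fit on the transformed historical data and save the model
history = pd.read_csv('transformed_history.csv')       # placeholder path
features = [c for c in history.columns if c != 'fanduel_score']
model = SVR(kernel='rbf', C=0.6)
model.fit(history[features], history['fanduel_score'])
with open('model_final.pkl', 'wb') as f:
    pickle.dump(model, f)

# nightly: load the saved model and score that night's transformed slate
tonight = pd.read_csv('tonight_transformed.csv')        # placeholder path
with open('model_final.pkl', 'rb') as f:
    model = pickle.load(f)
tonight['predicted_fanduel_score'] = model.predict(tonight[features])
```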

### Other
data_modeling_pt1.py is my initial exploration of the models.
--------------------------------------------------------------------------------
/train_models.py:
--------------------------------------------------------------------------------
"""
The goal of this file is to train the optimal model for predicting FanDuel
scores. This file should be run once a week, on a Sunday. All games the
following week will use this model to predict FanDuel scores.
"""

### IMPORTS ###
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cross_validation import cross_val_score

# data will come from the 'daily_data_transform.py' file
# the dataframe will be called 'data'
# (this assumes that script will also add the past_N rolling-average columns
# used in the feature list below)
from daily_data_transform import data

features = ['past_1', 'past_3', 'past_5', 'past_10', 'MIN', 'position_C',
            u'position_PF', u'position_PG', u'position_SF', u'position_SG',
            u'position_Unknown', 'TO', 'PF', 'venue_H', 'venue_R',
            u'Opponent_Bos', u'Opponent_Bro', u'Opponent_Cha', u'Opponent_Chi',
            u'Opponent_Cle', u'Opponent_Dal', u'Opponent_Den', u'Opponent_Det',
            u'Opponent_Gol', u'Opponent_Hou', u'Opponent_Ind', u'Opponent_Lac',
            u'Opponent_Lal', u'Opponent_Mem', u'Opponent_Mia', u'Opponent_Mil',
            u'Opponent_Min', u'Opponent_Nor', u'Opponent_Nyk', u'Opponent_Okc',
            u'Opponent_Orl', u'Opponent_Phi', u'Opponent_Pho', u'Opponent_Por',
            u'Opponent_Sac', u'Opponent_San', u'Opponent_Tor', u'Opponent_Uta',
            u'Opponent_Was']

# using cross-validation to evaluate our model
X = data[features]
y = data['fanduel_score']

# gamma=0 tells this (older) version of scikit-learn to fall back to 1/n_features
model = SVR(kernel='rbf', gamma=0, C=.6, epsilon=0)
scores = cross_val_score(model, X, y, cv=5)
# cross_val_score returns R^2 for a regressor, not accuracy
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# now training our full model
model_final = SVR(kernel='rbf', gamma=0, C=.6, epsilon=0)
model_final.fit(X, y)
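
# A sketch of how the fitted model could be handed off to predict_scores.py,
# which expects to load a pickled model. The 'model_final.pkl' filename is a
# placeholder until we settle on where the weekly artifacts live.
import pickle
with open('model_final.pkl', 'wb') as f:
    pickle.dump(model_final, f)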
--------------------------------------------------------------------------------
/urls_to_scrape.txt:
--------------------------------------------------------------------------------

url_to_scrape = http://stats.nba.com/stats/teamdashlineups?Season=2014-15&SeasonType=Regular+Season&TeamID=1610612752&MeasureType=Base&PerMode=PerGame&PlusMinus=N&PaceAdjust=N&Rank=N&Outcome=&Location=&Month=0&SeasonSegment=&DateFrom=&DateTo=&OpponentTeamID=0&VsConference=&VsDivision=&GameSegment=&Period=0&LastNGames=0&GroupQuantity=5&GameScope=&GameID=

TeamID=1610612737|Atlanta Hawks
TeamID=1610612738|Boston Celtics
TeamID=1610612739|Cleveland Cavaliers
TeamID=1610612740|New Orleans Pelicans
TeamID=1610612741|Chicago Bulls
TeamID=1610612742|Dallas Mavericks
TeamID=1610612743|Denver Nuggets
TeamID=1610612744|Golden State Warriors
TeamID=1610612745|Houston Rockets
TeamID=1610612746|Los Angeles Clippers
TeamID=1610612747|Los Angeles Lakers
TeamID=1610612748|Miami Heat
TeamID=1610612749|Milwaukee Bucks
TeamID=1610612750|Minnesota Timberwolves
TeamID=1610612751|Brooklyn Nets
TeamID=1610612752|New York Knicks
TeamID=1610612753|Orlando Magic
TeamID=1610612754|Indiana Pacers
TeamID=1610612755|Philadelphia 76ers
TeamID=1610612756|Phoenix Suns
TeamID=1610612757|Portland Trail Blazers
TeamID=1610612758|Sacramento Kings
TeamID=1610612759|San Antonio Spurs
TeamID=1610612760|Oklahoma City Thunder
TeamID=1610612761|Toronto Raptors
TeamID=1610612762|Utah Jazz
TeamID=1610612763|Memphis Grizzlies
TeamID=1610612764|Washington Wizards
TeamID=1610612765|Detroit Pistons
TeamID=1610612766|Charlotte Hornets

http://stats.nba.com/gameDetail.html?GameID=400578735
https://stats.nba.com/stats/boxscoresummaryv2?GameID=0021400443

http://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0021400443&RangeType=2&Season=2014-15&SeasonType=Regular+Season&StartPeriod=1&StartRange=0
--------------------------------------------------------------------------------
/daily_data_transform.py:
--------------------------------------------------------------------------------
"""
This file transforms the historical data from 2012-2014, as well as the files
I will be receiving daily. Both the daily input files and, for now, the
historical data come from CSV files, but I would like to move the historical
data over to Postgres.
"""

### IMPORTS ###
import pandas as pd
import numpy as np

### HISTORICAL TRANSFORM ###
df = pd.read_csv('2012-2014-Table 1.csv')
pd.set_option('display.max_columns', 100)

# create the FanDuel score, a weighted sum of the box-score stats:
#   points    = 1.0x
#   rebounds  = 1.2x
#   assists   = 1.5x
#   blocks    = 2.0x
#   steals    = 2.0x
#   turnovers = -1.0x
df['fanduel_score'] = df.apply(lambda x: x['PTS'] + x['TOT']*1.2 + x['A']*1.5 +
                               x['BL']*2 + x['ST']*2 - x['TO'], axis=1)

# need to sort the data (by player and game date) first...come up with a robust
# way to do this

# create moving averages
def add_moving_averages(column, period):
    """
    this function creates a moving average for a column over a specified
    number of games

    column: the name of the column to average (passed as a string)
    period: the number of games to include in the moving average
    """
    column = str(column)
    new_column_name = "mavg_" + column
    period = int(period)
    df[new_column_name] = pd.rolling_mean(df[column], period)

# create dummies for teams, home vs away, and position
home_dummies = pd.get_dummies(df.VENUE, prefix='venue')
position_dummies = pd.get_dummies(df.Position, prefix='position')
team_dummies = pd.get_dummies(df.OPP_TEAM, prefix='Opponent')

# combining them all together to form new data set
data = pd.concat([df, home_dummies, position_dummies, team_dummies], axis=1)
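
# Example of how the helper above is meant to be called -- the column choices
# here are just illustrations. Note that pd.rolling_mean averages the current
# row together with the previous (period - 1) rows, which is another reason the
# frame needs to be sorted (and grouped by player) first. In the real pipeline
# these calls would go before the pd.concat above so the new columns end up in
# 'data'.
add_moving_averages('PTS', 5)    # adds df['mavg_PTS']
add_moving_averages('MIN', 10)   # adds df['mavg_MIN']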
--------------------------------------------------------------------------------
/data_modeling_pt1.py:
--------------------------------------------------------------------------------
"""
The goal of this file is to document how I predict FanDuel scores using only
basic data from nbastuffer.com. The data set has only been augmented slightly:
I changed column names to be more readable and added player positions, which I
found on nba.com.
"""

### IMPORTS ###
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split

### DATA PREP ###
df = pd.read_csv('2012-2014-Table 1.csv')
pd.set_option('display.max_columns', 100)
# %matplotlib inline  # (IPython magic -- only needed when exploring in a notebook)

# adding fanduel score
df['fanduel_score'] = df.apply(lambda x: x['PTS'] + x['TOT']*1.2 + x['A']*1.5 +
                               x['BL']*2 + x['ST']*2 - x['TO'], axis=1)

# creating dummy columns
home_dummies = pd.get_dummies(df.VENUE, prefix='venue')
position_dummies = pd.get_dummies(df.Position, prefix='position')
team_dummies = pd.get_dummies(df.OPP_TEAM, prefix='Opponent')

# combining them all together to form new data set
data = pd.concat([df, home_dummies, position_dummies, team_dummies], axis=1)

# our features
features = ['MIN', 'position_C', u'position_PF', u'position_PG', u'position_SF',
            u'position_SG', u'position_Unknown', 'TO', 'PF', 'venue_H', 'venue_R',
            'Opponent_Bos', u'Opponent_Bro', u'Opponent_Cha', u'Opponent_Chi',
            u'Opponent_Cle', u'Opponent_Dal', u'Opponent_Den', u'Opponent_Det',
            u'Opponent_Gol', u'Opponent_Hou', u'Opponent_Ind', u'Opponent_Lac',
            u'Opponent_Lal', u'Opponent_Mem', u'Opponent_Mia', u'Opponent_Mil',
            u'Opponent_Min', u'Opponent_Nor', u'Opponent_Nyk', u'Opponent_Okc',
            u'Opponent_Orl', u'Opponent_Phi', u'Opponent_Pho', u'Opponent_Por',
            u'Opponent_Sac', u'Opponent_San', u'Opponent_Tor', u'Opponent_Uta',
            u'Opponent_Was']

# split data into train and test
train, test = train_test_split(data, train_size=0.8)

x_train = train[features]
x_test = test[features]

y_train = train['fanduel_score']
y_test = test['fanduel_score']

### PREDICTIONS ###

# linear regression
lm = LinearRegression()
lm.fit(x_train, y_train)
print lm.score(x_test, y_test)
print zip(features, lm.coef_)
##### score = 0.691

# random forest
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features='sqrt')
rf.fit(x_train, y_train)
print rf.score(x_test, y_test)
# random forests have no coef_; feature_importances_ plays the same role
print zip(features, rf.feature_importances_)
##### score = 0.667

# svm - linear kernel
svr_linear = SVR(kernel='linear', C=.5)
svr_linear.fit(x_train, y_train)
print svr_linear.score(x_train, y_train)   # note: in-sample (training) score
print zip(features, svr_linear.coef_[0])
##### score = 0.686

# svm - rbf kernel
svr_rbf = SVR(kernel='rbf', C=.5)
svr_rbf.fit(x_train, y_train)
print svr_rbf.score(x_train, y_train)      # note: in-sample (training) score
# (coef_ only exists for the linear kernel, so there are no per-feature weights here)
##### score = 0.700

# let's transform our dependent variable
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)
# fanduel_score can be zero or negative, so log produces nan / -inf; clamp those
y_test_log = np.nan_to_num(y_test_log)
y_train_log = np.nan_to_num(y_train_log)

# svm - rbf kernel - log transform
svr_rbf = SVR(kernel='rbf', C=.5)
svr_rbf.fit(x_train, y_train_log)
print svr_rbf.score(x_train, y_train_log)  # note: in-sample (training) score
##### score = 0.700
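
# To compare the log-space model with the earlier ones on the original scale,
# the predictions have to be exponentiated back first -- a rough sketch (the
# rows clamped by nan_to_num will distort this a bit):
from sklearn.metrics import r2_score
y_pred_original_scale = np.exp(svr_rbf.predict(x_test))
print r2_score(y_test, y_pred_original_scale)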

### RESIDUALS ###
# refit the rbf model on the raw (un-logged) scores so the residuals are on the
# same scale as y_test
model_SVR = SVR(kernel='rbf', C=.5)
model_SVR.fit(x_train, y_train)
y_predicted = model_SVR.predict(x_test)
residuals = y_test - y_predicted
residuals.hist(bins=20)
# plot looks normally distributed...good.

### MOVING AVERAGES ###
# NOTE: rolling_mean includes the current game in the window, so 'past_1' is the
# current game's score itself; these need to be shifted by one game (and grouped
# by player) before they can be treated as real pre-game features
data['past_1'] = pd.rolling_mean(data['fanduel_score'], 1)
data['past_3'] = pd.rolling_mean(data['fanduel_score'], 3)
data['past_5'] = pd.rolling_mean(data['fanduel_score'], 5)
data['past_10'] = pd.rolling_mean(data['fanduel_score'], 10)

# drop the rows at the start of the frame that don't yet have a full rolling
# window (SVR can't handle NaNs)
data = data.dropna(subset=['past_3', 'past_5', 'past_10'])

train, test = train_test_split(data, train_size=0.8)

features6 = ['past_1', 'past_3', 'past_5', 'past_10', 'MIN', 'position_C',
             u'position_PF', u'position_PG', u'position_SF', u'position_SG',
             u'position_Unknown', 'TO', 'PF', 'venue_H', 'venue_R', 'Opponent_Bos',
             u'Opponent_Bro', u'Opponent_Cha', u'Opponent_Chi', u'Opponent_Cle',
             u'Opponent_Dal', u'Opponent_Den', u'Opponent_Det', u'Opponent_Gol',
             u'Opponent_Hou', u'Opponent_Ind', u'Opponent_Lac', u'Opponent_Lal',
             u'Opponent_Mem', u'Opponent_Mia', u'Opponent_Mil', u'Opponent_Min',
             u'Opponent_Nor', u'Opponent_Nyk', u'Opponent_Okc', u'Opponent_Orl',
             u'Opponent_Phi', u'Opponent_Pho', u'Opponent_Por', u'Opponent_Sac',
             u'Opponent_San', u'Opponent_Tor', u'Opponent_Uta', u'Opponent_Was']

x_train = train[features6]
x_test = test[features6]

y_train = train['fanduel_score']
y_test = test['fanduel_score']

model_SVR_rbf = SVR(kernel='rbf', C=.5)
model_SVR_rbf.fit(x_train, y_train)
print model_SVR_rbf.score(x_test, y_test)
### Score: 0.938
--------------------------------------------------------------------------------