├── predict_scores.py
├── README.md
├── train_models.py
├── urls_to_scrape.txt
├── daily_data_transform.py
└── data_modeling_pt1.py

/predict_scores.py:
--------------------------------------------------------------------------------
"""
The goal of this file is to use the trained model from the previous Sunday
to predict the nightly fantasy score of all the players playing that night.
This file is a work in progress...only a shell is written here so far.
"""

### IMPORTS ###
import pandas as pd
import numpy as np
import pickle

# load our pickled model
# NOTE: 'model_final.pkl' is a placeholder filename; train_models.py will need
# to save its fitted model there (or wherever we settle on) every Sunday
with open('model_final.pkl', 'rb') as f:
    model_final = pickle.load(f)

# load our data for the night
# 'data' will be the transformed dataframe from daily_data_transform.py and
# 'features' the same feature list used in train_models.py
X = data[features]
y = data['fanduel_score']

# predict -- predict() takes only the feature matrix; y is kept so the
# predictions can be compared with the actual scores after the games are played
predictions = model_final.predict(X)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Description
This repo is dedicated to my love of fantasy sports! It contains my source code
for predicting player performance. The goal is to provide daily lineups that are
most likely to win money in FanDuel daily fantasy contests. More to come.

### Steps
1. daily_data_transform.py transforms the incoming CSV data file every day.
   The data needs to be transformed before predictions can be made with it.

2. train_models.py trains the model. The model will be retrained
   and re-evaluated weekly.

3. predict_scores.py predicts the FanDuel scores for that day.
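
As a rough sketch of how I expect these pieces to fit together once they are
wired up (the file names, paths, and the simplified `features` list below are
placeholders, not things the scripts define yet):

```python
import pickle
import pandas as pd
from sklearn.svm import SVR

# weekly (Sunday): fit on the transformed historical data and save the model
history = pd.read_csv('transformed_history.csv')       # placeholder path
features = [c for c in history.columns if c != 'fanduel_score']
model = SVR(kernel='rbf', C=0.6)
model.fit(history[features], history['fanduel_score'])
with open('model_final.pkl', 'wb') as f:
    pickle.dump(model, f)

# nightly: load the saved model and score that night's transformed slate
tonight = pd.read_csv('tonight_transformed.csv')        # placeholder path
with open('model_final.pkl', 'rb') as f:
    model = pickle.load(f)
tonight['predicted_fanduel_score'] = model.predict(tonight[features])
```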

### Other
data_modeling_pt1.py is my initial exploration of the models.
--------------------------------------------------------------------------------
/train_models.py:
--------------------------------------------------------------------------------
"""
The goal of this file is to train the optimal model for predicting FanDuel
scores. This file should be run once a week, on a Sunday. All games the
following week will use this model to predict FanDuel scores.
"""

### IMPORTS ###
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cross_validation import cross_val_score

# data will come from the 'daily_data_transform.py' file
# the dataframe will be called 'data'
# (this assumes that script will also add the past_N rolling-average columns
# used in the feature list below)
from daily_data_transform import data

features = ['past_1', 'past_3', 'past_5', 'past_10', 'MIN', 'position_C',
            u'position_PF', u'position_PG', u'position_SF', u'position_SG',
            u'position_Unknown', 'TO', 'PF', 'venue_H', 'venue_R',
            u'Opponent_Bos', u'Opponent_Bro', u'Opponent_Cha', u'Opponent_Chi',
            u'Opponent_Cle', u'Opponent_Dal', u'Opponent_Den', u'Opponent_Det',
            u'Opponent_Gol', u'Opponent_Hou', u'Opponent_Ind', u'Opponent_Lac',
            u'Opponent_Lal', u'Opponent_Mem', u'Opponent_Mia', u'Opponent_Mil',
            u'Opponent_Min', u'Opponent_Nor', u'Opponent_Nyk', u'Opponent_Okc',
            u'Opponent_Orl', u'Opponent_Phi', u'Opponent_Pho', u'Opponent_Por',
            u'Opponent_Sac', u'Opponent_San', u'Opponent_Tor', u'Opponent_Uta',
            u'Opponent_Was']

# using cross-validation to evaluate our model
X = data[features]
y = data['fanduel_score']

# gamma=0 tells this (older) version of scikit-learn to fall back to 1/n_features
model = SVR(kernel='rbf', gamma=0, C=.6, epsilon=0)
scores = cross_val_score(model, X, y, cv=5)
# cross_val_score returns R^2 for a regressor, not accuracy
print("R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# now training our full model
model_final = SVR(kernel='rbf', gamma=0, C=.6, epsilon=0)
model_final.fit(X, y)
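
# A sketch of how the fitted model could be handed off to predict_scores.py,
# which expects to load a pickled model. The 'model_final.pkl' filename is a
# placeholder until we settle on where the weekly artifacts live.
import pickle
with open('model_final.pkl', 'wb') as f:
    pickle.dump(model_final, f)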
--------------------------------------------------------------------------------
/urls_to_scrape.txt:
--------------------------------------------------------------------------------

url_to_scrape = http://stats.nba.com/stats/teamdashlineups?Season=2014-15&SeasonType=Regular+Season&TeamID=1610612752&MeasureType=Base&PerMode=PerGame&PlusMinus=N&PaceAdjust=N&Rank=N&Outcome=&Location=&Month=0&SeasonSegment=&DateFrom=&DateTo=&OpponentTeamID=0&VsConference=&VsDivision=&GameSegment=&Period=0&LastNGames=0&GroupQuantity=5&GameScope=&GameID=

TeamID=1610612737|Atlanta Hawks
TeamID=1610612738|Boston Celtics
TeamID=1610612739|Cleveland Cavaliers
TeamID=1610612740|New Orleans Pelicans
TeamID=1610612741|Chicago Bulls
TeamID=1610612742|Dallas Mavericks
TeamID=1610612743|Denver Nuggets
TeamID=1610612744|Golden State Warriors
TeamID=1610612745|Houston Rockets
TeamID=1610612746|Los Angeles Clippers
TeamID=1610612747|Los Angeles Lakers
TeamID=1610612748|Miami Heat
TeamID=1610612749|Milwaukee Bucks
TeamID=1610612750|Minnesota Timberwolves
TeamID=1610612751|Brooklyn Nets
TeamID=1610612752|New York Knicks
TeamID=1610612753|Orlando Magic
TeamID=1610612754|Indiana Pacers
TeamID=1610612755|Philadelphia 76ers
TeamID=1610612756|Phoenix Suns
TeamID=1610612757|Portland Trail Blazers
TeamID=1610612758|Sacramento Kings
TeamID=1610612759|San Antonio Spurs
TeamID=1610612760|Oklahoma City Thunder
TeamID=1610612761|Toronto Raptors
TeamID=1610612762|Utah Jazz
TeamID=1610612763|Memphis Grizzlies
TeamID=1610612764|Washington Wizards
TeamID=1610612765|Detroit Pistons
TeamID=1610612766|Charlotte Hornets

http://stats.nba.com/gameDetail.html?GameID=400578735
https://stats.nba.com/stats/boxscoresummaryv2?GameID=0021400443

http://stats.nba.com/stats/boxscoretraditionalv2?EndPeriod=10&EndRange=28800&GameID=0021400443&RangeType=2&Season=2014-15&SeasonType=Regular+Season&StartPeriod=1&StartRange=0
--------------------------------------------------------------------------------
/daily_data_transform.py:
--------------------------------------------------------------------------------
"""
This file transforms the historical data from 2012-2014, as well as the files
I will be receiving daily. Both the daily input files and, for now, the
historical data come from CSV files, but I would like to move the historical
data over to Postgres.
"""

### IMPORTS ###
import pandas as pd
import numpy as np

### HISTORICAL TRANSFORM ###
df = pd.read_csv('2012-2014-Table 1.csv')
pd.set_option('display.max_columns', 100)

# create the FanDuel score, a weighted sum of the box-score stats:
#   points    = 1.0x
#   rebounds  = 1.2x
#   assists   = 1.5x
#   blocks    = 2.0x
#   steals    = 2.0x
#   turnovers = -1.0x
df['fanduel_score'] = df.apply(lambda x: x['PTS'] + x['TOT']*1.2 + x['A']*1.5 +
                               x['BL']*2 + x['ST']*2 - x['TO'], axis=1)

# need to sort the data (by player and game date) first...come up with a robust
# way to do this

# create moving averages
def add_moving_averages(column, period):
    """
    this function creates a moving average for a column over a specified
    number of games

    column: the name of the column to average (passed as a string)
    period: the number of games to include in the moving average
    """
    column = str(column)
    new_column_name = "mavg_" + column
    period = int(period)
    df[new_column_name] = pd.rolling_mean(df[column], period)

# create dummies for teams, home vs away, and position
home_dummies = pd.get_dummies(df.VENUE, prefix='venue')
position_dummies = pd.get_dummies(df.Position, prefix='position')
team_dummies = pd.get_dummies(df.OPP_TEAM, prefix='Opponent')

# combining them all together to form new data set
data = pd.concat([df, home_dummies, position_dummies, team_dummies], axis=1)
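
# Example of how the helper above is meant to be called -- the column choices
# here are just illustrations. Note that pd.rolling_mean averages the current
# row together with the previous (period - 1) rows, which is another reason the
# frame needs to be sorted (and grouped by player) first. In the real pipeline
# these calls would go before the pd.concat above so the new columns end up in
# 'data'.
add_moving_averages('PTS', 5)    # adds df['mavg_PTS']
add_moving_averages('MIN', 10)   # adds df['mavg_MIN']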
--------------------------------------------------------------------------------
/data_modeling_pt1.py:
--------------------------------------------------------------------------------
"""
The goal of this file is to document how I predict FanDuel scores using only
basic data from nbastuffer.com. The data set has only been augmented slightly:
I changed column names to be more readable and added player positions, which I
found on nba.com.
"""

### IMPORTS ###
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split

### DATA PREP ###
df = pd.read_csv('2012-2014-Table 1.csv')
pd.set_option('display.max_columns', 100)
# %matplotlib inline  # (IPython magic -- only needed when exploring in a notebook)

# adding fanduel score
df['fanduel_score'] = df.apply(lambda x: x['PTS'] + x['TOT']*1.2 + x['A']*1.5 +
                               x['BL']*2 + x['ST']*2 - x['TO'], axis=1)

# creating dummy columns
home_dummies = pd.get_dummies(df.VENUE, prefix='venue')
position_dummies = pd.get_dummies(df.Position, prefix='position')
team_dummies = pd.get_dummies(df.OPP_TEAM, prefix='Opponent')

# combining them all together to form new data set
data = pd.concat([df, home_dummies, position_dummies, team_dummies], axis=1)

# our features
features = ['MIN', 'position_C', u'position_PF', u'position_PG', u'position_SF',
            u'position_SG', u'position_Unknown', 'TO', 'PF', 'venue_H', 'venue_R',
            'Opponent_Bos', u'Opponent_Bro', u'Opponent_Cha', u'Opponent_Chi',
            u'Opponent_Cle', u'Opponent_Dal', u'Opponent_Den', u'Opponent_Det',
            u'Opponent_Gol', u'Opponent_Hou', u'Opponent_Ind', u'Opponent_Lac',
            u'Opponent_Lal', u'Opponent_Mem', u'Opponent_Mia', u'Opponent_Mil',
            u'Opponent_Min', u'Opponent_Nor', u'Opponent_Nyk', u'Opponent_Okc',
            u'Opponent_Orl', u'Opponent_Phi', u'Opponent_Pho', u'Opponent_Por',
            u'Opponent_Sac', u'Opponent_San', u'Opponent_Tor', u'Opponent_Uta',
            u'Opponent_Was']

# split data into train and test
train, test = train_test_split(data, train_size=0.8)

x_train = train[features]
x_test = test[features]

y_train = train['fanduel_score']
y_test = test['fanduel_score']

### PREDICTIONS ###

# linear regression
lm = LinearRegression()
lm.fit(x_train, y_train)
print lm.score(x_test, y_test)
print zip(features, lm.coef_)
##### score = 0.691

# random forest
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features='sqrt')
rf.fit(x_train, y_train)
print rf.score(x_test, y_test)
# random forests have no coef_; feature_importances_ plays the same role
print zip(features, rf.feature_importances_)
##### score = 0.667

# svm - linear kernel
svr_linear = SVR(kernel='linear', C=.5)
svr_linear.fit(x_train, y_train)
print svr_linear.score(x_train, y_train)   # note: in-sample (training) score
print zip(features, svr_linear.coef_[0])
##### score = 0.686

# svm - rbf kernel
svr_rbf = SVR(kernel='rbf', C=.5)
svr_rbf.fit(x_train, y_train)
print svr_rbf.score(x_train, y_train)      # note: in-sample (training) score
# (coef_ only exists for the linear kernel, so there are no per-feature weights here)
##### score = 0.700

# let's transform our dependent variable
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)
# fanduel_score can be zero or negative, so log produces nan / -inf; clamp those
y_test_log = np.nan_to_num(y_test_log)
y_train_log = np.nan_to_num(y_train_log)

# svm - rbf kernel - log transform
svr_rbf = SVR(kernel='rbf', C=.5)
svr_rbf.fit(x_train, y_train_log)
print svr_rbf.score(x_train, y_train_log)  # note: in-sample (training) score
##### score = 0.700
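
# To compare the log-space model with the earlier ones on the original scale,
# the predictions have to be exponentiated back first -- a rough sketch (the
# rows clamped by nan_to_num will distort this a bit):
from sklearn.metrics import r2_score
y_pred_original_scale = np.exp(svr_rbf.predict(x_test))
print r2_score(y_test, y_pred_original_scale)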

### RESIDUALS ###
# refit the rbf model on the raw (un-logged) scores so the residuals are on the
# same scale as y_test
model_SVR = SVR(kernel='rbf', C=.5)
model_SVR.fit(x_train, y_train)
y_predicted = model_SVR.predict(x_test)
residuals = y_test - y_predicted
residuals.hist(bins=20)
# plot looks normally distributed...good.

### MOVING AVERAGES ###
# NOTE: rolling_mean includes the current game in the window, so 'past_1' is the
# current game's score itself; these need to be shifted by one game (and grouped
# by player) before they can be treated as real pre-game features
data['past_1'] = pd.rolling_mean(data['fanduel_score'], 1)
data['past_3'] = pd.rolling_mean(data['fanduel_score'], 3)
data['past_5'] = pd.rolling_mean(data['fanduel_score'], 5)
data['past_10'] = pd.rolling_mean(data['fanduel_score'], 10)

# drop the rows at the start of the frame that don't yet have a full rolling
# window (SVR can't handle NaNs)
data = data.dropna(subset=['past_3', 'past_5', 'past_10'])

train, test = train_test_split(data, train_size=0.8)

features6 = ['past_1', 'past_3', 'past_5', 'past_10', 'MIN', 'position_C',
             u'position_PF', u'position_PG', u'position_SF', u'position_SG',
             u'position_Unknown', 'TO', 'PF', 'venue_H', 'venue_R', 'Opponent_Bos',
             u'Opponent_Bro', u'Opponent_Cha', u'Opponent_Chi', u'Opponent_Cle',
             u'Opponent_Dal', u'Opponent_Den', u'Opponent_Det', u'Opponent_Gol',
             u'Opponent_Hou', u'Opponent_Ind', u'Opponent_Lac', u'Opponent_Lal',
             u'Opponent_Mem', u'Opponent_Mia', u'Opponent_Mil', u'Opponent_Min',
             u'Opponent_Nor', u'Opponent_Nyk', u'Opponent_Okc', u'Opponent_Orl',
             u'Opponent_Phi', u'Opponent_Pho', u'Opponent_Por', u'Opponent_Sac',
             u'Opponent_San', u'Opponent_Tor', u'Opponent_Uta', u'Opponent_Was']

x_train = train[features6]
x_test = test[features6]

y_train = train['fanduel_score']
y_test = test['fanduel_score']

model_SVR_rbf = SVR(kernel='rbf', C=.5)
model_SVR_rbf.fit(x_train, y_train)
print model_SVR_rbf.score(x_test, y_test)
### Score: 0.938
--------------------------------------------------------------------------------