├── .gitignore
├── README.md
├── boosting.R
├── cache
│   └── .gitignore
├── data
│   ├── college_stats.csv
│   ├── combines.csv
│   └── drafts.csv
├── post.Rmd
├── prepare_training_data.R
├── publish.R
├── scrape_pfr.R
├── style.css
└── utils.R

/.gitignore:
--------------------------------------------------------------------------------
*.*~
.RData
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Learning the NFL Draft

See the [post here](https://seanjtaylor.github.io/learning-the-draft/).
--------------------------------------------------------------------------------
/boosting.R:
--------------------------------------------------------------------------------
library(xgboost)
library(ROCR) # for prediction()/performance() in the AUC grid below

fitX <- model.matrix(~ 0 +
                       factor(pos) + year +
                       # Ensemble the sparse model here.
                       sparse.fr.hat +
                       age + height + weight +
                       forty + bench + vertical +
                       threecone + broad + shuttle +
                       games + seasons +
                       completions + attempts +
                       pass_yards + pass_ints + pass_tds +
                       rec_yards + rec_td + receptions +
                       rush_att + rush_yds + rush_td +
                       solo_tackes + tackles + loss_tackles + ast_tackles +
                       fum_forced + fum_rec + fum_tds + fum_yds +
                       sacks + int + int_td + int_yards + pd +
                       punt_returns + punt_return_td + punt_return_yards +
                       kick_returns + kick_return_td + kick_return_yards
                     , training)

# Grid search over tree depth and number of boosting rounds,
# evaluated on the held-out test set below.
b1.tuning <- expand.grid(depth = c(3, 4, 5, 6),
                         rounds = c(50, 100, 150, 200, 250)) %>%
  group_by(depth, rounds) %>%
  do({
    m <- xgboost(data = fitX[train.set,],
                 label = first.round[train.set],
                 max.depth = .$depth,
                 nround = .$rounds,
                 print.every.n = 50,
                 objective = 'binary:logistic')
    yhat <- predict(m, newdata = fitX)
    data_frame(test.set = test.set, yhat = yhat,
               label = first.round)
  })

aucs <- b1.tuning %>%
  ungroup %>%
  filter(test.set) %>%
  group_by(depth, rounds) %>%
  do({
    auc <- performance(prediction(.$yhat, .$label), "auc")@y.values[[1]]
    data_frame(auc = auc)
  }) %>%
  ungroup %>%
  arrange(-auc)

best <- aucs %>% head(1)

# Fit the best model on the training data
b1.train <- xgboost(data = fitX[train.set,],
                    label = first.round[train.set],
                    max.depth = best$depth,
                    nround = best$rounds,
                    objective = "binary:logistic")

# Fit the best model on all the data
b1 <- xgboost(data = fitX[!holdout.set,],
              label = first.round[!holdout.set],
              max.depth = best$depth,
              nround = best$rounds,
              objective = "binary:logistic")

training$fr.hat <- predict(b1, newdata = fitX)
--------------------------------------------------------------------------------
/cache/.gitignore:
--------------------------------------------------------------------------------
*.html
--------------------------------------------------------------------------------
/post.Rmd:
--------------------------------------------------------------------------------
---
title: "Learning the NFL Draft"
author: "Sean J. Taylor"
output:
  html_document:
    theme: journal
    css: style.css
---

I love watching the NFL, but when the season ends it gets boring for a few months. The biggest event of the offseason is probably the draft, which I find interesting but hard to get excited about:
I don't watch college football, so I can't evaluate or project players myself.

Most of the articles you read about the NFL draft are complete garbage: speculation about player quality or draft tactics based, at best, on someone having casually watched a player in a few games. So I decided that this year I'm going to do my own "mock draft," based on the best data science I can muster.

If you're not interested in how I did this, [skip to the results](#results).

## Scraping the data

Fortunately [Pro Football Reference](http://pro-football-reference.com/) has great data on historical [drafts](http://www.pro-football-reference.com/draft/) and [combine results](http://www.pro-football-reference.com/play-index/nfl-combine-results.cgi) from 2000-2016. They also link to [college statistics](http://www.sports-reference.com/cfb/players/marcus-mariota-1.html) for a large number of the players who were drafted or appeared at the combine.

I won't bore you with the scraping code, but you can see [how I did it](https://github.com/seanjtaylor/learning-the-draft/blob/master/scrape_pfr.R) or just directly [use the files](https://github.com/seanjtaylor/learning-the-draft/blob/master/data) I created. This was probably the bulk of the work!

I was able to gather the following data:

* `r training %>% nrow` players in total.
* `r draft.table %>% nrow` players that were drafted.
* `r combine.table %>% nrow` that appeared at the NFL combine.
* `r college.stats %>% with(length(unique(url)))` players with at least some basic college stats available.

## Goal

The goal of this exercise is to build a model that answers the following question: what is the probability that a player will be picked in the first round? We'll assume that players with higher first-round probabilities are likely to be drafted earlier. Obviously we could do something fancier, e.g. learning to rank, or a regression to predict where each player will be picked. In my experience those models performed much worse than a logistic loss on the first-round outcome.

## Caveats

* I'm not trying to model teams picking for specific needs (in fact, I don't use team information at all here). You could picture adding team variables or features of each team's previous draft here.
* I'm not doing a proper cross-validation procedure. I'm using a single test-train split to pick hyper-parameters (number of rounds of boosting and tree depth).
* There's a bit of "peeking" involved in imputing the missing combine scores. Sorry about that, I'm lazy.

## Imputing missing data

Not every player performs every test at the NFL combine, so I used [mice](https://cran.r-project.org/web/packages/mice/index.html) to impute the missing combine scores. This allows me to ignore missingness in these variables (which may be informative!) while doing machine learning. You can see how I prepare the data [in the source files](https://github.com/seanjtaylor/learning-the-draft/blob/master/prepare_training_data.R).
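
The core of the imputation is a one-liner. Here's a minimal sketch, where `combine.wide` is a stand-in name for the wide table of combine measurements (the real code in `prepare_training_data.R` first sets aside the player key and outcome columns):

```{r eval=FALSE}
library(mice)
## Fill in missing combine measurements via chained equations;
## complete() returns the first imputed dataset.
combine.imputed <- complete(mice(combine.wide))
```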
For the college statistics, I only use count statistics (e.g. number of tackles, number of interceptions), so you can interpret a zero as the player not having recorded that statistic in college. It's not perfect: we are missing college data for a number of players, and they will look the same as players who didn't accumulate any statistics.

## Building a linear model

I first tried my favorite ML tool: sparse regularized regression. [Glmnet](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) is my favorite implementation. The trick to getting good results here is producing a lot of interactions: we are effectively learning a different model for each position, including which colleges teams prefer as well as which statistics matter. Because the position and college dummies (and all their interactions) are mostly zero, the feature matrix is sparse. It has `r ncol(sparseX)` features and only `r nrow(sparseX)` rows, so we'll be regularizing a lot.

```{r eval=FALSE}
library(glmnet)

sparseX <- sparse.model.matrix(~ (1 + factor(pos)) * (1 +
                                 factor(short_college) +
                                 age + height + weight +
                                 forty + bench + vertical +
                                 threecone + broad + shuttle +
                                 games + seasons +
                                 completions + attempts +
                                 pass_yards + pass_ints + pass_tds +
                                 rec_yards + rec_td + receptions +
                                 rush_att + rush_yds + rush_td +
                                 solo_tackes + tackles + loss_tackles + ast_tackles +
                                 fum_forced + fum_rec + fum_tds + fum_yds +
                                 sacks + int + int_td + int_yards + pd +
                                 punt_returns + punt_return_td + punt_return_yards +
                                 kick_returns + kick_return_td + kick_return_yards)
                               , training)

m1 <- cv.glmnet(sparseX[train.set,],
                first.round[train.set],
                alpha = 0.5,
                family = 'binomial')

training$sparse.fr.hat <- predict(m1, newx = sparseX, type = 'response')[,1]
```

The first thing we probably want to do is look at an ROC curve to see how well we do out-of-sample. The AUC of the model is `r round(performance(prediction(training$sparse.fr.hat[test.set], first.round[test.set]), 'auc')@y.values[[1]], 2)`.

```{r message=FALSE}
library(ROCR)
preds <- prediction(training$sparse.fr.hat[test.set], first.round[test.set])
perf <- performance(preds, 'tpr', 'fpr')
plot(perf)
```
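
One nice side effect of the sparse model is that it's easy to inspect. Here's a quick sketch (assuming `m1` from the chunk above; not evaluated here) that pulls out the largest coefficients surviving the regularization:

```{r eval=FALSE}
## Coefficients at the CV-chosen lambda; most are exactly zero.
cf <- as.matrix(coef(m1, s = 'lambda.min'))
nonzero <- data_frame(feature = rownames(cf)[cf != 0],
                      beta = cf[cf != 0])
nonzero %>% arrange(-abs(beta)) %>% head(20)
```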

## Building a dense model

The results for the sparse model were kind of underwhelming, so we're going to try a more complex model. My favorite technique these days is gradient boosting, and there's no better implementation than the [XGBoost](https://github.com/dmlc/xgboost) package.

Notice that I include the in-sample predictions from the sparse model here as a feature. The sparse model doesn't perform great, but it can pick up on things the trees cannot efficiently learn, such as the college and position effects. This is essentially a cheap hack to do ensembling.

```{r eval=FALSE}
fitX <- model.matrix(~ 0 +
                       factor(pos) + year +
                       # Ensemble the sparse model here.
                       sparse.fr.hat +
                       age + height + weight +
                       forty + bench + vertical +
                       threecone + broad + shuttle +
                       games + seasons +
                       completions + attempts +
                       pass_yards + pass_ints + pass_tds +
                       rec_yards + rec_td + receptions +
                       rush_att + rush_yds + rush_td +
                       solo_tackes + tackles + loss_tackles + ast_tackles +
                       fum_forced + fum_rec + fum_tds + fum_yds +
                       sacks + int + int_td + int_yards + pd +
                       punt_returns + punt_return_td + punt_return_yards +
                       kick_returns + kick_return_td + kick_return_yards
                     , training)

b1.tuning <- expand.grid(depth = c(3, 4, 5, 6),
                         rounds = c(50, 100, 150, 200, 250)) %>%
  group_by(depth, rounds) %>%
  do({
    m <- xgboost(data = fitX[train.set,],
                 label = first.round[train.set],
                 max.depth = .$depth,
                 nround = .$rounds,
                 print.every.n = 50,
                 objective = 'binary:logistic')
    yhat <- predict(m, newdata = fitX)
    data_frame(test.set = test.set, yhat = yhat, label = first.round)
  })
```

We'll compute the AUC for each point on the grid and see which one predicts best on the test set. Remember we'd normally do a cross-validation procedure here, but I'm lazy.

```{r}
aucs <- b1.tuning %>%
  ungroup %>%
  filter(test.set) %>%
  group_by(depth, rounds) %>%
  do({
    auc <- performance(prediction(.$yhat, .$label), "auc")@y.values[[1]]
    data_frame(auc = auc)
  }) %>%
  ungroup %>%
  arrange(-auc)
best <- aucs %>% head(1)
best
```

### Testing on the 2015 Draft

That's a pretty good AUC! To get another perspective, we can train on pre-2015 data and look at how many of the 2015 first-rounders we could have predicted.

```{r}
pre2015 <- with(training, year < 2015)
b1.train <- xgboost(data = fitX[pre2015,],
                    label = first.round[pre2015],
                    max.depth = best$depth,
                    nround = best$rounds,
                    verbose = FALSE,
                    objective = "binary:logistic")
training$fr.hat2015 <- predict(b1.train, newdata = fitX)
preds2015 <- training %>%
  filter(year == 2015) %>%
  arrange(-fr.hat2015) %>%
  mutate(predicted.pick = row_number()) %>%
  select(predicted.pick, pick, player, college, pos, fr.hat2015) %>%
  head(32)
kable(preds2015, digits = 2)
```

Not bad. We're able to find `r preds2015 %>% with(100*round(sum(pick <= 32) / 32, 2))`% of the first round picks just using machine learning and combine/college data. I did not watch a single college football game in 2014 and I could have done almost as well as the experts ;)

We can also look at how these predictions correlate with pick order across the whole draft:

```{r message=FALSE}
library(ggplot2)
training %>%
  filter(year == 2015) %>%
  ggplot(aes(x = pick, y = fr.hat2015)) +
  geom_smooth() +
  geom_point(size = 0.5) +
  theme_bw() +
  xlab('Pick') + ylab('P(first round)')
```

## 2016 Results {#results}

Let's predict the first round of the 2016 NFL draft! The predictions come from one final model trained on all the pre-2016 data with the hyperparameters we chose above.
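
For reference, this is what that final fit looks like in [boosting.R](https://github.com/seanjtaylor/learning-the-draft/blob/master/boosting.R) (`holdout.set` marks the 2016 players, so their outcomes are never used in fitting):

```{r eval=FALSE}
b1 <- xgboost(data = fitX[!holdout.set,],
              label = first.round[!holdout.set],
              max.depth = best$depth,
              nround = best$rounds,
              objective = "binary:logistic")
training$fr.hat <- predict(b1, newdata = fitX)
```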

```{r}
training %>%
  filter(year == 2016) %>%
  arrange(-fr.hat) %>%
  mutate(predicted.pick = row_number()) %>%
  select(predicted.pick, player, college, pos, fr.hat) %>%
  head(32) %>%
  kable(digits = 2)
```

A few simple observations:

- Jared Goff is (probably correctly) rated the number 1 overall pick.
- Carson Wentz is ranked very low (his predicted probability of being a first-round pick is only `r training %>% filter(player == 'Carson Wentz') %>% with(100 * round(sparse.fr.hat, 2))`%). This makes sense given the model's inputs: he's from a small school and his college statistics aren't available.
- Derrick Henry is the first running back off the board, ahead of Ezekiel Elliott. The things Elliott is praised for (blocking, being a good all-around back) are not easily measured and would be discounted here.
- Trevor Davis is an interesting one. He's got very rare combine measurables, but is projected to be a much lower pick by most experts.
--------------------------------------------------------------------------------
/prepare_training_data.R:
--------------------------------------------------------------------------------
library(readr)
library(dplyr)
library(tidyr)
library(feather)
library(stringr)
library(mice) ## multiple imputation of missing data
source('utils.R')

combine.table <- read_feather('data/combines.feather')
draft.table <- read_feather('data/drafts.feather')
college.stats <- read_feather('data/college_stats.feather')

## Join on url first, then on player-year
left <- draft.table %>%
  select(year, round, pick, team,
         player,
         college,
         pos,
         age,
         carav,
         drav,
         url) %>%
  mutate(key = ifelse(is.na(url), paste(player, year, sep = '-'), url))

right <- combine.table %>%
  select(year_combine = year,
         player_combine = player,
         pos_combine = pos,
         college_combine = college,
         height,
         weight,
         forty,
         vertical,
         broad,
         bench,
         threecone,
         shuttle,
         url_combine = url) %>%
  mutate(key = ifelse(is.na(url_combine),
                      paste(player_combine, year_combine, sep = '-'),
                      url_combine)) %>%
  ## This next block filters out multiple rows with the same player
  group_by(key) %>%
  mutate(appearance = row_number()) %>%
  filter(appearance == 1) %>%
  select(-appearance) %>%
  ungroup

combined <- full_join(left, right, by = 'key') %>%
  mutate(player = coalesce2(player, player_combine),
         pos = coalesce2(pos, pos_combine),
         college = coalesce2(college, college_combine),
         year = coalesce2(year, year_combine),
         url = coalesce2(url, url_combine))

## Convert into long format so we can merge with college stats
training1 <- combined %>%
  select(key, carav,
         height, weight,
         forty, vertical,
         bench, age,
         threecone, shuttle,
         broad) %>%
  mutate(height = ifelse(is.na(height), 'NA-NA', height)) %>%
  separate(height, c('feet', 'inches'), sep = '-', convert = TRUE) %>%
  mutate(height = feet * 12 + inches) %>%
  select(-feet, -inches) %>%
  gather(metric, value, carav,
         height, weight,
         forty, vertical,
         bench, age,
         threecone, shuttle,
         broad) %>%
  filter(!is.na(value), value != '') %>%
  mutate(value = as.numeric(value))
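
## Heights arrive as strings like '6-2'; the separate()/mutate() steps above
## convert them to total inches (6 * 12 + 2 = 74).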

## Impute the missing combine data
## A. Convert to wide format
training1a <- training1 %>%
  spread(metric, value, fill = NA)

## B. Do the imputation and add back the non-imputed columns
training1b <- complete(mice(training1a %>% select(-key, -carav)))
training1b$key <- training1a$key
training1b$carav <- training1a$carav

## C. Convert back to long format
training1c <- training1b %>%
  gather(metric, value, -key)

## Rename some of the college stats;
## make sure we only have one stat per person
training2 <- college.stats %>%
  group_by(url, stat) %>%
  mutate(row = row_number()) %>%
  filter(row == 1) %>%
  select(-row) %>%
  ungroup %>%
  rename(key = url, metric = stat) %>%
  select(-section) %>%
  mutate(metric = str_replace_all(metric, '[.]', '_'))

## Convert back into wide form
training3 <- bind_rows(training1c, training2) %>%
  spread(metric, value, fill = 0) ## note we fill zeros, not NAs

## Join the pick/position/college/year/team back on.
## Aggregate smaller schools into a representative small school.
training <- combined %>%
  select(key, player, pick, pos, college, year, team) %>%
  group_by(college) %>%
  mutate(n_college_picks = n()) %>%
  ungroup %>%
  mutate(short_college = ifelse(n_college_picks < 50, 'SMALL SCHOOL', college),
         pick = ifelse(is.na(pick), 257, as.numeric(pick))) %>% ## 257 = undrafted
  inner_join(training3, by = 'key')

set.seed(2016) ## make the train/test split reproducible
N <- nrow(training)
train.set <- (rbinom(N, 1, prob = 0.9) == 1 & training$year < 2016)
test.set <- (!train.set & training$year < 2016)
holdout.set <- !(test.set | train.set)

# Outcome variables
pick <- training$pick
carav <- training$carav
first.round <- as.numeric(training$pick <= 32)
--------------------------------------------------------------------------------
/publish.R:
--------------------------------------------------------------------------------

source('prepare_training_data.R')
source('sparse_models.R')
source('boosting.R')

library(knitr)
rmarkdown::render('post.Rmd', output_file = 'index.html')
--------------------------------------------------------------------------------
/scrape_pfr.R:
--------------------------------------------------------------------------------
library(rvest)
library(readr)
library(dplyr)
library(RCurl)
library(tidyr)
library(stringr)
library(feather)

## Fetch a URL, caching the raw HTML on disk so re-runs don't re-download.
read_html_cache <- function(url, cache.dir = 'cache') {
  fn <- tail(strsplit(url, '/')[[1]], 1)
  fn.path <- paste(cache.dir, fn, sep = '/')
  if (!file.exists(fn.path)) {
    text <- getURL(url)
    write(text, fn.path)
  }
  read_html(fn.path)
}
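
## e.g. read_html_cache('http://www.sports-reference.com/cfb/players/marcus-mariota-1.html')
## fetches the page into cache/marcus-mariota-1.html on the first call and
## reads the cached copy on every call after that.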

draft.header <- c('round', 'pick', 'team', 'player', 'pos', 'age', 'to', 'ap1', 'pb', 'st', 'carav', 'drav', 'games', 'pass.cmp', 'pass.att', 'pass.yds', 'pass.tds', 'pass.ints', 'rush.att', 'rush.yds', 'rush.tds', 'receptions', 'rec.yds', 'rec.tds', 'tackles', 'ints', 'sacks', 'college', 'stats')

combine.header <- c('player', 'pos', 'college', 'stats', 'height', 'weight', 'forty', 'vertical', 'bench', 'broad', 'threecone', 'shuttle', 'drafted')

## Pull the href out of each table cell (NA for cells without a link).
url.extract <- function(tds) {
  results <- c()
  for (td in tds) {
    children <- html_children(td)
    if (length(children) == 0) {
      results <- c(results, NA)
    } else {
      results <- c(results, (html_attr(html_children(td), 'href')))
    }
  }
  results
}

## Column headers for each college stats table we know how to parse.
## ('solo.tackes' is misspelled on purpose: downstream code expects this name.)
headers <- list()
headers[['defense']] <- c('year', 'school', 'conf', 'class', 'pos', 'games', 'solo.tackes', 'ast.tackles', 'tackles', 'loss.tackles', 'sacks', 'int', 'int.yards', 'int.yards.avg', 'int.td', 'pd', 'fum.rec', 'fum.yds', 'fum.tds', 'fum.forced')
headers[['scoring']] <- c('year', 'school', 'conf', 'class', 'pos', 'games', 'td.rush', 'td.rec', 'td.int', 'td.fr', 'td.pr', 'td.kr', 'td.oth', 'td.tot', 'kick.xpm', 'kick.fgm', 'twopm', 'safety', 'total.pts')
headers[['punt_ret']] <- c('year', 'school', 'conf', 'class', 'pos', 'games', 'punt.returns', 'punt.return.yards', 'punt.return.avg', 'punt.return.td', 'kick.returns', 'kick.return.yards', 'kick.return.avg', 'kick.return.td')
headers[['receiving']] <- c('year', 'school', 'conf', 'class', 'pos', 'games', 'receptions', 'rec.yards', 'rec.avg', 'rec.td', 'rush.att', 'rush.yds', 'rush.avg', 'rush.td', 'scrim.plays', 'scrim.yds', 'scrim.avg', 'scrim.tds')
## Note: 'rushing' tables list the rushing columns before the receiving ones
## (the mirror image of 'receiving').
headers[['rushing']] <- c('year', 'school', 'conf', 'class', 'pos', 'games', 'rush.att', 'rush.yds', 'rush.avg', 'rush.td', 'receptions', 'rec.yards', 'rec.avg', 'rec.td', 'scrim.plays', 'scrim.yds', 'scrim.avg', 'scrim.tds')

parse_pfr_tables <- function(tables) {
  results <- list()
  for (tbl in tables) {
    id <- html_attr(tbl, 'id')
    if (id %in% names(headers)) {

      ## Trim the first and last rows (header residue and career totals).
      df <- html_table(tbl) %>%
        head(-1) %>% tail(-1)

      if (ncol(df) == length(headers[[id]])) {
        colnames(df) <- headers[[id]]
      } else {
        next
      }

      melted <- df %>%
        select(-year, -school, -conf, -class, -pos) %>%
        mutate(seasons = 1) %>%
        gather(stat, value) %>%
        mutate(stat = as.character(stat)) %>%
        filter(value != '') %>%
        mutate(value = as.numeric(value),
               section = id)

      results[[id]] <- melted
    }
  }
  bind_rows(results)
}
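
## parse_pfr_tables returns one long data frame with columns (stat, value,
## section): one row per statistic per season, plus a constant 'seasons' stat
## (1 per season row) that the aggregation below sums into seasons played.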

## Scrape draft results (2000-2015), or load the cached copy.
if (!file.exists('data/drafts.feather')) {

  draft.table <- data_frame(year = 2000:2015) %>%
    group_by(year) %>% do({
      url <- paste('http://www.pro-football-reference.com/years/', .$year, '/draft.htm', sep = '')
      doc <- read_html(url)
      html.table <- doc %>%
        html_nodes('table') %>%
        first
      urls <- html.table %>%
        html_nodes('tr td:nth-child(29)') %>%
        url.extract
      my.table <- html_table(html.table)
      colnames(my.table) <- draft.header
      my.table <- my.table %>%
        filter(pos != 'Pos') %>%
        mutate(url = urls)
      my.table
    }) %>%
    ungroup
  write_feather(draft.table, 'data/drafts.feather')

} else {
  draft.table <- read_feather('data/drafts.feather')
}

## Scrape combine results (2000-2016), or load the cached copy.
if (!file.exists('data/combines.feather')) {

  combine.table <- data_frame(year = 2000:2016) %>%
    group_by(year) %>% do({
      url <- paste('http://www.pro-football-reference.com/draft/', .$year, '-combine.htm', sep = '')
      html.table <- read_html(url) %>%
        html_nodes('table') %>%
        first
      urls <- html.table %>%
        html_nodes('tr td:nth-child(4)') %>%
        url.extract
      my.table <- html_table(html.table)
      colnames(my.table) <- combine.header
      my.table <- my.table %>%
        filter(pos != 'Pos') %>%
        mutate(url = urls)
      my.table
    }) %>%
    ungroup

  write_feather(combine.table, 'data/combines.feather')
} else {
  combine.table <- read_feather('data/combines.feather')
}

## Every player page we know about, drafted or combine-only.
all.urls <- combine.table %>%
  select(url) %>%
  full_join(draft.table %>% select(url), by = 'url') %>%
  filter(!is.na(url)) %>%
  distinct

college.stats <- all.urls %>%
  group_by(url) %>% do({
    # cat('URL = ', .$url, '\n')
    doc <- read_html_cache(.$url)
    stats <- doc %>%
      html_nodes('table') %>%
      parse_pfr_tables
    if (nrow(stats) > 0) {
      ## Sum each stat across seasons to get career totals.
      stats <- stats %>%
        group_by(section, stat) %>%
        summarise(value = sum(value))
    }
    stats
  })

write_feather(college.stats, 'data/college_stats.feather')
--------------------------------------------------------------------------------
/style.css:
--------------------------------------------------------------------------------
body {
    max-width: 800px;
    margin: auto;
}
--------------------------------------------------------------------------------
/utils.R:
--------------------------------------------------------------------------------
## Row-wise coalesce: fill NAs in the first vector with values from the
## later vectors, in order.
coalesce2 <- function(...) {
  Reduce(function(x, y) {
    i <- which(is.na(x))
    x[i] <- y[i]
    x},
    list(...))
}
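
## e.g. coalesce2(c(1, NA, NA), c(NA, 2, NA), c(9, 9, 3)) returns c(1, 2, 3).
--------------------------------------------------------------------------------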