├── data
│   └── .keep
├── run.sh
├── LICENSE.md
├── README.md
└── win_ncaa.R

/data/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
Rscript win_ncaa.R

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Do whatever you want, but respect the licenses of R and its packages.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Win NCAA competition

The 1st-place solution to the [NCAA Women's March Madness competition (2018)](https://www.kaggle.com/c/womens-machine-learning-competition-2018).

The model was created using:

1. Ubuntu (16.04)
2. R (3.4.2)
3. dplyr (1.2.1)
4. lme4 (1.1-15)
5. xgboost (0.6.4.1)

How to generate a submission file:

1. Populate the _data_ folder with the competition files
2. Run the _run.sh_ script (see the example below)
3. Submit to Kaggle and enjoy the results
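For example, a minimal sketch of the workflow, assuming the four CSV files read by _win_ncaa.R_ have already been downloaded from the competition's Data page into the repository root:

```bash
# gather the input files the script expects, then run the pipeline
mkdir -p data
cp WRegularSeasonDetailedResults_PrelimData2018.csv \
   WNCAATourneyDetailedResults_PrelimData2018.csv \
   WSampleSubmissionStage2.csv \
   WNCAATourneySeeds.csv data/
bash run.sh   # runs: Rscript win_ncaa.R
# the script writes sub.csv to the repository root, ready for upload
```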

--------------------------------------------------------------------------------
/win_ncaa.R:
--------------------------------------------------------------------------------
library(dplyr)
library(xgboost)
library(lme4)

regresults <- read.csv("data/WRegularSeasonDetailedResults_PrelimData2018.csv")
results <- read.csv("data/WNCAATourneyDetailedResults_PrelimData2018.csv")
sub <- read.csv("data/WSampleSubmissionStage2.csv")
seeds <- read.csv("data/WNCAATourneySeeds.csv")

seeds$Seed = as.numeric(substring(seeds$Seed, 2, 4))


### Collect regular season results - double the data by swapping team positions

r1 = regresults[, c("Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumOT", "WFGA", "WAst", "WBlk", "LFGA", "LAst", "LBlk")]
r2 = regresults[, c("Season", "DayNum", "LTeamID", "LScore", "WTeamID", "WScore", "NumOT", "LFGA", "LAst", "LBlk", "WFGA", "WAst", "WBlk")]
names(r1) = c("Season", "DayNum", "T1", "T1_Points", "T2", "T2_Points", "NumOT", "T1_fga", "T1_ast", "T1_blk", "T2_fga", "T2_ast", "T2_blk")
names(r2) = c("Season", "DayNum", "T1", "T1_Points", "T2", "T2_Points", "NumOT", "T1_fga", "T1_ast", "T1_blk", "T2_fga", "T2_ast", "T2_blk")
regular_season = rbind(r1, r2)


### Collect tourney results - double the data by swapping team positions

t1 = results[, c("Season", "DayNum", "WTeamID", "LTeamID", "WScore", "LScore")] %>% mutate(ResultDiff = WScore - LScore)
t2 = results[, c("Season", "DayNum", "LTeamID", "WTeamID", "LScore", "WScore")] %>% mutate(ResultDiff = LScore - WScore)
names(t1) = c("Season", "DayNum", "T1", "T2", "T1_Points", "T2_Points", "ResultDiff")
names(t2) = c("Season", "DayNum", "T1", "T2", "T1_Points", "T2_Points", "ResultDiff")
tourney = rbind(t1, t2)


### Fit GLMM on regular season data (selected march madness teams only) - extract random effects for each team

march_teams = select(seeds, Season, Team = TeamID)
X = regular_season %>%
  inner_join(march_teams, by = c("Season" = "Season", "T1" = "Team")) %>%
  inner_join(march_teams, by = c("Season" = "Season", "T2" = "Team")) %>%
  select(Season, T1, T2, T1_Points, T2_Points, NumOT) %>%
  distinct()
X$T1 = as.factor(X$T1)
X$T2 = as.factor(X$T2)

quality = list()
for (season in unique(X$Season)) {
  glmm = glmer(I(T1_Points > T2_Points) ~ (1 | T1) + (1 | T2), data = X[X$Season == season & X$NumOT == 0, ], family = binomial)
  random_effects = ranef(glmm)$T1
  quality[[season]] = data.frame(Season = season, Team_Id = as.numeric(row.names(random_effects)), quality = exp(random_effects[, "(Intercept)"]))
}
quality = do.call(rbind, quality)


### Regular season statistics

season_summary =
  regular_season %>%
  mutate(win14days = ifelse(DayNum > 118 & T1_Points > T2_Points, 1, 0),
         last14days = ifelse(DayNum > 118, 1, 0)) %>%
  group_by(Season, T1) %>%
  summarize(
    WinRatio14d = sum(win14days) / sum(last14days),
    PointsMean = mean(T1_Points),
    PointsMedian = median(T1_Points),
    PointsDiffMean = mean(T1_Points - T2_Points),
    FgaMean = mean(T1_fga),
    FgaMedian = median(T1_fga),
    FgaMin = min(T1_fga),
    FgaMax = max(T1_fga),
    AstMean = mean(T1_ast),
    BlkMean = mean(T1_blk),
    OppFgaMean = mean(T2_fga),
    OppFgaMin = min(T2_fga)
  )

season_summary_X1 = season_summary
season_summary_X2 = season_summary
names(season_summary_X1) = c("Season", "T1", paste0("X1_", names(season_summary_X1)[-c(1, 2)]))
names(season_summary_X2) = c("Season", "T2", paste0("X2_", names(season_summary_X2)[-c(1, 2)]))


### Combine all features into a data frame

data_matrix =
  tourney %>%
  left_join(season_summary_X1, by = c("Season", "T1")) %>%
  left_join(season_summary_X2, by = c("Season", "T2")) %>%
  left_join(select(seeds, Season, T1 = TeamID, Seed1 = Seed), by = c("Season", "T1")) %>%
  left_join(select(seeds, Season, T2 = TeamID, Seed2 = Seed), by = c("Season", "T2")) %>%
  mutate(SeedDiff = Seed1 - Seed2) %>%
  left_join(select(quality, Season, T1 = Team_Id, quality_march_T1 = quality), by = c("Season", "T1")) %>%
  left_join(select(quality, Season, T2 = Team_Id, quality_march_T2 = quality), by = c("Season", "T2"))


### Prepare xgboost

features = setdiff(names(data_matrix), c("Season", "DayNum", "T1", "T2", "T1_Points", "T2_Points", "ResultDiff"))
dtrain = xgb.DMatrix(as.matrix(data_matrix[, features]), label = data_matrix$ResultDiff)

cauchyobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  c <- 5000
  x <- preds - labels
  grad <- x / (x^2/c^2 + 1)
  hess <- -c^2*(x^2 - c^2)/(x^2 + c^2)^2
  return(list(grad = grad, hess = hess))
}
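# The gradient and hessian above are the analytic first and second derivatives of the
# Cauchy loss c^2/2 * log(1 + (x/c)^2), which damps the influence of large residuals
# (badly mispredicted blowouts) compared to squared error. Optional sanity check below,
# not part of the original pipeline; the names (loss, grad_numeric, x0) are only used here.
local({
  c <- 5000; x0 <- 123
  loss <- function(x) c^2 / 2 * log(1 + (x / c)^2)
  grad_numeric <- (loss(x0 + 1e-3) - loss(x0 - 1e-3)) / 2e-3
  stopifnot(abs(grad_numeric - x0 / (x0^2 / c^2 + 1)) < 1e-3)
})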

xgb_parameters =
  list(objective = cauchyobj,
       eval_metric = "mae",
       booster = "gbtree",
       eta = 0.02,
       subsample = 0.35,
       colsample_bytree = 0.7,
       num_parallel_tree = 10,
       min_child_weight = 40,
       gamma = 10,
       max_depth = 3)

N = nrow(data_matrix)
fold5list = c(
  rep(1, floor(N/5)),
  rep(2, floor(N/5)),
  rep(3, floor(N/5)),
  rep(4, floor(N/5)),
  rep(5, N - 4*floor(N/5))
)


### Build cross-validation model, repeated 10 times

iteration_count = c()
smooth_model = list()

for (i in 1:10) {

  ### Resample fold split
  set.seed(i)
  folds = list()
  fold_list = sample(fold5list)
  for (k in 1:5) folds[[k]] = which(fold_list == k)

  set.seed(120)
  xgb_cv =
    xgb.cv(
      params = xgb_parameters,
      data = dtrain,
      nrounds = 3000,
      verbose = 0,
      nthread = 12,
      folds = folds,
      early_stopping_rounds = 25,
      maximize = FALSE,
      prediction = TRUE
    )
  iteration_count = c(iteration_count, xgb_cv$best_iteration)

  ### Fit a smoothing spline on the out-of-fold predicted point differentials to get win probabilities
  smooth_model[[i]] = smooth.spline(x = xgb_cv$pred, y = ifelse(data_matrix$ResultDiff > 0, 1, 0))

}


### Build submission models

submission_model = list()

for (i in 1:10) {
  set.seed(i)
  submission_model[[i]] =
    xgb.train(
      params = xgb_parameters,
      data = dtrain,
      nrounds = round(iteration_count[i] * 1.05),
      verbose = 0,
      nthread = 12,
      maximize = FALSE
    )
}


### Run predictions

sub$Season = 2018
sub$T1 = as.numeric(substring(sub$ID, 6, 9))    # submission ID format: Season_T1_T2
sub$T2 = as.numeric(substring(sub$ID, 11, 14))

Z = sub %>%
  left_join(season_summary_X1, by = c("Season", "T1")) %>%
  left_join(season_summary_X2, by = c("Season", "T2")) %>%
  left_join(select(seeds, Season, T1 = TeamID, Seed1 = Seed), by = c("Season", "T1")) %>%
  left_join(select(seeds, Season, T2 = TeamID, Seed2 = Seed), by = c("Season", "T2")) %>%
  mutate(SeedDiff = Seed1 - Seed2) %>%
  left_join(select(quality, Season, T1 = Team_Id, quality_march_T1 = quality), by = c("Season", "T1")) %>%
  left_join(select(quality, Season, T2 = Team_Id, quality_march_T2 = quality), by = c("Season", "T2"))

dtest = xgb.DMatrix(as.matrix(Z[, features]))

probs = list()
for (i in 1:10) {
  preds = predict(submission_model[[i]], dtest)
  probs[[i]] = predict(smooth_model[[i]], preds)$y
}
Z$Pred = Reduce("+", probs) / 10

### Better safe than sorry - clip extreme probabilities
Z$Pred[Z$Pred <= 0.025] = 0.025
Z$Pred[Z$Pred >= 0.975] = 0.975

### An upset like this has happened only once before - be brave
Z$Pred[Z$Seed1 == 16 & Z$Seed2 == 1] = 0
Z$Pred[Z$Seed1 == 15 & Z$Seed2 == 2] = 0
Z$Pred[Z$Seed1 == 14 & Z$Seed2 == 3] = 0
Z$Pred[Z$Seed1 == 13 & Z$Seed2 == 4] = 0
Z$Pred[Z$Seed1 == 1 & Z$Seed2 == 16] = 1
Z$Pred[Z$Seed1 == 2 & Z$Seed2 == 15] = 1
Z$Pred[Z$Seed1 == 3 & Z$Seed2 == 14] = 1
Z$Pred[Z$Seed1 == 4 & Z$Seed2 == 13] = 1

write.csv(select(Z, ID, Pred), "sub.csv", row.names = FALSE)

--------------------------------------------------------------------------------