├── data
│   └── .keep
├── run.sh
├── LICENSE.md
├── README.md
└── win_ncaa.R

/data/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
Rscript win_ncaa.R

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Do whatever you want, but respect the licenses of R and its packages.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Win NCAA competition

The 1st-place solution to the [NCAA Women's March Madness competition (2018)](https://www.kaggle.com/c/womens-machine-learning-competition-2018).

The model was created using:

1. Ubuntu (16.04)
2. R (3.4.2)
3. dplyr (1.2.1)
4. lme4 (1.1-15)
5. xgboost (0.6.4.1)

How to generate a submission file:

1. Populate the _data_ folder with the competition files
2. Run the _run.sh_ script (see the example below)
3. Submit to Kaggle and enjoy the results
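For example, a minimal sketch of the workflow, assuming the four CSV files read by _win_ncaa.R_ have already been downloaded from the competition's Data page into the repository root:

```bash
# gather the input files the script expects, then run the pipeline
mkdir -p data
cp WRegularSeasonDetailedResults_PrelimData2018.csv \
   WNCAATourneyDetailedResults_PrelimData2018.csv \
   WSampleSubmissionStage2.csv \
   WNCAATourneySeeds.csv data/
bash run.sh   # runs: Rscript win_ncaa.R
# the script writes sub.csv to the repository root, ready for upload
```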

--------------------------------------------------------------------------------
/win_ncaa.R:
--------------------------------------------------------------------------------
library(dplyr)
library(xgboost)
library(lme4)

regresults <- read.csv("data/WRegularSeasonDetailedResults_PrelimData2018.csv")
results <- read.csv("data/WNCAATourneyDetailedResults_PrelimData2018.csv")
sub <- read.csv("data/WSampleSubmissionStage2.csv")
seeds <- read.csv("data/WNCAATourneySeeds.csv")

seeds$Seed = as.numeric(substring(seeds$Seed, 2, 4))


### Collect regular season results - double the data by swapping team positions

r1 = regresults[, c("Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumOT", "WFGA", "WAst", "WBlk", "LFGA", "LAst", "LBlk")]
r2 = regresults[, c("Season", "DayNum", "LTeamID", "LScore", "WTeamID", "WScore", "NumOT", "LFGA", "LAst", "LBlk", "WFGA", "WAst", "WBlk")]
names(r1) = c("Season", "DayNum", "T1", "T1_Points", "T2", "T2_Points", "NumOT", "T1_fga", "T1_ast", "T1_blk", "T2_fga", "T2_ast", "T2_blk")
names(r2) = c("Season", "DayNum", "T1", "T1_Points", "T2", "T2_Points", "NumOT", "T1_fga", "T1_ast", "T1_blk", "T2_fga", "T2_ast", "T2_blk")
regular_season = rbind(r1, r2)


### Collect tourney results - double the data by swapping team positions

t1 = results[, c("Season", "DayNum", "WTeamID", "LTeamID", "WScore", "LScore")] %>% mutate(ResultDiff = WScore - LScore)
t2 = results[, c("Season", "DayNum", "LTeamID", "WTeamID", "LScore", "WScore")] %>% mutate(ResultDiff = LScore - WScore)
names(t1) = c("Season", "DayNum", "T1", "T2", "T1_Points", "T2_Points", "ResultDiff")
names(t2) = c("Season", "DayNum", "T1", "T2", "T1_Points", "T2_Points", "ResultDiff")
tourney = rbind(t1, t2)


### Fit GLMM on regular season data (selected march madness teams only) - extract random effects for each team

march_teams = select(seeds, Season, Team = TeamID)
X = regular_season %>%
  inner_join(march_teams, by = c("Season" = "Season", "T1" = "Team")) %>%
  inner_join(march_teams, by = c("Season" = "Season", "T2" = "Team")) %>%
  select(Season, T1, T2, T1_Points, T2_Points, NumOT) %>%
  distinct()
X$T1 = as.factor(X$T1)
X$T2 = as.factor(X$T2)

quality = list()
for (season in unique(X$Season)) {
  glmm = glmer(I(T1_Points > T2_Points) ~ (1 | T1) + (1 | T2), data = X[X$Season == season & X$NumOT == 0, ], family = binomial)
  random_effects = ranef(glmm)$T1
  quality[[season]] = data.frame(Season = season, Team_Id = as.numeric(row.names(random_effects)), quality = exp(random_effects[, "(Intercept)"]))
}
quality = do.call(rbind, quality)


### Regular season statistics

season_summary =
  regular_season %>%
  mutate(win14days = ifelse(DayNum > 118 & T1_Points > T2_Points, 1, 0),
         last14days = ifelse(DayNum > 118, 1, 0)) %>%
  group_by(Season, T1) %>%
  summarize(
    WinRatio14d = sum(win14days) / sum(last14days),
    PointsMean = mean(T1_Points),
    PointsMedian = median(T1_Points),
    PointsDiffMean = mean(T1_Points - T2_Points),
    FgaMean = mean(T1_fga),
    FgaMedian = median(T1_fga),
    FgaMin = min(T1_fga),
    FgaMax = max(T1_fga),
    AstMean = mean(T1_ast),
    BlkMean = mean(T1_blk),
    OppFgaMean = mean(T2_fga),
    OppFgaMin = min(T2_fga)
  )

season_summary_X1 = season_summary
season_summary_X2 = season_summary
names(season_summary_X1) = c("Season", "T1", paste0("X1_", names(season_summary_X1)[-c(1, 2)]))
names(season_summary_X2) = c("Season", "T2", paste0("X2_", names(season_summary_X2)[-c(1, 2)]))


### Combine all features into a data frame

data_matrix =
  tourney %>%
  left_join(season_summary_X1, by = c("Season", "T1")) %>%
  left_join(season_summary_X2, by = c("Season", "T2")) %>%
  left_join(select(seeds, Season, T1 = TeamID, Seed1 = Seed), by = c("Season", "T1")) %>%
  left_join(select(seeds, Season, T2 = TeamID, Seed2 = Seed), by = c("Season", "T2")) %>%
  mutate(SeedDiff = Seed1 - Seed2) %>%
  left_join(select(quality, Season, T1 = Team_Id, quality_march_T1 = quality), by = c("Season", "T1")) %>%
  left_join(select(quality, Season, T2 = Team_Id, quality_march_T2 = quality), by = c("Season", "T2"))


### Prepare xgboost

features = setdiff(names(data_matrix), c("Season", "DayNum", "T1", "T2", "T1_Points", "T2_Points", "ResultDiff"))
dtrain = xgb.DMatrix(as.matrix(data_matrix[, features]), label = data_matrix$ResultDiff)

cauchyobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  c <- 5000
  x <- preds - labels
  grad <- x / (x^2/c^2 + 1)
  hess <- -c^2*(x^2 - c^2)/(x^2 + c^2)^2
  return(list(grad = grad, hess = hess))
}
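# The gradient and hessian above are the analytic first and second derivatives of the
# Cauchy loss c^2/2 * log(1 + (x/c)^2), which damps the influence of large residuals
# (badly mispredicted blowouts) compared to squared error. Optional sanity check below,
# not part of the original pipeline; the names (loss, grad_numeric, x0) are only used here.
local({
  c <- 5000; x0 <- 123
  loss <- function(x) c^2 / 2 * log(1 + (x / c)^2)
  grad_numeric <- (loss(x0 + 1e-3) - loss(x0 - 1e-3)) / 2e-3
  stopifnot(abs(grad_numeric - x0 / (x0^2 / c^2 + 1)) < 1e-3)
})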

xgb_parameters =
  list(objective = cauchyobj,
       eval_metric = "mae",
       booster = "gbtree",
       eta = 0.02,
       subsample = 0.35,
       colsample_bytree = 0.7,
       num_parallel_tree = 10,
       min_child_weight = 40,
       gamma = 10,
       max_depth = 3)

N = nrow(data_matrix)
fold5list = c(
  rep(1, floor(N/5)),
  rep(2, floor(N/5)),
  rep(3, floor(N/5)),
  rep(4, floor(N/5)),
  rep(5, N - 4*floor(N/5))
)


### Build cross-validation model, repeated 10 times

iteration_count = c()
smooth_model = list()

for (i in 1:10) {

  ### Resample fold split
  set.seed(i)
  folds = list()
  fold_list = sample(fold5list)
  for (k in 1:5) folds[[k]] = which(fold_list == k)

  set.seed(120)
  xgb_cv =
    xgb.cv(
      params = xgb_parameters,
      data = dtrain,
      nrounds = 3000,
      verbose = 0,
      nthread = 12,
      folds = folds,
      early_stopping_rounds = 25,
      maximize = FALSE,
      prediction = TRUE
    )
  iteration_count = c(iteration_count, xgb_cv$best_iteration)

  ### Fit a smoothing spline on the out-of-fold predicted point differentials to get win probabilities
  smooth_model[[i]] = smooth.spline(x = xgb_cv$pred, y = ifelse(data_matrix$ResultDiff > 0, 1, 0))

}


### Build submission models

submission_model = list()

for (i in 1:10) {
  set.seed(i)
  submission_model[[i]] =
    xgb.train(
      params = xgb_parameters,
      data = dtrain,
      nrounds = round(iteration_count[i] * 1.05),
      verbose = 0,
      nthread = 12,
      maximize = FALSE
    )
}


### Run predictions

sub$Season = 2018
sub$T1 = as.numeric(substring(sub$ID, 6, 9))    # submission ID format: Season_T1_T2
sub$T2 = as.numeric(substring(sub$ID, 11, 14))

Z = sub %>%
  left_join(season_summary_X1, by = c("Season", "T1")) %>%
  left_join(season_summary_X2, by = c("Season", "T2")) %>%
  left_join(select(seeds, Season, T1 = TeamID, Seed1 = Seed), by = c("Season", "T1")) %>%
  left_join(select(seeds, Season, T2 = TeamID, Seed2 = Seed), by = c("Season", "T2")) %>%
  mutate(SeedDiff = Seed1 - Seed2) %>%
  left_join(select(quality, Season, T1 = Team_Id, quality_march_T1 = quality), by = c("Season", "T1")) %>%
  left_join(select(quality, Season, T2 = Team_Id, quality_march_T2 = quality), by = c("Season", "T2"))

dtest = xgb.DMatrix(as.matrix(Z[, features]))

probs = list()
for (i in 1:10) {
  preds = predict(submission_model[[i]], dtest)
  probs[[i]] = predict(smooth_model[[i]], preds)$y
}
Z$Pred = Reduce("+", probs) / 10

### Better safe than sorry - clip extreme probabilities
Z$Pred[Z$Pred <= 0.025] = 0.025
Z$Pred[Z$Pred >= 0.975] = 0.975

### An upset like this has happened only once before - be brave
Z$Pred[Z$Seed1 == 16 & Z$Seed2 == 1] = 0
Z$Pred[Z$Seed1 == 15 & Z$Seed2 == 2] = 0
Z$Pred[Z$Seed1 == 14 & Z$Seed2 == 3] = 0
Z$Pred[Z$Seed1 == 13 & Z$Seed2 == 4] = 0
Z$Pred[Z$Seed1 == 1 & Z$Seed2 == 16] = 1
Z$Pred[Z$Seed1 == 2 & Z$Seed2 == 15] = 1
Z$Pred[Z$Seed1 == 3 & Z$Seed2 == 14] = 1
Z$Pred[Z$Seed1 == 4 & Z$Seed2 == 13] = 1

write.csv(select(Z, ID, Pred), "sub.csv", row.names = FALSE)

--------------------------------------------------------------------------------