├── data
    └── README
├── README.md
├── .gitignore
├── stan_model.stan
├── LICENSE
└── bayesian_model.R

/data/README:
--------------------------------------------------------------------------------
Put train data, test data, and sample submission in this folder

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kaggle-winton-2016
Sample Solution(s) to the Winton Stock Market Challenge on Kaggle

## bayesian_model.R
Based on the stan model [posted by Tsakalis Kostas](https://www.kaggle.com/c/the-winton-stock-market-challenge/forums/t/18584/solution-sharing).

Achieves public score of 1769.62394 and private score of 1728.16802 (around 25th ~ 30th place in the final leaderboard).

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

--------------------------------------------------------------------------------
/stan_model.stan:
--------------------------------------------------------------------------------
// Linear regression of a target return on two return features with a
// Student-t likelihood. The generated quantities block evaluates the fitted
// linear predictor on the test rows for every posterior draw.
data {
  int<lower=0> N;            // number of training rows
  vector[N] t;               // target: Ret_PlusOne or Ret_PlusTwo
  vector[N] x_1;             // Ret_MinusTwo
  vector[N] x_2;             // Ret_MinusOne plus the intraday returns Ret_2..Ret_120
  int<lower=0> N_test;       // number of test rows
  vector[N_test] x_1_test;   // x_1 for the test rows
  vector[N_test] x_2_test;   // x_2 for the test rows
  real<lower=0> df;          // Student-t degrees of freedom (the R script passes 2.6)
}

parameters {
  real w_1_0;                // intercept
  real w_1_1;                // coefficient of x_1
  real w_1_2;                // coefficient of x_2
  // Both quantities have inverse-gamma priors and are passed to sqrt(), so
  // they must be declared positive; without the bound the sampler can propose
  // negative values, which turn the scales below into NaN.
  real<lower=0> alpha;       // variance of the normal prior on the weights
  real<lower=0> beta;        // squared scale of the Student-t likelihood
}

transformed parameters {
  real<lower=0> sigma_w;     // prior standard deviation of the weights
  real<lower=0> sigma_t;     // scale of the Student-t likelihood
  sigma_w = sqrt(alpha);
  sigma_t = sqrt(beta);
}

model {
  w_1_0 ~ normal(0, sigma_w);
  w_1_1 ~ normal(0, sigma_w);
  w_1_2 ~ normal(0, sigma_w);
  alpha ~ inv_gamma(1E-2, 1E-4);
  beta ~ inv_gamma(.3, .0001);
  t ~ student_t(df, w_1_0 + w_1_1 * x_1 + w_1_2 * x_2, sigma_t);
}

generated quantities {
  // Location of the likelihood for each test row (no observation noise added).
  vector[N_test] t_pred;
  t_pred = w_1_0 + w_1_1 * x_1_test + w_1_2 * x_2_test;
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 CeShine Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/bayesian_model.R:
--------------------------------------------------------------------------------
# Fits stan_model.stan to the training data and stores per-row predictions
# for the test data. This first part loads the data, builds the x_2 feature
# and fits the model with Ret_PlusOne as the target.

# Suppress scientific notation
options(scipen = 1)

# Install pacman only when it is missing, so repeated runs do not reinstall it
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Load the required libraries
pacman::p_load(rstan)
pacman::p_load(data.table)

# Optional: using multi-cores (requires extra memory space)
options(mc.cores = parallel::detectCores())

train_data <- fread("data/train.csv")
test_data <- fread("data/test_2.csv")

# Replace every missing value with zero
train_data[is.na(train_data)] <- 0
test_data[is.na(test_data)] <- 0

# x_2 starts as Ret_MinusOne ...
train_data[, x_2 := Ret_MinusOne]
test_data[, x_2 := Ret_MinusOne]

# ... and accumulates the intra-day returns Ret_2 .. Ret_120
for (ret_col in paste0("Ret_", 2:120)) {
  # Sanity check on BOTH tables: the zero-fill above must have left no NA
  stopifnot(
    !anyNA(train_data[[ret_col]]),
    !anyNA(test_data[[ret_col]])
  )
  train_data[, x_2 := x_2 + get(ret_col)]
  test_data[, x_2 := x_2 + get(ret_col)]
}

# Predict PlusOne returns
fit <- stan(
  file = "stan_model.stan",
  data = list(
    x_1 = train_data$Ret_MinusTwo,
    x_2 = train_data$x_2,
    x_1_test = test_data$Ret_MinusTwo,
    x_2_test = test_data$x_2,
    t = train_data$Ret_PlusOne,
    N_test = test_data[, .N],
    df = 2.6,
    N = train_data[, .N]
  ),
  iter = 1000, chains = 4
)

print(fit, probs = c(0.1, 0.5, 0.9))

# Keep the test-row predictions and the regression weights from the draws
params <- extract(fit, pars = c("t_pred", "w_1_0", "w_1_1", "w_1_2"))

save(params, file = "plusone.RData")

# Average the posterior draws of t_pred (draws x test rows) per test row
test_data[, preds_plusone := colMeans(params$t_pred)]

# Release the PlusOne fit before running the second model
rm(fit, params)
gc(verbose = TRUE)

# Predict PlusTwo Returns
fit <- stan(
  file = "stan_model.stan",
  data = list(
    x_1 = train_data$Ret_MinusTwo,
    x_2 = train_data$x_2,
    x_1_test = test_data$Ret_MinusTwo,
    x_2_test = test_data$x_2,
    t = train_data$Ret_PlusTwo,
    N_test = test_data[, .N],
    df = 2.6,
    N = train_data[, .N]
  ),
  iter = 1000, chains = 4
)

print(fit, probs = c(0.1, 0.5, 0.9))
params <- extract(fit, pars = c("t_pred", "w_1_0", "w_1_1", "w_1_2"))

save(params, file = "plustwo.RData")

# Column name must match the one read when building the submission below
test_data[, preds_plustwo := colMeans(params$t_pred)]

# Start from the sample submission, drop its rows whose Id ends in _61 or _62,
# and append the model's predictions under those Ids instead.
sub <- fread("data/sample_submission_2.csv")
sub <- sub[!grepl("_(61|62)$", Id)]

sub <- rbind(
  sub,
  test_data[, .(Id = paste(Id, 61, sep = "_"), Predicted = preds_plusone)],
  test_data[, .(Id = paste(Id, 62, sep = "_"), Predicted = preds_plustwo)]
)

write.csv(sub, file = "sub.csv", row.names = FALSE)