├── data
    └── README
├── README.md
├── .gitignore
├── stan_model.stan
├── LICENSE
└── bayesian_model.R


/data/README:
--------------------------------------------------------------------------------
1 | Put the competition files in this folder: the training data (`train.csv`), the test data (`test_2.csv`), and the sample submission (`sample_submission_2.csv`).
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # kaggle-winton-2016
2 | Sample Solution(s) to the Winton Stock Market Challenge on Kaggle
3 | 
4 | ## bayesian_model.R
5 | Based on the stan model [posted by Tsakalis Kostas](https://www.kaggle.com/c/the-winton-stock-market-challenge/forums/t/18584/solution-sharing).
6 | 
7 | Achieves a public score of 1769.62394 and a private score of 1728.16802 (roughly 25th–30th place on the final leaderboard).
8 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # Example code in package build process
 9 | *-Ex.R
10 | 
11 | # Output files from R CMD build
12 | /*.tar.gz
13 | 
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 | 
17 | # RStudio files
18 | .Rproj.user/
19 | 
20 | # produced vignettes
21 | vignettes/*.html
22 | vignettes/*.pdf
23 | 
24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
25 | .httr-oauth
26 | 
27 | # knitr and R markdown default cache directories
28 | /*_cache/
29 | /cache/
30 | 
31 | # Temporary files created by R markdown
32 | *.utf8.md
33 | *.knit.md
34 | 


--------------------------------------------------------------------------------
/stan_model.stan:
--------------------------------------------------------------------------------
 1 | data { 
 2 |   int<lower=0> N; 
 3 |   vector[N] t; // Ret Plus One / Ret Plus two
 4 |   vector[N] x_1; //Ret Minus Two
 5 |   vector[N] x_2; //Ret Minus + First 120 mins
 6 |   int<lower=0> N_test;
 7 |   vector[N_test] x_1_test;
 8 |   vector[N_test] x_2_test;
 9 |   real df; // Degrees of freedom (2.6)
10 | } 
11 | 
12 | parameters { 
13 |   real w_1_0; 
14 |   real w_1_1; 
15 |   real w_1_2; 
16 |   real<lower=0> alpha; 
17 |   real<lower=0> beta; 
18 | } 
19 | 
20 | transformed parameters { 
21 |   real<lower=0> sigma_w; 
22 |   real<lower=0> sigma_t; 
23 |   sigma_w = sqrt(alpha); 
24 |   sigma_t = sqrt(beta); 
25 | } 
26 | 
27 | model { 
28 |   w_1_0 ~ normal(0,sigma_w);
29 |   w_1_1 ~ normal(0,sigma_w); 
30 |   w_1_2 ~ normal(0,sigma_w); 
31 |   alpha ~ inv_gamma(1E-2, 1E-4); 
32 |   beta ~ inv_gamma(.3, .0001); 
33 |   t ~ student_t(df, w_1_0+w_1_1*x_1+w_1_2*x_2, sigma_t); 
34 | }
35 | 
36 | generated quantities{
37 |   vector[N_test] t_pred;
38 |   t_pred = w_1_0+w_1_1*x_1_test+w_1_2*x_2_test;
39 | }
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 CeShine Lee
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/bayesian_model.R:
--------------------------------------------------------------------------------
 1 | # Suppress scientific notation
 2 | options(scipen=1)
 3 | 
 4 | # Make sure pacman is installed
 5 | install.packages("pacman")
 6 | 
 7 | # Load the required libraries
 8 | pacman::p_load(rstan)
 9 | pacman::p_load(data.table)
10 | 
11 | # Optional: using multi-cores (requires extra memory space)
12 | options(mc.cores=parallel::detectCores())
13 | 
# Read the competition data.
train_data <- fread("data/train.csv")
test_data <- fread("data/test_2.csv")

# Replace every missing value with 0 in both tables.
train_data[is.na(train_data)] <- 0
test_data[is.na(test_data)] <- 0

# x_2 accumulates Ret_MinusOne plus the intra-day returns Ret_2 .. Ret_120.
train_data[, x_2 := Ret_MinusOne]
test_data[, x_2 := Ret_MinusOne]

# Add intra-day returns into x_2
for (i in 2:120) {
  ret_col <- paste0("Ret_", i)
  # Sanity check on BOTH tables: a stray NA would silently poison x_2.
  # (The original checked train_data twice and never looked at test_data.)
  stopifnot(train_data[is.na(get(ret_col)), .N] == 0)
  stopifnot(test_data[is.na(get(ret_col)), .N] == 0)
  train_data[, x_2 := x_2 + get(ret_col)]
  test_data[, x_2 := x_2 + get(ret_col)]
}
29 | 
#' Fit the Stan regression for one target and predict the test rows.
#'
#' Samples the compiled model on the training rows, prints a posterior
#' summary, saves the extracted draws to `out_file` (as an object named
#' `params`), and returns the per-row mean of `t_pred` over all draws.
#'
#' @param model Compiled model returned by `rstan::stan_model()`.
#' @param train Training table with columns `Ret_MinusTwo`, `x_2` and the
#'   column named by `target`.
#' @param test Test table with columns `Ret_MinusTwo` and `x_2`.
#' @param target Name of the training column to use as the response.
#' @param out_file Path of the .RData file that receives the draws.
#' @return Numeric vector with one prediction per row of `test`.
fit_and_predict <- function(model, train, test, target, out_file) {
  fit <- rstan::sampling(
    model,
    data = list(
      N = nrow(train),
      t = train[[target]],
      x_1 = train[["Ret_MinusTwo"]],
      x_2 = train[["x_2"]],
      N_test = nrow(test),
      x_1_test = test[["Ret_MinusTwo"]],
      x_2_test = test[["x_2"]],
      df = 2.6
    ),
    iter = 1000,
    chains = 4
  )
  print(fit, probs = c(0.1, 0.5, 0.9))

  # Spell out `pars`; the original `par =` relied on partial matching.
  params <- rstan::extract(fit, pars = c("t_pred", "w_1_0", "w_1_1", "w_1_2"))
  save(params, file = out_file)

  # t_pred has one row per draw and one column per test row, so the column
  # means are the posterior-mean predictions.
  colMeans(params$t_pred)
}

# Compile the Stan program once; both targets reuse the compiled model.
winton_model <- rstan::stan_model(file = "stan_model.stan")

# Predict PlusOne returns
plusone_preds <- fit_and_predict(
  winton_model, train_data, test_data, "Ret_PlusOne", "plusone.RData"
)
test_data[, preds_plusone := plusone_preds]

# The fit and draws of the first run are local to the helper and already
# unreachable; collect them before starting the second, equally large run.
gc(verbose = TRUE)

# Predict PlusTwo Returns
plustwo_preds <- fit_and_predict(
  winton_model, train_data, test_data, "Ret_PlusTwo", "plustwo.RData"
)
# Column name must be `preds_plustwo` (underscore): that is what the
# submission step reads. The original wrote `preds.plustwo` here.
test_data[, preds_plustwo := plustwo_preds]
72 | 
# Start from the sample submission and drop its placeholder rows for the
# two targets predicted here (Ids ending in _61 and _62).
sub <- fread("data/sample_submission_2.csv")
sub <- sub[!grepl("_(61|62)$", Id)]

# Append the predictions: suffix 61 carries the PlusOne prediction and
# suffix 62 the PlusTwo prediction.
sub <- rbind(
  sub,
  test_data[, .(Id = paste(Id, 61, sep = "_"), Predicted = preds_plusone)]
)
sub <- rbind(
  sub,
  test_data[, .(Id = paste(Id, 62, sep = "_"), Predicted = preds_plustwo)]
)

write.csv(sub, file = "sub.csv", row.names = FALSE)
80 | 


--------------------------------------------------------------------------------