├── README.md
├── best_estimator.rdata
├── data
    ├── data.r
    ├── synthetic.rdata
    ├── timeseries.rdata
    ├── tseries_vcerq.rdata
    └── tsl_uni_90_mix.rdata
├── descriptive-model.r
├── perfestimation-rw.r
├── perfestimation-samplesize.r
├── perfestimation-synthetic.r
├── perfestimation.Rproj
├── result-analysis-ss.r
├── result-analysis-synthetic.r
├── result-analysis.R
├── series_length.rdata
├── src
    ├── dynamics.r
    ├── estimation-procedures.r
    ├── learning-models.r
    ├── metrics.r
    ├── plots.r
    ├── utils.r
    └── workflows.r
├── stationarity.r
├── stationarity_tsdl.rdata
├── synthetic
    ├── creating-synthetic-ds.r
    ├── fresults-analysis-synth.r
    └── simulate-ts.r
└── synthetic_data_generation
    ├── creating-synthetic-ds.r
    └── simulate-ts.r


/README.md:
--------------------------------------------------------------------------------
 1 | # Time Series Forecasting Performance Estimation
 2 | 
 3 | This repository contains the code and datasets used for experiments conducted in the following paper:
 4 | 
 5 | **Cerqueira, V., Torgo, L., & Mozetič, I. (2020). Evaluating time series forecasting models: An empirical study on performance estimation methods. Machine Learning, 109(11), 1997-2028.**
 6 | 
 7 | ## Citation
 8 | 
 9 | If you use this code or the findings in your research, please cite:
10 | 
11 | ```bibtex
12 | @article{cerqueira2020evaluating,
13 |   title={Evaluating time series forecasting models: An empirical study on performance estimation methods},
14 |   author={Cerqueira, Vitor and Torgo, Luis and Mozeti{\v{c}}, Igor},
15 |   journal={Machine Learning},
16 |   volume={109},
17 |   number={11},
18 |   pages={1997--2028},
19 |   year={2020},
20 |   publisher={Springer}
21 | }
22 | ```
23 | 
24 | ### Running
25 | 
26 | The main script is `perfestimation-rw.r`.
27 | 
28 | The script `perfestimation-synthetic.r` is only ran for a specific (dataset, algorithm) pair. You need to change those to get the results for a different pair.
29 | 
30 | #### Contact
31 | 
32 | Get in touch at vitorc.research@gmail.com
33 | 


--------------------------------------------------------------------------------
/best_estimator.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/best_estimator.rdata


--------------------------------------------------------------------------------
/data/data.r:
--------------------------------------------------------------------------------
 1 | library(tsdl)
 2 | library(forecast)
 3 | 
 4 | min_len <- 500
 5 | tsdl_list <- tsdl[sapply(tsdl, length) > min_len]
 6 | #tsdl_list <- tsdl
 7 | tsdl_list <-
 8 |   tsdl_list[!sapply(tsdl_list,
 9 |                     function(x) {
10 |                       any(is.na(x))
11 |                     })]
12 | 
13 | tsdl_list[] <- lapply(tsdl_list, head, 4000)
14 | 
15 | is_univar <- sapply(tsdl_list, class) == "ts"
16 | 
17 | tsdl_list <- tsdl_list[is_univar]
18 | 
19 | load("./data/tseries_vcerq.rdata")
20 | 
21 | ids2rm <- c(14:21,24,28,35:42,53:61)
22 | 
23 | ts_list <- ts_list[-ids2rm]
24 | 
25 | tdiff <- 
26 |   sapply(ts_list,
27 |          function(x) {
28 |            xt <-
29 |              tryCatch(
30 |                as.POSIXct(head(rownames(as.data.frame(x)))),
31 |                error = function(e)
32 |                  NA
33 |              )
34 |            
35 |            difftime(xt[2],xt[1], units = "hours")
36 |          })
37 | 
38 | #lapply(ts_list[is.na(tdiff)], head, 3)
39 | tdiff[is.na(tdiff)] <- 1
40 | 
41 | FRQs <- tdiff
42 | FRQs[tdiff == 1] <- 24
43 | FRQs[tdiff == .5] <- 48
44 | FRQs[tdiff == 24] <- 365
45 | 
46 | max_len <- 4000
47 | ts_list <-
48 |   lapply(1:length(ts_list),
49 |          function(i) {
50 |            x <- ts_list[[i]]
51 |            x <- ts(as.vector(x), frequency = FRQs[i])
52 |            x <- head(x, max_len)
53 |            x
54 |          })
55 | 
56 | ts_list <- c(tsdl_list,ts_list)
57 | length(ts_list)
58 | save(ts_list, file = "data/tsl_uni_90_mix.rdata")
59 | 


--------------------------------------------------------------------------------
/data/synthetic.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/synthetic.rdata


--------------------------------------------------------------------------------
/data/timeseries.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/timeseries.rdata


--------------------------------------------------------------------------------
/data/tseries_vcerq.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/tseries_vcerq.rdata


--------------------------------------------------------------------------------
/data/tsl_uni_90_mix.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/tsl_uni_90_mix.rdata


--------------------------------------------------------------------------------
/descriptive-model.r:
--------------------------------------------------------------------------------
 1 | load("data/tsl_uni_90_mix.rdata")
 2 | load("best_estimator.rdata")
 3 | load("series_length.rdata")
 4 | load("stationarity_tsdl.rdata")
 5 | 
 6 | source("src/dynamics.r")
 7 | 
 8 | library(rpart)
 9 | library(rattle)
10 | library(party)
11 | library(ranger)
12 | 
13 | Z <- lapply(ts_list, function(x) dynamics_ts(tail(x,1000)))
14 | names(Z) <- paste0("TS_", seq_along(Z))
15 | 
16 | 
17 | Z <- do.call(rbind, Z)
18 | Z$estimator <- best_estimator
19 | #save(Z, file = "metadata.rdata")
20 | #load("metadata.rdata")
21 | colnames(Z) <- c(
22 |   "Trend","Qt05","Qt95","IQR","Skewness","Kurtosis",
23 |   "MLE","Hurst","N","SerialCorr","estimator"
24 | )
25 | Z$Stationarity <- as.character(as.integer(is_stationary_2ensemble))
26 | 
27 | Z$estimator <- as.factor(ifelse(grepl("CV",Z$estimator), "CVAL","OOS"))
28 | 
29 | library(rpart)
30 | library(rpart.plot)
31 | 
32 | m <- rpart(estimator ~., Z,
33 |            control = list(cp=.028))
34 | rpart.plot(m,type=2, extra=2,
35 |            clip.right.labs = F, 
36 |            fallen.leaves=F,cex=0.8,
37 |            under = F)
38 | 
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/perfestimation-rw.r:
--------------------------------------------------------------------------------
 1 | load("data/tsl_uni_90_mix.rdata")
 2 | 
 3 | source("src/utils.r")
 4 | source("src/estimation-procedures.r")
 5 | source("src/workflows.r")
 6 | source("src/metrics.r")
 7 | source("src/learning-models.r")
 8 | 
 9 | library(tsensembler)
10 | 
11 | form <- target~.
12 | nfolds <- 30
13 | 
14 | library(parallel)
15 | final_results <-
16 |   mclapply(1:length(ts_list),
17 |          function(i) {
18 |            cat(i, "\n\n")
19 |            ds <- ts_list[[i]]
20 |            
21 |            x <-
22 |              workflow(
23 |                ds = ds,
24 |                form = form,
25 |                predictive_algorithm = "rbr",
26 |                nfolds = nfolds,
27 |                outer_split = .7)
28 |            
29 |            x
30 |          }, mc.cores = 10)
31 | 
32 | save(final_results, file = "final_results_rbr.rdata")
33 | 
34 | 


--------------------------------------------------------------------------------
/perfestimation-samplesize.r:
--------------------------------------------------------------------------------
 1 | load("data/tsl_uni_90_mix.rdata")
 2 | 
 3 | source("src/utils.r")
 4 | source("src/estimation-procedures.r")
 5 | source("src/workflows.r")
 6 | source("src/metrics.r")
 7 | source("src/learning-models.r")
 8 | 
 9 | library(tsensembler)
10 | #unloadNamespace("tsensembler")
11 | 
12 | load("series_length.rdata")
13 | 
14 | ts_list <- ts_list[len >= 1000]
15 | 
16 | form <- target~.
17 | nfolds <- 30
18 | 
19 | 
20 | library(parallel)
21 | final_results <-
22 |   mclapply(1:length(ts_list),
23 |            function(i) {
24 |              #i<-1
25 |              cat(i, "\n\n")
26 |              ds <- ts_list[[i]]
27 |              
28 |              x <-
29 |                wf_sample_size(
30 |                  ds = ds,
31 |                  form = form,
32 |                  predictive_algorithm = "rbr",
33 |                  nfolds = nfolds,
34 |                  outer_split = .7)
35 |              
36 |              x
37 |            }, mc.cores = 10)
38 | 
39 | 
40 | save(final_results, file = "final_results_samplesize_rbr.rdata")
41 | 
42 | ##
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/perfestimation-synthetic.r:
--------------------------------------------------------------------------------
 1 | load("data/synthetic.rdata")
 2 | 
 3 | source("src/utils.r")
 4 | source("src/estimation-procedures.r")
 5 | source("src/workflows.r")
 6 | source("src/metrics.r")
 7 | source("src/learning-models.r")
 8 | 
 9 | library(tsensembler)
10 | 
11 | form <- target ~ .
12 | nfolds <- 30
13 | embedded_time_series <- synthetic$TS3
14 | 
15 | library(parallel)
16 | final_results <- 
17 |   mclapply(1:length(embedded_time_series),
18 |          function(i) {
19 |            cat(i, "\n\n")
20 |            ds <- embedded_time_series[[i]]
21 |            
22 |            x <-
23 |              workflow(
24 |                ds = ds,
25 |                form = form,
26 |                predictive_algorithm = "rf",
27 |                nfolds = nfolds,
28 |                outer_split = .8, 
29 |                is_embedded = T
30 |              )
31 |            
32 |            x
33 |          }, mc.cores = 3)
34 | 
35 | save(final_results, 
36 |      file = "final_results_synthetic_ts3_rf.rdata")
37 | 
38 | 


--------------------------------------------------------------------------------
/perfestimation.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/result-analysis-ss.r:
--------------------------------------------------------------------------------
 1 | load("final_results_samplesize_rbr.rdata")
 2 | load("stationarity_tsdl.rdata")
 3 | load("series_length.rdata")
 4 | 
 5 | # err_estimation <- 
 6 | #   lapply(final_results[!(is_stationary_2ensemble[len>1000])],
 7 | #          function(x) {
 8 | #            x <- do.call(rbind, x)
 9 | #            rnk <- t(apply(abs(x),1,rank))
10 | #            rnk
11 | #          })
12 | 
13 | err_estimation <- 
14 |   lapply(final_results,
15 |          function(x) {
16 |            x <- do.call(rbind, x)
17 |            rnk <- t(apply(abs(x),1,rank))
18 |            rnk
19 |          })
20 | 
21 | err_arr <- simplify2array(err_estimation)
22 | err_ovr <- apply(err_arr,1:2,mean)
23 | 
24 | library(tsensembler)
25 | err_ovr <- 
26 |   roll_mean_matrix(as.data.frame(err_ovr), 2)
27 | 
28 | 
29 | colnames(err_ovr) <- 
30 |   c("CV", "CV-Bl", "CV-Mod","CV-hvBl",
31 |     "Preq-Bls", "Preq-Sld-Bls",
32 |     "Preq-Bls-Gap","Holdout", "Rep-Holdout",
33 |     "Preq-Slide","Preq-Grow")
34 | 
35 | df <- t(round(err_ovr,2))
36 | df <- as.data.frame(df)
37 | df$Method <- as.factor(rownames(df))
38 | rownames(df) <- NULL
39 | colnames(df) <- c(as.character(seq(from=100,to=900,by=100)),"Method")
40 | 
41 | library(hrbrthemes)
42 | library(GGally)
43 | library(viridis)
44 | ggparcoord(df,
45 |            columns = 1:9, groupColumn = 10, 
46 |            scale = "globalminmax",
47 |            showPoints = TRUE, 
48 |            alphaLines = 0.9
49 | ) + theme_minimal() +
50 |   labs(x="Training Sample Size",
51 |        y= "Average Rank")
52 | 
53 | 


--------------------------------------------------------------------------------
/result-analysis-synthetic.r:
--------------------------------------------------------------------------------
 1 | load("final_results_synthetic_ts1_lasso.rdata")
 2 | load("final_results_synthetic_ts2_lasso.rdata")
 3 | load("final_results_synthetic_ts3_lasso.rdata")
 4 | load("final_results_synthetic_ts1_rbr.rdata")
 5 | load("final_results_synthetic_ts2_rbr.rdata")
 6 | load("final_results_synthetic_ts3_rbr.rdata")
 7 | load("final_results_synthetic_ts1_rf.rdata")
 8 | load("final_results_synthetic_ts2_rf.rdata")
 9 | load("final_results_synthetic_ts3_rf.rdata")
10 | 
11 | source("src/plots.r")
12 | library(scmamp)
13 | 
14 | err_estimation <- 
15 |   lapply(final_results,
16 |          function(x) {
17 |            tryCatch(x$err_estimation,
18 |                     error =function(e) {NULL})
19 |          })
20 | 
21 | err_estimation <- err_estimation[!sapply(err_estimation, is.null)]
22 | 
23 | fr <- do.call(rbind, err_estimation)
24 | fr <- as.data.frame(fr)
25 | rownames(fr) <- NULL
26 | 
27 | colnames(fr) <- 
28 |   c("CV", "CV-Bl", "CV-Mod","CV-hvBl",
29 |     "Preq-Bls", "Preq-Sld-Bls",
30 |     "Preq-Bls-Gap","Holdout", "Rep-Holdout",
31 |     "Preq-Slide","Preq-Grow")
32 | 
33 | fr_abs <- abs(fr)
34 | fr_abs_rank <- apply(fr_abs, 1, rank)
35 | 
36 | avg_rank_plot(avg = rowMeans(fr_abs_rank),
37 |               sdev = apply(fr_abs_rank,1, sd)) + 
38 |   theme(axis.text.x  = element_text(angle = 45,
39 |                                     size = 12,
40 |                                     hjust = 1))
41 | 
42 | #
43 | 


--------------------------------------------------------------------------------
/result-analysis.R:
--------------------------------------------------------------------------------
 1 | load("final_results_rbr_all_nd.rdata")
 2 | load("final_results_lasso_all_nd.rdata")
 3 | load("final_results_rf_all_nd.rdata")
 4 | 
 5 | load("stationarity_tsdl.rdata")
 6 | 
 7 | source("src/plots.r")
 8 | library(scmamp)
 9 | 
10 | err_estimation <- 
11 |   lapply(final_results,
12 |          function(x) {
13 |            tryCatch(x$err_estimation,
14 |                     error =function(e) {NULL})
15 |          })
16 | 
17 | err_estimation <- err_estimation[!sapply(err_estimation, is.null)]
18 | 
19 | fr <- do.call(rbind, err_estimation)
20 | fr <- as.data.frame(fr)
21 | rownames(fr) <- NULL
22 | 
23 | is_stationary_final <- is_stationary_2ensemble
24 | table(is_stationary_final)
25 | 
26 | ## Subsetting ##########################################
27 | fr_final <- fr[!is_stationary_final,]
28 | fr_final <- fr[is_stationary_final,]
29 | fr_final <- fr
30 | 
31 | ########################################################
32 | 
33 | colnames(fr_final) <- 
34 |   c("CV", "CV-Bl", "CV-Mod","CV-hvBl",
35 |     "Preq-Bls", "Preq-Sld-Bls",
36 |     "Preq-Bls-Gap","Holdout", "Rep-Holdout",
37 |     "Preq-Slide","Preq-Grow")
38 | fr_abs <- abs(fr_final)
39 | rownames(fr_abs) <- NULL
40 | 
41 | # best_estimator <- colnames(fr_abs)[apply(fr_abs,1,which.min)]
42 | # save(best_estimator, file = "best_estimator.rdata")
43 | #table(best_estimator)
44 | 
45 | fr_abs_rank <- apply(fr_abs, 1, rank)
46 | 
47 | # bp_dist(t(fr_abs_rank)[!is_stationary_final,])
48 | # bp_dist(t(fr_abs_rank)[is_stationary_final,])
49 | # bp_dist(t(fr_abs_rank))
50 | 
51 | avg_rank_plot(avg = rowMeans(fr_abs_rank),
52 |               sdev = apply(fr_abs_rank,1, sd)) + 
53 |   theme(axis.text.x  = element_text(angle = 45,
54 |                                     size = 12,
55 |                                     hjust = 1))
56 | 
57 | rm(fr_final)
58 | 
59 | baseline <- "Rep-Holdout"
60 | 
61 | cID <- which(colnames(fr_abs) %in% baseline)
62 | 
63 | PerfDiff <- lapply(as.data.frame(fr_abs),
64 |                    function(x) x-fr_abs[,cID,drop=T])
65 | 
66 | PerfDiff <- as.data.frame(PerfDiff[-cID])
67 | 
68 | rope <- 2.5
69 | baout <-
70 |   lapply(PerfDiff,
71 |          function(u) {
72 |            bSignedRankTest(u, 
73 |                            rope=c(-rope,rope))$posterior.probabilities
74 |          })
75 | 
76 | baout <- lapply(baout,unlist)
77 | baout <- do.call(rbind, baout)
78 | rownames(baout) <- gsub("\\.","-",rownames(baout))
79 | colnames(baout) <- c("probLeft","probRope","probRight")
80 | proportion_plot(baout) + scale_fill_brewer(palette="Set2")
81 | 
82 | 
83 | #
84 | colnames(fr) <- colnames(fr_abs)
85 | percdiff_plot_log(fr)
86 | 
87 | 


--------------------------------------------------------------------------------
/series_length.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/series_length.rdata


--------------------------------------------------------------------------------
/src/dynamics.r:
--------------------------------------------------------------------------------
  1 | replace_inf <- function(df) {
  2 |   do.call(data.frame,
  3 |           lapply(df, function(j) {
  4 |             replace(j, is.infinite(j), NA)
  5 |           })
  6 |   )
  7 | }
  8 | 
  9 | dynamics_ts <-
 10 |   function(x) {
 11 |     require(TTR)
 12 |     
 13 |     x_std <- as.vector(scale(x))
 14 |     
 15 |     cat("Computing trend ...\n")
 16 |     ts_trend <- trend(x)
 17 |     
 18 |     cat("Computing skewness ...\n")
 19 |     ts_skew <- moments::skewness(x)
 20 |     
 21 |     cat("Computing kurtosis ...\n")
 22 |     ts_kts <- moments::kurtosis(x)
 23 |     
 24 |     cat("Computing maximum lyapunov exponent ...\n")
 25 |     ts_mle <- max_lyapunov_exp(tail(x,500))
 26 |     
 27 |     cat("Computing hurst ...\n")
 28 |     ts_hurst <- HURST(x)
 29 |     
 30 |     cat("Computing serial correlation ...\n")
 31 |     ts_serialcorr <- tryCatch(Box.test(x)$p.val,
 32 |                               error=function(e) NA)
 33 |     
 34 |     
 35 |     ts_qt05 <- quantile(x_std, .05)
 36 |     
 37 |     ts_qt95 <- quantile(x_std, .95)
 38 |     
 39 |     ts_iqr <- IQR(x_std)
 40 |     
 41 |     ts_dyns <-
 42 |       data.frame(ts_trend = ts_trend,
 43 |                  ts_qt05=ts_qt05,
 44 |                  ts_qt95=ts_qt95,
 45 |                  ts_iqr=ts_iqr,
 46 |                  ts_skew = ts_skew,
 47 |                  ts_kts = ts_kts,
 48 |                  ts_mle = ts_mle,
 49 |                  ts_hurst = ts_hurst,
 50 |                  ts_n = length(x),
 51 |                  ts_serialcorr = ts_serialcorr)
 52 |     
 53 |     ts_dyns <- replace_inf(ts_dyns)
 54 |     
 55 |     #has_na <- DMwR::manyNAs(t(ts_dyns), .4)
 56 |     
 57 |     #if (length(has_na) > 0) {
 58 |     #  ts_dyns <- subset(ts_dyns, select = -has_na)
 59 |     #}
 60 |     
 61 |     #ts_dyns <- tsensembler::soft.completion(ts_dyns)
 62 |     
 63 |     #nzv_cols <- caret::nearZeroVar(ts_dyns)
 64 |     #if (length(nzv_cols) > 0L) {
 65 |     #  ts_dyns <- subset(ts_dyns, select = -nzv_cols)
 66 |     #}
 67 |     
 68 |     rownames(ts_dyns) <- NULL
 69 |     #preproc <- caret::preProcess(dStats)
 70 |     #dStats <- predict(preproc, dStats)
 71 |     
 72 |     dplyr::as_tibble(ts_dyns)
 73 |   }
 74 | 
 75 | 
 76 | trend <-
 77 |   function(x) {
 78 |     sd(x) / sd(diff(x)[-1])
 79 |   }
 80 | 
 81 | max_lyapunov_exp <-
 82 |   function(x) {
 83 |     require(nonlinearTseries)
 84 |     
 85 |     len <- length(x)
 86 |     Reduce(max,
 87 |            nonlinearTseries::divergence(
 88 |              nonlinearTseries::maxLyapunov(
 89 |                time.series = x,
 90 |                min.embedding.dim = ceiling(len / 4),
 91 |                max.embedding.dim = ceiling(len / 2),
 92 |                radius = ceiling(len / 6),
 93 |                do.plot = FALSE
 94 |              )
 95 |            ))
 96 |   }
 97 | 
 98 | #' Hurst exponent
 99 | #'
100 | #' @param x numeric vector
101 | HURST <-
102 |   function(x) {
103 |     require(Rwave)
104 |     
105 |     cwtwnoise <- DOG(x, 10, 3, 1, plot = FALSE)
106 |     mcwtwnoise <- Mod(cwtwnoise)
107 |     mcwtwnoise <- mcwtwnoise * mcwtwnoise
108 |     wspwnoise <- tfmean(mcwtwnoise, plot = FALSE)
109 |     
110 |     hurst.est(wspwnoise, 1:7, 3, plot = FALSE)[[2]]
111 |   }
112 | 
113 | 
114 | r_sma_ema <-
115 |   function(x) {
116 |     require(TTR)
117 |     
118 |     if (length(x) > 10)
119 |       n <- 5
120 |     else
121 |       n <- 3
122 |     
123 |     ts_sma <- SMA(rev(x), n = n)
124 |     ts_ema <- EMA(rev(x), n = n)
125 |     
126 |     ts_sma <- ts_sma[!is.na(ts_sma)]
127 |     ts_ema <- ts_ema[!is.na(ts_ema)]
128 |     
129 |     sema <- ts_sma / ts_ema
130 |     sema <- sema[!(is.infinite(sema) | is.na(sema))]
131 |     
132 |     mean(ts_sma / ts_ema)
133 |   }


--------------------------------------------------------------------------------
/src/estimation-procedures.r:
--------------------------------------------------------------------------------
  1 | #' k-fold cross validation
  2 | #' The standard cross validation procedure.
  3 | #'
  4 | #' @param x data: embedded time series
  5 | #' @param nfolds no of folds
  6 | #' @param FUN function to apply to each iteration's train and test. Typically
  7 | #' \strong{FUN} is a workflow where a predictive model is applied in a training set
  8 | #' and the model is evaluated in the test set.
  9 | #' @param ... further parameters to \code{FUN}
 10 | #'
 11 | #' @export
 12 | kf_xval <- function(x, nfolds, FUN, shuffle.rows = TRUE, average_results = TRUE, ...) {
 13 |   if (shuffle.rows) x <- cv.shuffle(x)
 14 |   f <- cv.folds(x, nfolds)
 15 | 
 16 |   cv.res <- list() 
 17 |   for (i in seq_len(nfolds)) {
 18 |     cat("X VAL iter no. ", i, "\n")
 19 |     ts.id <- which(f == i)
 20 | 
 21 |     train <- x[-ts.id, ]
 22 |     #print(dim(train))
 23 |     test  <- x[ ts.id, ]
 24 |     #print(dim(test))
 25 |     cv.res[[i]] <- FUN(train, test, ...)
 26 |   }
 27 |   
 28 |   if(average_results) {
 29 |     cv.res <- mean(unlist(cv.res, use.names = FALSE))
 30 |   }
 31 |   
 32 |   cv.res
 33 | }
 34 | 
 35 | #' Blocked k-fold cross validation
 36 | #' Standard k-fold cross validation without reshuffling rows
 37 | #'
 38 | #' @inheritParams kf_xval
 39 | #'
 40 | #' @export
 41 | blocked_kf_xval <- function(x, nfolds, FUN, ...) {
 42 |   kf_xval(x = x, nfolds = nfolds, FUN = FUN, shuffle.rows = FALSE, ...)
 43 | }
 44 | 
 45 | #' Modified k-fold Cross Validation
 46 | #' Standard CV, but removes rows from training that are
 47 | #' dependent with the test set.
 48 | #'
 49 | #' @inheritParams kf_xval
 50 | #'
 51 | #' @export
 52 | modified_xval <- function(x, nfolds, FUN, average_results = TRUE, ...) {
 53 |   K <- floor(sqrt(ncol(x)))#inventei aqui
 54 |   x$aux <- seq_len(nrow(x))
 55 |   x <- cv.shuffle(x)
 56 |   f <- cv.folds(x, nfolds)
 57 | 
 58 |   cv.res <- list()
 59 |   for (i in seq_len(nfolds)) {
 60 |     ts.id <- which(f == i)
 61 |     test  <- x[ts.id, ]
 62 | 
 63 |     depRows <- unique(unlist(lapply(test$aux, function(z) (z-K-1):(z+K-1L))))
 64 |     depRows <- depRows[depRows > 0]
 65 | 
 66 |     train <- x[-c(ts.id, depRows), ]
 67 |     if (nrow(train) < 1) {
 68 |       cv.res[[i]] <- NA#rep(NA_real_, times = 3)
 69 |     } else {
 70 |       test$aux <- NULL
 71 |       train$aux <- NULL
 72 | 
 73 |       cv.res[[i]] <- FUN(train, test, ...)
 74 |     }
 75 |   }
 76 |   
 77 |   if(average_results) {
 78 |     cv.res <- mean(unlist(cv.res), na.rm = TRUE)
 79 |   }
 80 |   
 81 |   cv.res
 82 | }
 83 | 
 84 | #' hv - block Cross Validation
 85 | #' Cross without reshuffling rows and removing dependent rows.
 86 | #' Since there is no reshuffling, the dependent rows are just
 87 | #' the \code{K}-adjacent ones in the adjacent folds. \code{K} denotes
 88 | #' the embedding dimension.
 89 | #'
 90 | #' @inheritParams kf_xval
 91 | #'
 92 | #' @export
 93 | hv.block_xval <- function(x, nfolds, FUN, average_results, ...) {
 94 |   K <- ncol(x)
 95 |   f <- cv.folds(x, nfolds)
 96 | 
 97 |   cv.res <- list()
 98 |   seq. <- seq_len(nfolds)
 99 |   lseq <- seq.[length(seq.)]
100 |   for (i in seq.) {
101 |     ts.id <- which(f == i)
102 |     # upper cut
103 |     kcuts <- integer(0L)
104 |     if (i + 1L <= lseq) {
105 |       upper.fold <- which(f == i + 1L)
106 |       upper.cut <- upper.fold[1:K]
107 |       if (any(is.na(upper.cut))) {
108 |         upper.cut <- upper.cut[!is.na(upper.cut)]
109 |       }
110 |       
111 |       kcuts <- c(kcuts, upper.cut)
112 |     }
113 |     # lower cut
114 |     if (i - 1L >= 1L) {
115 |       lower.fold <- which(f == i - 1L)
116 |       len.lf <- length(lower.fold)
117 |       idx <- (len.lf - K + 1L):len.lf
118 |       idx <- idx[idx > 0]
119 |       lower.cut <- lower.fold[idx]
120 |       kcuts <- c(kcuts, lower.cut)
121 |     }
122 | 
123 |     train <- x[-c(ts.id, kcuts), ]
124 |     test  <- x[ts.id, ]
125 | 
126 |     cv.res[[i]] <- FUN(train, test, ...)
127 |   }
128 |   
129 |   if (average_results) {
130 |     cv.res <- mean(unlist(cv.res), na.rm = TRUE)
131 |   }
132 |   
133 |   cv.res
134 | }
135 | 
136 | #' Sequential Block Evaluation
137 | #' Trains in the the up-to-i folds and tests on i+1
138 | #' 
139 | #' Prequential in blocks in a growing window fashion.
140 | #'
141 | #' @inheritParams kf_xval
142 | #'
143 | #' @export
144 | prequential_in_blocks <- function(x, nfolds, FUN, average_results, ...) {
145 |   f <- cv.folds(x, nfolds)
146 | 
147 |   cv.res <- list()
148 |   seq. <- seq_len(nfolds)
149 |   for (i in seq.[-length(seq.)]) {
150 |     tr.id <- which(f %in% seq_len(i))
151 |     ts.id <- which(f == i + 1L)
152 | 
153 |     train <- x[tr.id, ]
154 |     test  <- x[ts.id, ]
155 |     cv.res[[i]] <- FUN(train, test, ...)
156 |   }
157 |   
158 |   if (average_results) {
159 |     cv.res <- mean(unlist(cv.res), na.rm = TRUE) 
160 |   }
161 |   
162 |   cv.res
163 | }
164 | 
165 | sliding_prequential_in_blocks <-
166 |   function(x, nfolds, FUN, average_results, ...) {
167 |     nfolds0<-10
168 |     f <- cv.folds(x, nfolds0)
169 |     
170 |     cv.res <- list()
171 |     seq. <- seq_len(nfolds0)
172 |     for (i in seq.[-length(seq.)]) {
173 |       tr.id <- which(f == i)#which(f %in% seq_len(i))
174 |       ts.id <- which(f == i + 1L)
175 |       
176 |       train <- x[tr.id,]
177 |       test  <- x[ts.id,]
178 |       cv.res[[i]] <- FUN(train, test, ...)
179 |     }
180 |     
181 |     if (average_results) {
182 |       cv.res <- mean(unlist(cv.res), na.rm = TRUE)
183 |     }
184 |     
185 |     cv.res
186 |   }
187 | 
188 | #' Sequential Block Evaluation
189 | #' Trains in the the up-to-i folds and tests on i+2
190 | #' Doesn't test on i+1 to further increase independence
191 | #'
192 | #' @inheritParams kf_xval
193 | #'
194 | #' @export
195 | prequential_in_blocks_gap <- function(x, nfolds, FUN, average_results, ...) {
196 |   f <- cv.folds(x, nfolds)
197 | 
198 |   cv.res <- list()
199 |   seq. <- seq_len(nfolds); len <- length(seq.)
200 | 
201 |   seq. <- seq.[-c(len:(len - 1L))]
202 |   for (i in seq.) {
203 |     tr.id <- which(f %in% seq_len(i))
204 |     ts.id <- which(f == i + 2L)
205 | 
206 |     train <- x[tr.id, ]
207 |     test  <- x[ts.id, ]
208 |     cv.res[[i]] <- FUN(train, test, ...)
209 |   }
210 |   
211 |   if (average_results) {
212 |     cv.res <- mean(unlist(cv.res), na.rm = TRUE)
213 |   }
214 |   
215 |   cv.res
216 | }
217 | 
218 | #' Simple Block Bootstrap
219 | #' Block bootstrap with non-overlapping blocks
220 | #'
221 | #' @param x data
222 | #' @param nblocks no of block to divide \code{x} into
223 | #' @param nreps no of bootstrap repetitions
224 | #' @param FUN function to apply
225 | #' @param ... further params to \code{FUN}
226 | #'
227 | #' @export
228 | bbootstrap <- function(x, nblocks, nreps, FUN, ...) {
229 |   K <- ncol(x)
230 |   res.boot <- list()
231 |   x <- unembed.timeseries(x[-nrow(x), ], x[nrow(x), ])$data
232 |   f <- cv.folds(x = x, nfolds = nblocks)
233 |   for (i in seq_len(nreps)) {
234 |     seq.f <- seq_len(nblocks)
235 |     OOB.id <- sample(seq.f, nblocks, replace = TRUE)
236 |     train <- rbind_(
237 |       lapply(OOB.id, function(z) {
238 |         r.id <- which(z == f)
239 |         x_z <- embed.timeseries(x[r.id], K)
240 |         rownames(x_z) <- NULL
241 | 
242 |         x_z
243 |       })
244 |     )
245 |     ts.id <- setdiff(seq.f, OOB.id)
246 | 
247 |     if (length(ts.id) < 1) {
248 |       res.boot[[i]] <- rep(NA_real_, times = 3)
249 |     } else {
250 |       test <- rbind_(
251 |         lapply(ts.id, function(z) {
252 |           r.id <- which(z == f)
253 |           x_z <- embed.timeseries(x[r.id], K)
254 |           rownames(x_z) <- NULL
255 | 
256 |           x_z
257 |         })
258 |       )
259 |       res.boot[[i]] <- FUN(train, test, ...)
260 |     }
261 |   }
262 |   res.boot
263 | }
264 | 
265 | 
266 | #' Moving Block Bootstrap
267 | #' Block bootstrap with overlapping blocks
268 | #'
269 | #' @inheritParams bbootstrap
270 | #'
271 | #' @export
272 | mbbootstrap <- function(x, nblocks, nreps, FUN, ...) {
273 |   res.boot <- list()
274 |   f <- cv.folds(x, nblocks)
275 | 
276 |   for (i in seq_len(nreps)) {
277 |     OOB.id <- sample(seq_len(nblocks), nblocks, replace = TRUE)
278 |     tr.id <- unlist(lapply(OOB.id, function(z) which(z == f)))
279 |     train <- x[tr.id, ]
280 |     test  <- x[-unique(tr.id), ]
281 | 
282 |     if (nrow(test) < 1) {
283 |       res.boot[[i]] <- rep(NA_real_, times = 3)
284 |     } else {
285 |       res.boot[[i]] <- FUN(train, test, ...)
286 |     }
287 |   }
288 |   res.boot
289 | }
290 | 
291 | #' OOS Rolling Window
292 | #' Keeps same window with retraining procedures
293 | #'
294 | #' @param train train set
295 | #' @param test test set
296 | #' @param FUN function to apply
297 | #' @param ... further params to \code{FUN}
298 | #'
299 | #' @export
300 | PrequentialSliding <-
301 |   function(train, test, FUN, average_results, ...) {
302 |     seq. <- seq_len(nrow(test))
303 |     seq.diff <- seq. - 1L
304 |     
305 |     estresults <-
306 |       lapply(seq.diff, function(y) {
307 |         cut.y <- seq_len(y)
308 |         if (y < 1) {
309 |           train.set <- train
310 |           test.set  <- test
311 |         } else {
312 |           train.set <- rbind.data.frame(train[-cut.y, ], test[cut.y, ])
313 |           test.set  <- test[-cut.y, ]
314 |         }
315 |         FUN(train.set, test.set, ...)
316 |       })
317 |     
318 |     if (average_results) {
319 |       cv.res <- mean(unlist(estresults), na.rm = TRUE)
320 |     }
321 |     
322 |     cv.res
323 |   }
324 | 
325 | #' OOS Rolling Origin Update
326 | #'
327 | #' Landmark style retraining
328 | #'
329 | #' @inheritParams oos.rw.updt
330 | #'
331 | #' @export
332 | PrequentialLandmark <- 
333 |   function(train, test, FUN, average_results, ...) {
334 |   seq. <- seq_len(nrow(test))
335 |   seq.diff <- seq. - 1L
336 | 
337 |   estresults <-
338 |     lapply(seq.diff, function(y) {
339 |       cut.y <- seq_len(y)
340 |       if (y < 1) {
341 |         train.set <- train
342 |         test.set  <- test
343 |       } else {
344 |         train.set <- rbind.data.frame(train, test[cut.y,])
345 |         test.set  <- test[-cut.y,]
346 |       }
347 |       FUN(train.set, test.set, ...)
348 |     })
349 |   
350 |   if (average_results) {
351 |     cv.res <- mean(unlist(estresults), na.rm = TRUE)
352 |   }
353 |   
354 |   cv.res
355 | }
356 | 
357 | 
358 | holdout <- 
359 |   function(x, FUN, ...) {
360 |     xp <- partition(x, .7)
361 |     
362 |     train <- xp$train
363 |     test <- xp$test
364 |     
365 |     FUN(train, test, form)
366 |   }
367 | 
368 | repeated_holdout <- 
369 |   function(x, nreps, train_size, test_size, FUN, average_results, ...) {
370 |     n <- nrow(x)
371 |     tr_size <- as.integer(n * train_size)
372 |     ts_size <- as.integer(n * test_size)
373 |     selection_range <- (tr_size + 1):(n - ts_size + 1)
374 |     origins <- sample(selection_range, nreps)
375 |     
376 |     mcapprox <-
377 |       lapply(origins, function(o) {
378 |         train <- x[(o - tr_size):(o - 1),]
379 |         test <- x[o:(o + ts_size - 1),]
380 |         
381 |         FUN(train = train, test = test, form=form)
382 |       })
383 |     
384 |     if (average_results) {
385 |       mcapprox<-mean(unlist(mcapprox), na.rm = TRUE)
386 |     }
387 |     
388 |     mcapprox
389 |   }
390 | 
391 | 


--------------------------------------------------------------------------------
/src/learning-models.r:
--------------------------------------------------------------------------------
  1 | RF <- 
  2 |   function(form,train,test) {
  3 |     require(ranger)
  4 |     
  5 |     m <- ranger(formula = form, data=train, num.trees = 100)
  6 |     
  7 |     predict(m,test)$predictions
  8 |   }
  9 | 
 10 | RBR <- 
 11 |   function(form,train,test) {
 12 |     require(Cubist)
 13 |     
 14 |     X <- model.matrix(form, train)
 15 |     Y <- get_y(train, form)
 16 |     
 17 |     m <- cubist(X, Y, committees = 10)
 18 |     
 19 |     Xts <- model.matrix(form, test)
 20 |     
 21 |     predict(m,Xts)
 22 |   }
 23 | 
 24 | 
 25 | LASSO <- 
 26 |   function(form,train,test) {
 27 |     require(glmnet)
 28 |     
 29 |     X <- model.matrix(form, train)
 30 |     Y <- get_y(train, form)
 31 |     
 32 |     m.all <- glmnet(X, Y, alpha = 0, family = "gaussian")
 33 |     m <- glmnet(X,
 34 |                 Y,
 35 |                 alpha = 1,
 36 |                 lambda = min(m.all$lambda),
 37 |                 family = "gaussian")
 38 |     
 39 |     Xts <- model.matrix(form, test)
 40 |     
 41 |     unname(predict(m,Xts)[,1])
 42 |   }
 43 | 
 44 | 
 45 | 
 46 | GP <- 
 47 |   function(form,train,test) {
 48 |     require(kernlab)
 49 |     
 50 |     m <- gausspr(form,
 51 |                  train,
 52 |                  type = "regression",
 53 |                  kernel = "rbfdot",
 54 |                  tol = .001)
 55 |     
 56 |     predict(m,test)[,1]
 57 |   }
 58 | 
 59 | CART <- 
 60 |   function(form,train,test) {
 61 |     require(DMwR)
 62 |     
 63 |     m <- rpartXse(form,train)
 64 |     
 65 |     unname(predict(m,test))
 66 |   }
 67 | 
 68 | CART_loss <- 
 69 |   function(train, test, form, avg=TRUE) {
 70 |     y <- get_y(test, form)
 71 |     y_tr <- get_y(train, form)
 72 |     y_hat <- tryCatch(CART(form, train, test),
 73 |              error=function(e) {
 74 |                rep(mean(y_tr), times=length(y))
 75 |              })
 76 |     
 77 |     mase_cal(y_tr, y, y_hat, avg)
 78 |   }
 79 | 
 80 | 
 81 | 
 82 | GP_loss <- 
 83 |   function(train, test, form, avg=TRUE) {
 84 |     y <- get_y(test, form)
 85 |     y_tr <- get_y(train, form)
 86 |     y_hat <- GP(form, train, test)
 87 |     
 88 |     mase_cal(y_tr, y, y_hat, avg)
 89 |   }
 90 | 
 91 | 
 92 | 
 93 | LASSO_loss <- 
 94 |   function(train, test, form, avg=TRUE) {
 95 |     y <- get_y(test, form)
 96 |     y_tr <- get_y(train, form)
 97 |     y_hat <- 
 98 |       tryCatch(LASSO(form, train, test),
 99 |                error=function(e) {
100 |                  rep(mean(y_tr), times=length(y))
101 |                })
102 |     
103 |     mase_cal(y_tr, y, y_hat, avg)
104 |   }
105 | 
106 | RF_loss <- 
107 |   function(train, test, form, avg=TRUE) {
108 |     y <- get_y(test, form)
109 |     y_tr <- get_y(train, form)
110 |     y_hat <- RF(form, train, test)
111 |     
112 |     #rmse(y, y_hat)
113 |     mase_cal(y_tr, y, y_hat, avg)
114 |   }
115 | 
116 | RBR_loss <- 
117 |   function(train, test, form, avg=TRUE) {
118 |     y <- get_y(test, form)
119 |     y_tr <- get_y(train, form)
120 |     y_hat <- RBR(form, train, test)
121 |     
122 |     mase_cal(y_tr, y, y_hat, avg)
123 |   }
124 | 
125 | MLP <-
126 |   function(form,train,test) {
127 |     require(nnet)
128 |     m <- nnet(form, train, size=50, linout=T, MaxNWts=2^20, maxit=150)
129 | 
130 |     unname(predict(m,test))[,1]
131 |   }
132 | 
133 | MLP_loss <-
134 |   function(train, test, form, avg=TRUE) {
135 |     y <- get_y(test, form)
136 |     y_tr <- get_y(train, form)
137 |     y_hat <- MLP(form, train, test)
138 | 
139 |     mase_cal(y_tr, y, y_hat, avg)
140 |   }
141 | 
142 | 


--------------------------------------------------------------------------------
/src/metrics.r:
--------------------------------------------------------------------------------
  1 | #' Predictive Accuracy Error
  2 | #'
  3 | #' @param E_hat estimated error
  4 | #' @param E true error
  5 | #'
  6 | #' @export
  7 | pae <- function(E_hat, E) E_hat - E
  8 | 
  9 | #' Mean Predictive Accuracy Error
 10 | #'
 11 | #' @inheritParams pae
 12 | #'
 13 | #' @export
 14 | mpae <- function(E_hat, E) mean(pae(E_hat, E))
 15 | 
 16 | #' Mean Absolute Predictive Accuracy Error
 17 | #'
 18 | #' @inheritParams pae
 19 | #'
 20 | #' @export
 21 | mapae <- function(E_hat, E) mean(abs(pae(E_hat, E)))
 22 | 
 23 | #' rmse
 24 | #'
 25 | #' Utility function to compute Root Mean Squared Error (RMSE)
 26 | #'
 27 | #' @inheritParams se
 28 | #'
 29 | #' @export
 30 | rmse <- function(y, y_hat) sqrt(mse(y, y_hat))
 31 | 
 32 | #' ae
 33 | #'
 34 | #' Element-wise computation of the absolute error loss function.
 35 | #'
 36 | #' @inheritParams se
 37 | #'
 38 | #' @export
 39 | ae <- function(y, y_hat) {
 40 |   stopifnot(length(y) == length(y_hat),
 41 |             is.numeric(y),
 42 |             is.numeric(y_hat))
 43 | 
 44 |   abs(y - y_hat)
 45 | }
 46 | 
 47 | #' se
 48 | #'
 49 | #' Utility function to compute pointwise squared error (SE)
 50 | #'
 51 | #' @param y A numeric vector representing the actual values.
 52 | #' @param y_hat A numeric vector representing the forecasted values.
 53 | #'
 54 | #' @return squared error of forecasted values.
 55 | #'
 56 | #' @export
 57 | se <- function(y, y_hat) {
 58 |   stopifnot(length(y) == length(y_hat),
 59 |             is.numeric(y),
 60 |             is.numeric(y_hat))
 61 | 
 62 |   (y - y_hat) ^ 2
 63 | }
 64 | 
 65 | #' mse
 66 | #'
 67 | #' Utility function to compute mean squared error (MSE)
 68 | #'
 69 | #' @inheritParams se
 70 | #'
 71 | #' @export
 72 | mse <- function(y, y_hat) mean(se(y, y_hat), na.rm = TRUE)
 73 | 
 74 | #' mae
 75 | #'
 76 | #' Mean Absolute Error loss function.
 77 | #'
 78 | #' @inheritParams se
 79 | #'
 80 | #' @export
 81 | mae <- function(y, y_hat) mean(ae(y, y_hat), na.rm = TRUE)
 82 | 
 83 | 
 84 | 
 85 | mase_cal <- function(insample, outsample, forecasts, avg=TRUE){
 86 |   frq <- 1#frequency(insample)
 87 |   forecastsNaiveSD <- rep(NA,frq)
 88 |   for (j in (frq+1):length(insample)){
 89 |     forecastsNaiveSD <- c(forecastsNaiveSD, insample[j-frq])
 90 |   }
 91 |   masep<-mean(abs(insample-forecastsNaiveSD),na.rm = TRUE)
 92 |   
 93 |   outsample <- as.numeric(outsample) ; forecasts <- as.numeric(forecasts)
 94 |   
 95 |   if (avg) {
 96 |     mase <- mean((abs(outsample-forecasts)))/masep
 97 |   } else {
 98 |     mase <- (abs(outsample-forecasts))/masep
 99 |   }
100 |   
101 |   return(mase)
102 | }
103 | 


--------------------------------------------------------------------------------
/src/plots.r:
--------------------------------------------------------------------------------
  1 | bp_dist <-
  2 |   function(ranks) {
  3 |     if (!is.data.frame(ranks)) {
  4 |       ranks <- as.data.frame(ranks)
  5 |     }
  6 |     
  7 |     avg.rank <- apply(ranks,2,mean,na.rm=TRUE)
  8 |     nms.sort <- names(sort(avg.rank))
  9 |     
 10 |     ranks <- ranks[,nms.sort]
 11 |     ranks <- as.data.frame(ranks)
 12 |     
 13 |     x <- melt(ranks)
 14 |     
 15 |     p <- ggplot(x, aes(variable, value))
 16 |     
 17 |     p <- p  +
 18 |       geom_boxplot() +
 19 |       theme_minimal() +
 20 |       labs(x="",
 21 |            y="Rank") +
 22 |       theme(axis.text.x  = element_text(size=10,
 23 |                                         angle=45)) +
 24 |       theme(axis.text.y  = element_text(size = 10),
 25 |             axis.title.y = element_text(size = 10))
 26 |     
 27 |     p
 28 |   }
 29 | 
 30 | proportion_plot <-
 31 |   function(ds) {
 32 |     require(reshape2)
 33 |     require(ggplot2)
 34 |     ds <- as.data.frame(ds)
 35 |     ds[ ] <- lapply(ds, unlist)
 36 |     ds$method <- rownames(ds)
 37 |     dsm <- melt(ds)
 38 |     colnames(dsm) <- c("method", "Result","val")
 39 |     dsm$Result <- gsub("left|probLeft|winLeft","Rep-Holdout loses",dsm$Result)
 40 |     dsm$Result <- gsub("rope|probRope|winRope","draw",dsm$Result)
 41 |     dsm$Result <- gsub("right|probRight|winRight","Rep-Holdout wins",dsm$Result)
 42 |     dsm$Result <-
 43 |       factor(dsm$Result, levels = c("Rep-Holdout wins",
 44 |                                        "draw",
 45 |                                        "Rep-Holdout loses"))
 46 |     
 47 |     ggplot(dsm, aes(x = method,
 48 |                     y = val,
 49 |                     fill = Result)) +
 50 |       geom_col(position = "fill") +
 51 |       ylab("Proportion of probability") +
 52 |       xlab("") +
 53 |       theme_minimal() +
 54 |       theme(axis.text.x  = element_text(angle = 30,
 55 |                                         size = 11,
 56 |                                         hjust = 1),
 57 |             legend.position = "top") +
 58 |       theme(axis.text.y  = element_text(size = 11),
 59 |             axis.title.y = element_text(size = 11))# +
 60 |       #scale_fill_hue(h = c(180, 300))
 61 |     #scale_fill_grey()
 62 |     #scale_fill_manual( values=c("red","green","blue"))
 63 |     #scale_fill_brewer(palette = "Set1")
 64 |   }
 65 | 
 66 | percdiff_plot <- 
 67 |   function(fresults) {
 68 |     require(reshape2)
 69 |     require(ggplot2)
 70 |     
 71 |     log_trans <- function(x) sign(x) * log(abs(x) + 1)
 72 |     
 73 |     r <- melt(fresults)
 74 |     
 75 |     ggplot(r, aes(x=1,y= value)) +
 76 |       facet_wrap( ~ variable,
 77 |                   nrow = 1,
 78 |                   scales="fixed") +
 79 |       geom_boxplot() +
 80 |       geom_hline(yintercept = 0,col="red") +
 81 |       theme_minimal() +
 82 |       labs(x="",
 83 |            y="Percentual difference to true loss") +
 84 |       theme(axis.text.x  = element_blank()) +
 85 |       theme(axis.text.y  = element_text(size = 14),
 86 |             axis.title.y = element_text(size = 14))
 87 |   }
 88 | 
 89 | 
 90 | percdiff_plot_log <- 
 91 |   function(fresults) {
 92 |     require(reshape2)
 93 |     require(ggplot2)
 94 |     
 95 |     log_trans <- function(x) sign(x) * log(abs(x) + 1)
 96 |     
 97 |     r <- melt(fresults)
 98 |     
 99 |     ggplot(r, aes(x=1,y= log_trans(value))) +
100 |       facet_wrap( ~ variable,
101 |                   nrow = 1,
102 |                   scales="fixed") +
103 |       geom_boxplot() +
104 |       geom_hline(yintercept = 0,col="red") +
105 |       theme_minimal() +
106 |       labs(x="",
107 |            y="Percentual difference to true loss") +
108 |       theme(axis.text.x  = element_blank()) +
109 |       theme(axis.text.y  = element_text(size = 14),
110 |             axis.title.y = element_text(size = 14))
111 |   }
112 | 
113 | 
114 | avg_rank_plot <- 
115 |   function(avg, sdev) {
116 |     require(reshape2)
117 |     require(ggplot2)
118 |     
119 |     ord0 <- order(avg)
120 |     ord <- names(sort(avg))
121 |     
122 |     methods <- names(avg)
123 |     
124 |     ds <- data.frame(avg=avg,sdev=sdev, methods=methods, row.names = NULL)
125 |     ds$methods <- factor(ds$methods, levels = ord)
126 |     
127 |     ##
128 |     meths <- as.character(ds$methods)
129 |     ids <- grep("^CV", meths)
130 |     cols <- rep("#33CCCC",times=length(meths))
131 |     cols[ids] <- "darkorange3"
132 |     cols <- cols[ord0]
133 |     ##
134 |     
135 |     #ds <- melt(ds)
136 |     
137 |     ggplot(data = ds,
138 |            aes(x = methods,
139 |                y = avg)) +
140 |       geom_bar(stat="identity",
141 |                fill=cols) +
142 |       theme_minimal() +
143 |       theme(axis.text.x  = element_text(angle = 35,
144 |                                         size = 14,
145 |                                         hjust = 1)) +
146 |       theme(axis.text.y  = element_text(size = 12),
147 |             axis.title.y = element_text(size = 12)) +
148 |       #geom_hline(yintercept = 1) +
149 |       geom_errorbar(aes(ymin = avg - sdev,
150 |                         ymax = avg + sdev),
151 |                     width = .5,
152 |                     position = position_dodge(.9)) +
153 |       labs(x="",
154 |            y="Avg. rank & Std dev.",
155 |            title = "")
156 |   }


--------------------------------------------------------------------------------
/src/utils.r:
--------------------------------------------------------------------------------
  1 | #' Randomly shuffle the data
  2 | #'
  3 | #' @param x data
  4 | #'
  5 | #' @export
  6 | cv.shuffle <- function(x) x[sample(NROW(x)), ]
  7 | 
  8 | resample_timeseries <-
  9 |   function (x,
 10 |             resamples,
 11 |             size_estimation,
 12 |             size_validation) {
 13 |     n <- nrow(x)
 14 |     tr_size <- as.integer(n * size_estimation)
 15 |     ts_size <- as.integer(n * size_validation)
 16 |     selection_range <- (tr_size + 1):(n - ts_size + 1)
 17 |     origins <- sample(selection_range, resamples)
 18 |     
 19 |     lapply(origins, function(o) {
 20 |       list(train = x[(o - tr_size):(o - 1), ], test = x[o:(o + ts_size - 1), ])
 21 |     })
 22 |   }
 23 | 
 24 | #' Create cross validation folds
 25 | #'
 26 | #' @param x data
 27 | #' @param nfolds no of folds
 28 | #'
 29 | #' @export
 30 | cv.folds <- function(x, nfolds) {
 31 |   cut(seq_len(NROW(x)), breaks = nfolds, labels = FALSE)
 32 | }
 33 | 
 34 | #' train test partitioning holdout style
 35 | #' 
 36 | #' @param x data
 37 | #' @param hat.ratio estimation ratio
 38 | #' 
 39 | #' @export
 40 | partition <- function(x, hat.ratio) {
 41 |   len <- NROW(x)
 42 | 
 43 |   if (class(x)[1] == "data.frame") {
 44 |     train <- x[ seq_len(hat.ratio * len),]
 45 |     test <-  x[-seq_len(hat.ratio * len),]
 46 |   } else {
 47 |     train <- x[ seq_len(hat.ratio * len)]
 48 |     test <-  x[-seq_len(hat.ratio * len)]
 49 |   }
 50 |   list(train = train, test = test)
 51 | }
 52 | 
 53 | 
 54 | rbind_l <- function(x) do.call(rbind, x)
 55 | 
 56 | 
 57 | get_y <- function(test, form) model.response(model.frame(form, test, na.action = NULL))
 58 | 
 59 | 
 60 | roll_mean_matrix <-
 61 |   function(x, lambda) {
 62 |     require(RcppRoll)
 63 |     if (class(x) != "data.frame")
 64 |       stop("x obj class in roll_mean_matrix must be a data.frame")
 65 |     dim1 <- NROW(x)
 66 |     
 67 |     MASE <-
 68 |       lapply(x,
 69 |              function(z) {
 70 |                rollm <- RcppRoll::roll_mean(z, n = lambda)
 71 |                blanks <- dim1 - length(rollm)
 72 |                aux <- numeric(blanks)
 73 |                for (y in seq_along(aux)) {
 74 |                  aux[y] <- RcppRoll::roll_mean(z, n = y)[1]
 75 |                }
 76 |                c(aux, rollm)
 77 |              })
 78 |     as.data.frame(MASE)
 79 |   }
 80 | 
 81 | 
 82 | as_positive <- 
 83 |   function(y) {
 84 |     y - min(y) + 1
 85 |   }
 86 | 
 87 | 
 88 | estimate_k <-
 89 |   function(x, m.max=20,tol=.15) {
 90 |     require(tseriesChaos)
 91 |     
 92 |     fn.out <- false.nearest(x, m.max, d=1, t=1)
 93 |     fn.out <- round(fn.out,4)
 94 |     fn.out[is.na(fn.out)] <- 0
 95 |     #plot(fn.out)
 96 |     
 97 |     fnp.tol <- fn.out["fraction",] > tol
 98 |     fnp.tol.sum <- sum(fnp.tol)
 99 |     
100 |     m <- ifelse(fnp.tol.sum < m.max,fnp.tol.sum + 1, m.max)
101 |     
102 |     m
103 |   }
104 | 
105 | 


--------------------------------------------------------------------------------
/src/workflows.r:
--------------------------------------------------------------------------------
  1 | # workflow_test_size <- 
  2 | #   function(ds,
  3 | #            form,
  4 | #            predictive_algorithm = "rf",
  5 | #            nfolds,
  6 | #            outer_split) {
  7 | #     
  8 | #     khat <- estimate_k(ds[1:(length(ds) * outer_split)], m.max = 30, tol = .01)
  9 | #     if (khat < 8) khat <- 8
 10 | #     
 11 | #     x <- embed_timeseries(as.numeric(ds), khat+1)
 12 | #     
 13 | #     pred_model <-
 14 | #       switch(predictive_algorithm,
 15 | #              "rf" = RF_loss,
 16 | #              "rbr" = RBR_loss,
 17 | #              "gp" = GP_loss,
 18 | #              "lasso" = LASSO_loss,
 19 | #              RF_loss)
 20 | #     
 21 | #     ## 70/30
 22 | #     xp <- partition(x, outer_split)
 23 | #     
 24 | #     train <- xp$train
 25 | #     test <- xp$test
 26 | #     
 27 | #     e7030_loss <-
 28 | #       pred_model(train = train,
 29 | #                  test = test,
 30 | #                  form = form, avg=FALSE)
 31 | #     
 32 | #     batch_70_30 <- mean(e7030_loss)
 33 | #     
 34 | #     test_0.05_upd <- round(nrow(test) * 0.05,0)
 35 | #     test_0.1_upd <- round(nrow(test) * 0.1,0)
 36 | #     test_0.25_upd <- round(nrow(test) * 0.25,0)
 37 | #     test_onl_upd <- 1
 38 | #     
 39 | #     upd_schemes <- c(test_onl_upd,
 40 | #                      test_0.05_upd,
 41 | #                      test_0.1_upd,
 42 | #                      test_0.25_upd)
 43 | #     
 44 | #     results_upds <- c()
 45 | #     for (updsch in upd_schemes) {
 46 | #       cat(".")
 47 | #       #updsch <- upd_schemes[2]
 48 | #       
 49 | #       s_vec <- split(1:nrow(test), ceiling(seq_along(1:nrow(test)) / updsch))
 50 | #       
 51 | #       err_roll <- c()
 52 | #       train0 <- train
 53 | #       for (i in 1:length(s_vec)) {
 54 | #         
 55 | #         if (i == 1) {
 56 | #           err_i <-
 57 | #             pred_model(train = train,
 58 | #                        test = test[s_vec[[i]],],
 59 | #                        form = form, avg=FALSE)
 60 | #           
 61 | #           err_roll <- c(err_roll, err_i)
 62 | #         } else {
 63 | #           idx <- unlist(s_vec[seq_len(i-1)], use.names = FALSE)
 64 | #           traini <- rbind.data.frame(train, test[idx,])
 65 | #           
 66 | #           err_i <-
 67 | #             pred_model(train = traini,
 68 | #                        test = test[s_vec[[i]],],
 69 | #                        form = form, avg=FALSE)
 70 | #           
 71 | #           err_roll <- c(err_roll, err_i)
 72 | #         }
 73 | #         
 74 | #       }
 75 | #       
 76 | #       results_upds <- c(results_upds, mean(err_roll))
 77 | #     }
 78 | #     
 79 | #     results_upds <- c(results_upds, batch_70_30)
 80 | #     names(results_upds) <- c("online","x5perc","x10perc","x25perc","batch")
 81 | #     
 82 | #     results_upds
 83 | #   }
 84 | 
 85 | 
 86 | wf_sample_size <- 
 87 |   function(ds,
 88 |            form,
 89 |            predictive_algorithm = "lasso",
 90 |            nfolds,
 91 |            outer_split) {
 92 |     
 93 |     pred_model <-
 94 |       switch(predictive_algorithm,
 95 |              "rf" = RF_loss,
 96 |              "rbr" = RBR_loss,
 97 |              "mlp" = MLP_loss,
 98 |              "lasso" = LASSO_loss,
 99 |              "cart" = CART_loss,
100 |              "gp" = GP_loss,
101 |              RF_loss)
102 |     
103 |     library(forecast)
104 |     nd <- ndiffs(ds)
105 |     if (nd > 0) {
106 |       ds <- diff(ds, differences=nd)
107 |     }
108 |     
109 |     khat <- estimate_k(ds[1:(length(ds) * outer_split)], m.max = 30, tol = .01)
110 |     if (khat < 8) khat <- 8
111 |     
112 |     x <- embed_timeseries(as.numeric(ds), khat+1)
113 |     x <- head(x, 1000)
114 |     
115 |     iters <- seq(from = 100, to = 1000, by = 100)
116 |     
117 |     results <- vector("list", length(iters)-1)
118 |     for (i in seq_along(iters)[-length(iters)]) {
119 |       it <- iters[i]
120 |       
121 |       train <- x[1:it,]
122 |       test <- x[(it+1):(iters[i+1]),]
123 |       
124 |       #print(nrow(train))
125 |       #print(nrow(test))
126 |       #cat("\n\n")
127 |       true_loss <-
128 |         pred_model(train = train,
129 |                    test = test,
130 |                    form = form)
131 |       
132 |       estimated_loss <- 
133 |         performance_estimation(
134 |           train = train,
135 |           form = form,
136 |           pred_model = pred_model,
137 |           nfolds = nfolds
138 |         )
139 |       
140 |       estimated_loss_on <-
141 |         online_methods_pe(train = train,
142 |                           form = form,
143 |                           pred_model = pred_model)
144 | 
145 |       estimated_loss <-
146 |         c(estimated_loss,
147 |           estimated_loss_on)
148 |       
149 |       err_estimation <- sapply(estimated_loss,
150 |                                function(u) {
151 |                                  ((u - true_loss) / true_loss) * 100
152 |                                })
153 |       
154 |       results[[i]] <- err_estimation
155 |     }
156 |     
157 |     results
158 |   }
159 | 
160 | 
161 | workflow <- 
162 |   function(ds,
163 |            form,
164 |            predictive_algorithm = "rf",
165 |            nfolds,
166 |            outer_split, is_embedded=FALSE) {
167 |     
168 |     if(!is_embedded){
169 |       if (!(predictive_algorithm == "rbr")) {
170 |         library(forecast)
171 |         nd <- ndiffs(ds)
172 |         if (nd > 0) {
173 |           ds <- diff(ds, differences=nd)
174 |         }
175 |       }
176 |       # 
177 |       khat <- estimate_k(ds[1:(length(ds) * outer_split)], m.max = 30, tol = .01)
178 |       if (khat < 8) khat <- 8
179 |       
180 |       x <- embed_timeseries(as.numeric(ds), khat+1)
181 |     } else {
182 |       x<-ds
183 |     }
184 |     #head(x)
185 |     
186 |     xp <- partition(x, outer_split)
187 |     
188 |     train <- xp$train
189 |     test <- xp$test
190 |     
191 |     pred_model <-
192 |       switch(predictive_algorithm,
193 |              "rf" = RF_loss,
194 |              "rbr" = RBR_loss,
195 |              "mlp" = MLP_loss,
196 |              "lasso" = LASSO_loss,
197 |              "cart" = CART_loss,
198 |              "gp" = GP_loss,
199 |              RF_loss)
200 |     
201 |     true_loss <-
202 |       pred_model(train = train,
203 |                  test = test,
204 |                  form = form)
205 |     
206 |     estimated_loss <- 
207 |       performance_estimation(
208 |         train = train,
209 |         form = form,
210 |         pred_model = pred_model,
211 |         nfolds = nfolds
212 |       )
213 |     
214 |     estimated_loss_on <- 
215 |       online_methods_pe(train = train,
216 |                         form = form,
217 |                         pred_model = pred_model)
218 |     
219 |     estimated_loss <- 
220 |       c(estimated_loss,
221 |         estimated_loss_on)
222 |     
223 |     err_estimation <- sapply(estimated_loss,
224 |                              function(u) {
225 |                                ((u - true_loss) / true_loss) * 100
226 |                              })
227 |     
228 |     list(err_estimation=err_estimation, 
229 |          est_err = estimated_loss,
230 |          err=true_loss)
231 |   }
232 | 
233 | 
234 | performance_estimation <- 
235 |   function(train, form, pred_model, nfolds) {
236 |     cat("Estimating loss using ...\n")
237 |     cat("... std. x val ...\n")
238 |     std_x_val <-
239 |       kf_xval(
240 |         x = train,
241 |         nfolds = nfolds,
242 |         FUN = pred_model,
243 |         shuffle.rows = TRUE,
244 |         form = form, 
245 |         average_results = TRUE)
246 |     
247 |     cat("... blocked x val ...\n")
248 |     blocked_x_val <- 
249 |       blocked_kf_xval(
250 |         x = train,
251 |         nfolds = nfolds,
252 |         FUN = pred_model,
253 |         form = form)
254 |     
255 |     cat("... modified x val ...\n")
256 |     mod_x_val <- 
257 |       modified_xval(
258 |         x = train,
259 |         nfolds = nfolds,
260 |         FUN = pred_model,
261 |         average_results = TRUE,
262 |         form = form)
263 |     
264 |     cat("... hv blocked x val ...\n")
265 |     hvb_x_val <- 
266 |       hv.block_xval(
267 |         x = train,
268 |         nfolds = nfolds,
269 |         FUN = pred_model,
270 |         average_results = TRUE,
271 |         form = form
272 |       )
273 |     
274 |     cat("... preq blocks ...\n")
275 |     preq_b <-
276 |       prequential_in_blocks(
277 |         x = train,
278 |         nfolds = nfolds,
279 |         FUN = pred_model,
280 |         average_results = TRUE,
281 |         form = form
282 |       )
283 |     
284 |     cat("... sliding preq blocks ...\n")
285 |     sl_preq_b <-
286 |       sliding_prequential_in_blocks(
287 |         x = train,
288 |         nfolds = nfolds,
289 |         FUN = pred_model,
290 |         average_results = TRUE,
291 |         form = form
292 |       )
293 |     
294 |     cat("... preq blocks w gap ...\n")
295 |     preq_b_gap <-
296 |       prequential_in_blocks_gap(
297 |         x = train,
298 |         nfolds = nfolds,
299 |         FUN = pred_model,
300 |         average_results = TRUE,
301 |         form = form
302 |       )
303 |     
304 |     cat("... holdout ...\n")
305 |     hout <-
306 |       holdout(x = train,
307 |               FUN = pred_model,
308 |               form = form)
309 |     
310 |     cat("... repeated holdout ...\n")
311 |     rephout <-
312 |       repeated_holdout(
313 |         x = train,
314 |         nreps = nfolds,
315 |         train_size = .6,
316 |         test_size = .1,
317 |         average_results = TRUE,
318 |         FUN = pred_model,
319 |         form = form
320 |       )
321 |     
322 |     loss_estimations <- 
323 |       c(x_val = std_x_val,
324 |         b_x_val = blocked_x_val,
325 |         m_x_val = mod_x_val,
326 |         hvb_x_val = hvb_x_val,
327 |         preq_b = preq_b,
328 |         sl_preq_b = sl_preq_b,
329 |         preq_b_gap = preq_b_gap,
330 |         hout = hout,
331 |         rephout = rephout)
332 |     
333 |     loss_estimations
334 |   }
335 | 
336 | online_methods_pe <- 
337 |   function(train, form, pred_model) {
338 |     cat("Estimating loss using ...\n")
339 |     
340 |     xp <- partition(train, .8)
341 |     
342 |     train_ <- xp$train
343 |     validation <- xp$test
344 |     
345 |     cat("... preq lw ...\n")
346 |     preq_lw <-
347 |       PrequentialLandmark(
348 |         train = train_,
349 |         test = validation,
350 |         FUN = pred_model,
351 |         average_results = TRUE,
352 |         form=form
353 |       )
354 |     
355 |     cat("... preq sw ...\n")
356 |     preq_sw <-
357 |       PrequentialSliding(
358 |         train = train_,
359 |         test = validation,
360 |         FUN = pred_model,
361 |         average_results = TRUE,
362 |         form=form
363 |       )
364 |     
365 |     loss_estimations <- 
366 |       c(preq_lw = preq_lw,
367 |         preq_sw = preq_sw)
368 |     
369 |     loss_estimations
370 |   }
371 | 


--------------------------------------------------------------------------------
/stationarity.r:
--------------------------------------------------------------------------------
 1 | load("data/tsl_uni_90_mix.rdata")
 2 | 
 3 | len <- sapply(ts_list, length)
 4 | 
 5 | save(len, file = "series_length.rdata")
 6 | 
 7 | stationarity_test_df <- 
 8 |   function(x) {
 9 |     cat(".")
10 |     library(forecast)
11 |     library(urca)
12 |     
13 |     nd <- ndiffs(x)
14 |     if (nd>0) {
15 |       x <- diff(x,differences=nd)
16 |     }
17 |     
18 |     x <- as.vector(x)
19 |     
20 |     xtest <- ur.df(x,type="none",selectlags="AIC",lags=10)
21 |     
22 |     tr <- unname(xtest@teststat < min(xtest@cval))[1,1]
23 |     
24 |     st <- ifelse(tr, F, T)
25 |     
26 |     st
27 |   }
28 | 
29 | stationarity_test_wave <- 
30 |   function(x) {
31 |     library(locits)
32 |     
33 |     nd <- ndiffs(x)
34 |     if (nd>0) {
35 |       x <- diff(x,differences=nd)
36 |     }
37 |     
38 |     xp <- trunc_timeseries(x)
39 |     
40 |     xtest <- hwtos2(as.vector(xp))
41 |     
42 |     nrej <- xtest$nreject
43 |     
44 |     st <- ifelse(nrej > 0, F, T)
45 |     
46 |     st
47 |   }
48 | 
49 | 
50 | trunc_timeseries <-
51 |   function(x) {
52 |     max_power <- 4096
53 |     seq_powers <- c(max_power,
54 |                     max_power / 2,
55 |                     max_power / 4, 
56 |                     max_power / 8,
57 |                     max_power / 16)
58 |     
59 |     len <- length(x)
60 |     x <- as.vector(x)
61 |     dim_trc <- seq_powers[which(len - seq_powers + 1 > 0)[1]]
62 |     
63 |     truncd_ts <- x[(len - dim_trc + 1):len]
64 |     
65 |     truncd_ts
66 |   }
67 | 
68 | 
69 | is_st_wave <- sapply(ts_list, stationarity_test_wave)
70 | is_st_df <- sapply(ts_list, stationarity_test_df)
71 | is_stationary_2ensemble <- is_st_wave & is_st_df
72 | 
73 | table(is_st_wave)
74 | table(is_st_df)
75 | table(is_stationary_2ensemble)
76 | 
77 | save(is_st_wave,
78 |      is_st_df,
79 |      is_stationary_2ensemble, 
80 |      file = "stationarity_tsdl.rdata")
81 | 
82 | 


--------------------------------------------------------------------------------
/stationarity_tsdl.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/stationarity_tsdl.rdata


--------------------------------------------------------------------------------
/synthetic/creating-synthetic-ds.r:
--------------------------------------------------------------------------------
 1 | source("synthetic/simulate-ts.r")
 2 | source("src/utils.r")
 3 | 
 4 | library(forecast)
 5 | library(tsensembler)
 6 | 
 7 | mcreps <- 1000; 
 8 | seq. <- seq_len(mcreps)
 9 | ts.len <- 200
10 | 
11 | TS1 <- lapply(seq., function(j) {
12 |   as_positive(
13 |     as.vector(
14 |       simulateLinearTS(ts.len,
15 |                        ar = TRUE,
16 |                        ma = FALSE,
17 |                        lags = 3,
18 |                        maxRoot = 5,
19 |                        n.start = 300)[["ts"]]
20 |     )
21 |   )
22 | })
23 | 
24 | TS2 <- lapply(seq., function(j) {
25 |   as_positive(
26 |     as.vector(
27 |       simulateLinearTS(ts.len,
28 |                        ar = FALSE,
29 |                        ma = TRUE,
30 |                        lags = 1,
31 |                        maxRoot = 5,
32 |                        n.start = 100)[["ts"]]
33 |     )
34 |   )
35 | })
36 | 
37 | data(USAccDeaths)
38 | Y <- as.vector(USAccDeaths)
39 | arima.fit <- Arima(Y, order=c(12,0,0), seasonal = c(1, 0, 0))
40 | 
41 | TS3 <- lapply(seq., function(j) {
42 |   as.vector(
43 |     simulate(object = arima.fit,
44 |              nsim = ts.len)
45 |   )
46 | })
47 | 
48 | TS1 <- lapply(TS1, embed_timeseries, embedding.dimension = 5)
49 | TS2 <- lapply(TS2, embed_timeseries, embedding.dimension = 5)
50 | TS3 <- lapply(TS3, embed_timeseries, embedding.dimension = 5)
51 | 
52 | synthetic <- list(TS1=TS1, TS2=TS2,TS3=TS3)
53 | 
54 | 
55 | plot_df(synthetic$TS1[[180]]$target)
56 | plot_df(synthetic$TS2[[110]]$target)
57 | plot_df(synthetic$TS3[[2]]$target)
58 | 
59 | plot_df <- 
60 |   function(x) {
61 |     df <- as.data.frame(x)
62 |     df$Time <- 1:nrow(df)
63 |     
64 |     ggplot(data=df, aes(x=Time, y=x)) +
65 |       geom_line() +
66 |       theme_minimal() +
67 |       theme(axis.title.y = element_blank(),
68 |             axis.text = element_text(size=12))
69 |     
70 |   }
71 | 
72 | 
73 | 
74 | save(synthetic, file = "data/synthetic.rdata")
75 | 


--------------------------------------------------------------------------------
/synthetic/fresults-analysis-synth.r:
--------------------------------------------------------------------------------
 1 | load("/Users/vcerqueira/Desktop/Results/final_results_synthetic_ts1_preq_rbr.rdata")
 2 | final_results2<-final_results
 3 | load("final_results_synthetic_ts1_rbr.rdata")
 4 | 
 5 | 
 6 | source("src/correlated-t-test.r")
 7 | source("src/plots.r")
 8 | 
 9 | err_estimation <- 
10 |   lapply(final_results,
11 |          function(x) {
12 |            x$err_estimation
13 |          })
14 | 
15 | err_estimation2 <- 
16 |   lapply(final_results2,
17 |          function(x) {
18 |            x$err_estimation
19 |          })
20 | 
21 | for (iter in seq_along(err_estimation)) {
22 |   err_estimation[[iter]] <- 
23 |     c(err_estimation[[iter]], err_estimation2[[iter]])
24 | }
25 | 
26 | nms <- names(final_results[[1]][[1]])
27 | names(final_results2[[1]][[1]])
28 | fr <- do.call(rbind.data.frame, err_estimation)
29 | colnames(fr) <- nms
30 | colnames(fr) <- 
31 |   c("CV", "CV-Bl", "CV-Mod","CV-hvBl",
32 |     "Preq-Bls", "Preq-Sld-Bls",
33 |     "Preq-Bls-Gap","Holdout", "Rep-Holdout",
34 |     "Preq-Grow","Preq-Slide")
35 | 
36 | fr_abs <- abs(fr)
37 | 
38 | fr_abs_rank <- apply(fr_abs, 1, rank)
39 | avg <- rowMeans(fr_abs_rank)
40 | sdev <- apply(fr_abs_rank,1, sd)
41 | 
42 | avg_rank_plot <-
43 |   function(avg, sdev) {
44 |     require(reshape2)
45 |     require(ggplot2)
46 |     
47 |     ord <- names(sort(avg))
48 |     
49 |     methods <- names(avg)
50 |     
51 |     ds <-
52 |       data.frame(
53 |         avg = avg,
54 |         sdev = sdev,
55 |         methods = methods,
56 |         row.names = NULL
57 |       )
58 |     ds$methods <- factor(ds$methods, levels = ord)
59 |     
60 |     #ds <- melt(ds)
61 |     
62 |     ggplot(data = ds,
63 |            aes(x = methods,
64 |                y = avg)) +
65 |       geom_bar(stat = "identity",
66 |                fill = "#33CCCC") +
67 |       theme_minimal() +
68 |       theme(axis.text.x  = element_text(
69 |         angle = 45,
70 |         size = 14,
71 |         hjust = 1
72 |       )) +
73 |       theme(axis.text.y  = element_text(size = 12),
74 |             axis.title.y = element_text(size = 12)) +
75 |       #geom_hline(yintercept = 1) +
76 |       geom_errorbar(aes(ymin = avg - sdev,
77 |                         ymax = avg + sdev),
78 |                     width = .5,
79 |                     position = position_dodge(.9)) +
80 |       labs(x = "",
81 |            y = "Avg. rank & Std dev.",
82 |            title = "")
83 |   }
84 | 
85 | avg_rank_plot(avg,sdev)
86 | percdiff_plot_log(fr)
87 | 
88 | colMeans(fr)
89 | apply(fr,2, sd)
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/synthetic/simulate-ts.r:
--------------------------------------------------------------------------------
  1 | # FROM https://github.com/cbergmeir/tsExpKit/blob/master/R/simulateTS.R
  2 | 
  3 | # The coefficients that are generated are chosen in a way that the resulting 
  4 | # processes are stationary (AR) and invertible (MA).
  5 | # 
  6 | # TODO: function also necessary (and present) in tsDyn
  7 | # For detailed explanation see the INS paper.
  8 | # 
  9 | # @title Generate random coefficients for an ARMA model.
 10 | # @param lags the number of lags
 11 | # @param maxRoot the root of the characteristic polynomial is chosen between 1.1 and \code{maxRoot}. 
 12 | # @export
 13 | generateRandomARMAParameters <- function(lags, maxRoot)  {
 14 |   
 15 |   if(maxRoot <= 1.1) stop("maxRoot has to be bigger than 1.1")
 16 |   
 17 |   l <- lags# + 1
 18 |   s <- sign(runif(l,-1,1))
 19 |   polyRoots <- s*runif(l,1.1,maxRoot)
 20 |   
 21 |   #calculate coefficients
 22 |   coeff <- 1
 23 |   for(root in polyRoots) coeff <- c(0,coeff) - c(root*coeff,0)
 24 |   
 25 |   nCoeff <- coeff / coeff[1]
 26 |   params <- - nCoeff[2:length(nCoeff)]  
 27 |   
 28 |   return(params)
 29 | }
 30 | 
 31 | ##old version for parameter generation
 32 | #generateRandomARMAParameters <- function(lags)  {
 33 | #  rVals <- rnorm(lags)
 34 | #  values <- -sort(-abs(rVals))/(max(abs(rVals))+0.05)
 35 | #  factors <- rep(c(1, -1),floor(lags/2))
 36 | #  if(lags %% 2 == 1) factors <- c(factors, 1)
 37 | #  params <- factors * values
 38 | #  return(params)
 39 | #}
 40 | 
 41 | ##old version for non-linear time series simulation
 42 | #simulateNonlinearTS <- function(length, lags=4, seed=1)  {
 43 | #  
 44 | #  set.seed(seed)
 45 | # 
 46 | #  maxlags <- lags + 10 
 47 | #  model <- "nonlin"
 48 | #  
 49 | #  w <- rnorm(length+maxlags, sd = 0.01)
 50 | #  
 51 | #  lts <- simulateLinearTS(length+maxlags, lags=lags, seed=seed)[["ts"]]
 52 | #  
 53 | #  z <- rep(0, length+maxlags)
 54 | #  for(t in (maxlags-5):(length+maxlags)) 
 55 | #    z[t] <- 0.7 * exp (-z[t - 1] / length) + 0.4 * cos (z[t - 2]) + 0.25 * atan(z[t - 3]) + w[t] +lts[t]
 56 | #  
 57 | #  ts <- z[maxlags:(length+maxlags-1)]
 58 | #  
 59 | #  return(list(lags=lags, model=model, seed=seed, ts=ts))
 60 | #  
 61 | #}
 62 | 
 63 | #' This function can be used to generate pure AR, pure MA, and ARMA series. The AR part will be
 64 | #' stationary and the MA part invertible. Therefore, the coefficients are not given directly,
 65 | #' but a value \code{maxRoot} which controls the interval from which the roots for the 
 66 | #' characteristic polynomials are chosen. So, all the roots of the characteristic polynomials 
 67 | #' are real-valued. For a detailed explanation see the referenced literature.
 68 | #' 
 69 | #' @title Generate AR, MA, and ARMA series 
 70 | #' @param length the length of the series to be generated
 71 | #' @param ar if \code{TRUE}, series has an AR part
 72 | #' @param ma if \code{TRUE}, series has an MA part
 73 | #' @param lags the number of lags
 74 | #' @param seed a seed used for random number generation
 75 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 
 76 | #' @param n.start burn-in period. if NA, calculated automatically by arima.sim
 77 | #' @param ... parameters passed to arima.sim
 78 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series
 79 | #' @references
 80 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213.
 81 | #' @export
 82 | simulateLinearTS <- function (length, ar=TRUE, ma=TRUE, lags=4, seed=1, maxRoot=5.0, n.start = NA, ...)  {
 83 |   
 84 |   set.seed(seed)
 85 |   
 86 |   params <- generateRandomARMAParameters(lags, maxRoot)
 87 |   
 88 |   #print(params)
 89 |   
 90 |   if(ar && !ma)  {
 91 |     model <- "AR"    
 92 |     paramsAr <- params
 93 |     paramsMa <- 0     
 94 |     ts <- arima.sim(model=list(ar=params),n=length, n.start = n.start, ...)
 95 |   } else if(!ar && ma) {
 96 |     model <- "MA"    
 97 |     paramsMa <- params
 98 |     paramsAr <- 0
 99 |     ts <- arima.sim(model=list(ma=params),n=length, n.start = n.start, ...)
100 |   } else if(ar && ma)  {
101 |     model <- "ARMA"    
102 |     paramsMa <- params    
103 |     paramsAr <- generateRandomARMAParameters(lags, maxRoot)    
104 |     ts <- arima.sim(model=list(ar=paramsAr, ma=params),n=length, n.start = n.start)
105 |   }
106 |   
107 |   return(list(paramsAr=paramsAr, paramsMa=paramsMa, lags=lags, model=model, seed=seed, maxRoot=maxRoot, ts=ts))
108 | }
109 | 
110 | #' This function can be used to generate nonlinear time series. 
111 | #' It is similar to the function \code{\link{simulateLinearTS}}, but applies nonlinear functions to certain lags.
112 | #' The nonlinear functions currently used are: cos, sin, tanh, atan, and exp(-x/10000).
113 | #' For a detailed explanation see the referenced literature.
114 | #' 
115 | #' @title Generate nonlinear time series 
116 | #' @param length the length of the series to be generated
117 | #' @param lags the number of lags
118 | #' @param seed a seed used for random number generation
119 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 
120 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series
121 | #' @references
122 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213.
123 | #' @export
124 | #' @export
125 | simulateNonlinearTS <- function(length, lags=4, seed=1, maxRoot=5.0)  {
126 |   
127 |   #  length=10000
128 |   #  lags=20
129 |   #  seed=3
130 |   #  maxRoot=5.0
131 |   
132 |   x <- NULL
133 |   
134 |   randNonlinX <- function(type, x)  {
135 |     nonlinX <- switch(type, cos(x), sin(x), tanh(x), atan(x), exp(-x/10000))#, cosh(x), sinh(x))#,1/(x*x), 1/(x*x*x))#x,tan(x)),  
136 |     return(nonlinX)
137 |   }
138 |   
139 |   set.seed(seed)
140 |   
141 |   params <- generateRandomARMAParameters(lags, maxRoot)
142 |   #print(params)
143 |   
144 |   x[1:lags] <- rnorm (lags, sd = 0.5)
145 |   
146 |   type <- ceiling(runif(lags,min=0, max=5))
147 |   
148 |   #print(type)
149 |   
150 |   #type[1:lags] <- 1
151 |   
152 |   for (i in ((lags+1):(length+2*lags))) {
153 |     
154 |     x[i] <- rnorm (1, sd = 0.5) 
155 |     
156 |     for (j in 1:lags)  {
157 |       x[i] <- x[i] + params[j]*randNonlinX(type[j], x[i-j]) 
158 |     }
159 |   }
160 |   
161 |   model <- "nonlin"
162 |   ts <- x[(2*lags+1):(length+2*lags)]
163 |   
164 |   return(list(lags=lags, model=model, seed=seed, ts=ts))
165 |   
166 | }
167 | 
168 | ##example code
169 | #par(mfrow = c(4, 1))
170 | #
171 | #ts <- simulateNonlinearTS(1000, seed=3, lags=15)
172 | #ts
173 | #plot(ts$ts, type="l")
174 | #acf(ts$ts)
175 | #tnnTest(ts$ts)
176 | #
177 | #ts <- simulateNonlinearTS(1000, seed=3, lags=9)
178 | #ts
179 | #plot(ts$ts, type="l")
180 | #acf(ts$ts)
181 | 
182 | 
183 | 
184 | ##-----------------------
185 | #VAR
186 | 
187 | mcevec <- function(lambda, c, n, d) {
188 |   
189 |   v <- vector()
190 |   v[1:n] <- 0
191 |   v[(n-d+1):n] <- c
192 |   
193 |   for(j in (n-d):1) {
194 |     v[j] <- lambda*v[j+d]
195 |   }
196 |   
197 |   v
198 | }
199 | 
200 | #mcgenevec <- function(lambda, c, w, n, d) {
201 | #  
202 | #  v <- vector()
203 | #  v[1:n] <- 0
204 | #  v[(n-d+1):n] <- c
205 | #  
206 | #  for(j in (n-d):1) {
207 | #    v[j] <- lambda*v[j+d] + w[j+d]
208 | #  }
209 | #  
210 | #  v
211 | #}
212 | 
213 | companionMatrixFromEigenvalues <- function(q, lambda, c, d) {
214 |   
215 |   nchain <- length(q)
216 |   n <- sum(q)
217 |   
218 |   #v <- matrix(0, nrow=n, ncol=n)
219 |   
220 |   v <- matrix(0, nrow=n, ncol=n)
221 |   
222 |   i <- 0
223 |   for(k in 1:nchain) {
224 |     v[, i+1] <- mcevec(lambda[k], c[, i+1], n, d)
225 |     
226 |     #    if(q[k] > 1) {
227 |     #      for(j in 2:q[k]) {
228 |     #        mcgenevec(lambda[k], c[, i+1], v[,i+j-1], n, d)
229 |     #      }      
230 |     #    }
231 |     i <- i+q[k]
232 |   }
233 |   
234 |   X <- v
235 |   #X
236 |   
237 |   F <- X%*%diag(lambda) %*% solve(X)
238 |   F
239 | }
240 | 
241 | 
242 | generateRandomStableCompanionMatrix <- function(dim = 2, order = 2) {
243 |   
244 |   lenq <- order*dim
245 |   
246 |   #  c <- matrix(runif(lenq), nrow=order, ncol=lenq)
247 |   #  c[which(c>0.5)] <- 1
248 |   #  c[which(c<=0.5)] <- 0
249 |   
250 |   c <- matrix(1-diag(dim),nrow=dim,ncol=lenq)
251 |   
252 |   lambda <- runif(lenq, min=0.01)
253 |   
254 |   #TODO: generation of complex eigenvalues results in complex values in F. This shouldn't be the case.
255 |   #  ##generate eigenvalues within unit circle. 
256 |   #  
257 |   #  if((lenq%%2) == 1) {
258 |   #    
259 |   #    if(lenq==1 || lenq==3) {
260 |   #      lambda <- runif(lenq, min=0.01)      
261 |   #    } else {
262 |   #      
263 |   #      theta <- runif((lenq%/%2) - 1, min=0, max=2*pi)
264 |   #      r <- runif((lenq%/%2) - 1, min=0.01)
265 |   #      
266 |   #      lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta))
267 |   #      lambda <- c(lambda, Conj(lambda))
268 |   #      
269 |   #      lambda <- c(lambda, runif(3, min=0.01))  
270 |   #      
271 |   #    }
272 |   #  } else {
273 |   #    
274 |   #    theta <- runif(lenq%/%2, min=0, max=2*pi)
275 |   #    r <- runif(lenq%/%2, min=0.01)
276 |   #    
277 |   #    lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta))
278 |   #    lambda <- c(lambda, Conj(lambda))
279 |   #    
280 |   #  }
281 |   
282 |   F <- companionMatrixFromEigenvalues(seq(1,1,length=lenq), lambda, c, dim)
283 |   
284 |   F[which(Mod(F)<1e-10)] <- 0
285 |   
286 |   if(sum(Mod(Im(F)))> 1e-10) {
287 |     warning("generateRandomStableCompanionMatrix: Companion matrix is not real-valued")
288 |     print(F)
289 |   }
290 |   
291 |   F <- Re(F)
292 |   
293 |   list(F=F, lambda=lambda)
294 | }
295 | 
296 | #' This function can be used to simulate data from a random, stable VAR process.
297 | #' 
298 | #' @title Simulate data from stable VAR process 
299 | #' @param dim the dimension of the VAR (bivariate, trivariate,...)
300 | #' @param order the order of the VAR (how many lags)
301 | #' @param sd the standard deviation of the noise to be included
302 | #' @param length the length of the series to be generated
303 | #' @return a matrix containing the simulated data
304 | #' @references
305 | #' G.N. Boshnakov, B.M. Iqelan (2009). Generation of time series models with given spectral properties. Journal of Time Series Analysis 30(3):349-368. 
306 | #' @export
307 | simulateStableVarProcess <- function(dim=3, order=2, sd=1, length=1000) {
308 |   
309 |   compMat <- generateRandomStableCompanionMatrix(dim=dim, order=order)
310 |   
311 |   A <- list()
312 |   for(i in 1:order) {
313 |     
314 |     A[[i]] <- compMat$F[1:dim, ((i-1)*dim+1):(i*dim)]
315 |     
316 |   }
317 |   #A
318 |   
319 |   l <- length+100
320 |   
321 |   repeat {
322 |     
323 |     #------
324 |     #generate random covariance structure and noise using this covariance structure
325 |     cov <- matrix(runif(dim*dim, min=-1), ncol=dim)
326 |     diag(cov) <- 1
327 |     cov <- cov*t(cov)
328 |     
329 |     L <- try(chol(cov))
330 |     if(class(L) != "try-error") break
331 |   }  
332 |   
333 |   L <- chol(cov)
334 |   
335 |   u <- t(L) %*% matrix(rnorm(l*dim, sd=sd), nrow=dim)
336 |   #------
337 |   
338 |   y <- matrix(0, ncol=l, nrow=dim)
339 |   
340 |   y[,1:order] <- rnorm(dim*order, sd=sd)
341 |   
342 |   for(i in (order+1):l) {
343 |     
344 |     y[,i] <- u[,i-order]
345 |     
346 |     for(j in 1:order) {
347 |       y[,i] <- y[,i] + A[[j]]%*%y[,i-j]
348 |     }
349 |     #y[,i] <- A1%*%y[,i-1] + A2%*%y[,i-2] + u[,i-2]
350 |   }
351 |   
352 |   y[,101:l]
353 | }


--------------------------------------------------------------------------------
/synthetic_data_generation/creating-synthetic-ds.r:
--------------------------------------------------------------------------------
 1 | source("simulate-ts.r")
 2 | source("../src/utils.r")
 3 | 
 4 | library(forecast)
 5 | library(tsensembler)
 6 | 
 7 | mcreps <- 1000; 
 8 | seq. <- seq_len(mcreps)
 9 | ts.len <- 200
10 | 
11 | TS1 <- lapply(seq., function(j) {
12 |   as_positive(
13 |     as.vector(
14 |       simulateLinearTS(ts.len,
15 |                        ar = TRUE,
16 |                        ma = FALSE,
17 |                        lags = 3,
18 |                        maxRoot = 5,
19 |                        n.start = 300)[["ts"]]
20 |     )
21 |   )
22 | })
23 | 
24 | TS2 <- lapply(seq., function(j) {
25 |   as_positive(
26 |     as.vector(
27 |       simulateLinearTS(ts.len,
28 |                        ar = FALSE,
29 |                        ma = TRUE,
30 |                        lags = 1,
31 |                        maxRoot = 5,
32 |                        n.start = 100)[["ts"]]
33 |     )
34 |   )
35 | })
36 | 
37 | data(USAccDeaths)
38 | Y <- as.vector(USAccDeaths)
39 | arima.fit <- Arima(Y, order=c(12,0,0), seasonal = c(1, 0, 0))
40 | 
41 | TS3 <- lapply(seq., function(j) {
42 |   as.vector(
43 |     simulate(object = arima.fit,
44 |              nsim = ts.len)
45 |   )
46 | })
47 | 
48 | TS1 <- lapply(TS1, embed_timeseries, embedding.dimension = 5)
49 | TS2 <- lapply(TS2, embed_timeseries, embedding.dimension = 5)
50 | TS3 <- lapply(TS3, embed_timeseries, embedding.dimension = 5)
51 | 
52 | synthetic <- list(TS1=TS1, TS2=TS2,TS3=TS3)
53 | 
54 | 
55 | save(synthetic, file = "data/synthetic.rdata")
56 | 


--------------------------------------------------------------------------------
/synthetic_data_generation/simulate-ts.r:
--------------------------------------------------------------------------------
  1 | # FROM https://github.com/cbergmeir/tsExpKit/blob/master/R/simulateTS.R
  2 | 
  3 | # The coefficients that are generated are chosen in a way that the resulting 
  4 | # processes are stationary (AR) and invertible (MA).
  5 | # 
  6 | # TODO: function also necessary (and present) in tsDyn
  7 | # For detailed explanation see the INS paper.
  8 | # 
  9 | # @title Generate random coefficients for an ARMA model.
 10 | # @param lags the number of lags
 11 | # @param maxRoot the root of the characteristic polynomial is chosen between 1.1 and \code{maxRoot}. 
 12 | # @export
 13 | generateRandomARMAParameters <- function(lags, maxRoot)  {
 14 |   
 15 |   if(maxRoot <= 1.1) stop("maxRoot has to be bigger than 1.1")
 16 |   
 17 |   l <- lags# + 1
 18 |   s <- sign(runif(l,-1,1))
 19 |   polyRoots <- s*runif(l,1.1,maxRoot)
 20 |   
 21 |   #calculate coefficients
 22 |   coeff <- 1
 23 |   for(root in polyRoots) coeff <- c(0,coeff) - c(root*coeff,0)
 24 |   
 25 |   nCoeff <- coeff / coeff[1]
 26 |   params <- - nCoeff[2:length(nCoeff)]  
 27 |   
 28 |   return(params)
 29 | }
 30 | 
 31 | ##old version for parameter generation
 32 | #generateRandomARMAParameters <- function(lags)  {
 33 | #  rVals <- rnorm(lags)
 34 | #  values <- -sort(-abs(rVals))/(max(abs(rVals))+0.05)
 35 | #  factors <- rep(c(1, -1),floor(lags/2))
 36 | #  if(lags %% 2 == 1) factors <- c(factors, 1)
 37 | #  params <- factors * values
 38 | #  return(params)
 39 | #}
 40 | 
 41 | ##old version for non-linear time series simulation
 42 | #simulateNonlinearTS <- function(length, lags=4, seed=1)  {
 43 | #  
 44 | #  set.seed(seed)
 45 | # 
 46 | #  maxlags <- lags + 10 
 47 | #  model <- "nonlin"
 48 | #  
 49 | #  w <- rnorm(length+maxlags, sd = 0.01)
 50 | #  
 51 | #  lts <- simulateLinearTS(length+maxlags, lags=lags, seed=seed)[["ts"]]
 52 | #  
 53 | #  z <- rep(0, length+maxlags)
 54 | #  for(t in (maxlags-5):(length+maxlags)) 
 55 | #    z[t] <- 0.7 * exp (-z[t - 1] / length) + 0.4 * cos (z[t - 2]) + 0.25 * atan(z[t - 3]) + w[t] +lts[t]
 56 | #  
 57 | #  ts <- z[maxlags:(length+maxlags-1)]
 58 | #  
 59 | #  return(list(lags=lags, model=model, seed=seed, ts=ts))
 60 | #  
 61 | #}
 62 | 
 63 | #' This function can be used to generate pure AR, pure MA, and ARMA series. The AR part will be
 64 | #' stationary and the MA part invertible. Therefore, the coefficients are not given directly,
 65 | #' but a value \code{maxRoot} which controls the interval from which the roots for the 
 66 | #' characteristic polynomials are chosen. So, all the roots of the characteristic polynomials 
 67 | #' are real-valued. For a detailed explanation see the referenced literature.
 68 | #' 
 69 | #' @title Generate AR, MA, and ARMA series 
 70 | #' @param length the length of the series to be generated
 71 | #' @param ar if \code{TRUE}, series has an AR part
 72 | #' @param ma if \code{TRUE}, series has an MA part
 73 | #' @param lags the number of lags
 74 | #' @param seed a seed used for random number generation
 75 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 
 76 | #' @param n.start burn-in period. if NA, calculated automatically by arima.sim
 77 | #' @param ... parameters passed to arima.sim
 78 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series
 79 | #' @references
 80 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213.
 81 | #' @export
 82 | simulateLinearTS <- function (length, ar=TRUE, ma=TRUE, lags=4, seed=1, maxRoot=5.0, n.start = NA, ...)  {
 83 |   
 84 |   set.seed(seed)
 85 |   
 86 |   params <- generateRandomARMAParameters(lags, maxRoot)
 87 |   
 88 |   #print(params)
 89 |   
 90 |   if(ar && !ma)  {
 91 |     model <- "AR"    
 92 |     paramsAr <- params
 93 |     paramsMa <- 0     
 94 |     ts <- arima.sim(model=list(ar=params),n=length, n.start = n.start, ...)
 95 |   } else if(!ar && ma) {
 96 |     model <- "MA"    
 97 |     paramsMa <- params
 98 |     paramsAr <- 0
 99 |     ts <- arima.sim(model=list(ma=params),n=length, n.start = n.start, ...)
100 |   } else if(ar && ma)  {
101 |     model <- "ARMA"    
102 |     paramsMa <- params    
103 |     paramsAr <- generateRandomARMAParameters(lags, maxRoot)    
104 |     ts <- arima.sim(model=list(ar=paramsAr, ma=params),n=length, n.start = n.start)
105 |   }
106 |   
107 |   return(list(paramsAr=paramsAr, paramsMa=paramsMa, lags=lags, model=model, seed=seed, maxRoot=maxRoot, ts=ts))
108 | }
109 | 
110 | #' This function can be used to generate nonlinear time series. 
111 | #' It is similar to the function \code{\link{simulateLinearTS}}, but applies nonlinear functions to certain lags.
112 | #' The nonlinear functions currently used are: cos, sin, tanh, atan, and exp(-x/10000).
113 | #' For a detailed explanation see the referenced literature.
114 | #' 
115 | #' @title Generate nonlinear time series 
116 | #' @param length the length of the series to be generated
117 | #' @param lags the number of lags
118 | #' @param seed a seed used for random number generation
119 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 
120 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series
121 | #' @references
122 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213.
123 | #' @export
124 | #' @export
125 | simulateNonlinearTS <- function(length, lags=4, seed=1, maxRoot=5.0)  {
126 |   
127 |   #  length=10000
128 |   #  lags=20
129 |   #  seed=3
130 |   #  maxRoot=5.0
131 |   
132 |   x <- NULL
133 |   
134 |   randNonlinX <- function(type, x)  {
135 |     nonlinX <- switch(type, cos(x), sin(x), tanh(x), atan(x), exp(-x/10000))#, cosh(x), sinh(x))#,1/(x*x), 1/(x*x*x))#x,tan(x)),  
136 |     return(nonlinX)
137 |   }
138 |   
139 |   set.seed(seed)
140 |   
141 |   params <- generateRandomARMAParameters(lags, maxRoot)
142 |   #print(params)
143 |   
144 |   x[1:lags] <- rnorm (lags, sd = 0.5)
145 |   
146 |   type <- ceiling(runif(lags,min=0, max=5))
147 |   
148 |   #print(type)
149 |   
150 |   #type[1:lags] <- 1
151 |   
152 |   for (i in ((lags+1):(length+2*lags))) {
153 |     
154 |     x[i] <- rnorm (1, sd = 0.5) 
155 |     
156 |     for (j in 1:lags)  {
157 |       x[i] <- x[i] + params[j]*randNonlinX(type[j], x[i-j]) 
158 |     }
159 |   }
160 |   
161 |   model <- "nonlin"
162 |   ts <- x[(2*lags+1):(length+2*lags)]
163 |   
164 |   return(list(lags=lags, model=model, seed=seed, ts=ts))
165 |   
166 | }
167 | 
168 | ##example code
169 | #par(mfrow = c(4, 1))
170 | #
171 | #ts <- simulateNonlinearTS(1000, seed=3, lags=15)
172 | #ts
173 | #plot(ts$ts, type="l")
174 | #acf(ts$ts)
175 | #tnnTest(ts$ts)
176 | #
177 | #ts <- simulateNonlinearTS(1000, seed=3, lags=9)
178 | #ts
179 | #plot(ts$ts, type="l")
180 | #acf(ts$ts)
181 | 
182 | 
183 | 
184 | ##-----------------------
185 | #VAR
186 | 
187 | mcevec <- function(lambda, c, n, d) {
188 |   
189 |   v <- vector()
190 |   v[1:n] <- 0
191 |   v[(n-d+1):n] <- c
192 |   
193 |   for(j in (n-d):1) {
194 |     v[j] <- lambda*v[j+d]
195 |   }
196 |   
197 |   v
198 | }
199 | 
200 | #mcgenevec <- function(lambda, c, w, n, d) {
201 | #  
202 | #  v <- vector()
203 | #  v[1:n] <- 0
204 | #  v[(n-d+1):n] <- c
205 | #  
206 | #  for(j in (n-d):1) {
207 | #    v[j] <- lambda*v[j+d] + w[j+d]
208 | #  }
209 | #  
210 | #  v
211 | #}
212 | 
213 | companionMatrixFromEigenvalues <- function(q, lambda, c, d) {
214 |   
215 |   nchain <- length(q)
216 |   n <- sum(q)
217 |   
218 |   #v <- matrix(0, nrow=n, ncol=n)
219 |   
220 |   v <- matrix(0, nrow=n, ncol=n)
221 |   
222 |   i <- 0
223 |   for(k in 1:nchain) {
224 |     v[, i+1] <- mcevec(lambda[k], c[, i+1], n, d)
225 |     
226 |     #    if(q[k] > 1) {
227 |     #      for(j in 2:q[k]) {
228 |     #        mcgenevec(lambda[k], c[, i+1], v[,i+j-1], n, d)
229 |     #      }      
230 |     #    }
231 |     i <- i+q[k]
232 |   }
233 |   
234 |   X <- v
235 |   #X
236 |   
237 |   F <- X%*%diag(lambda) %*% solve(X)
238 |   F
239 | }
240 | 
241 | 
242 | generateRandomStableCompanionMatrix <- function(dim = 2, order = 2) {
243 |   
244 |   lenq <- order*dim
245 |   
246 |   #  c <- matrix(runif(lenq), nrow=order, ncol=lenq)
247 |   #  c[which(c>0.5)] <- 1
248 |   #  c[which(c<=0.5)] <- 0
249 |   
250 |   c <- matrix(1-diag(dim),nrow=dim,ncol=lenq)
251 |   
252 |   lambda <- runif(lenq, min=0.01)
253 |   
254 |   #TODO: generation of complex eigenvalues results in complex values in F. This shouldn't be the case.
255 |   #  ##generate eigenvalues within unit circle. 
256 |   #  
257 |   #  if((lenq%%2) == 1) {
258 |   #    
259 |   #    if(lenq==1 || lenq==3) {
260 |   #      lambda <- runif(lenq, min=0.01)      
261 |   #    } else {
262 |   #      
263 |   #      theta <- runif((lenq%/%2) - 1, min=0, max=2*pi)
264 |   #      r <- runif((lenq%/%2) - 1, min=0.01)
265 |   #      
266 |   #      lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta))
267 |   #      lambda <- c(lambda, Conj(lambda))
268 |   #      
269 |   #      lambda <- c(lambda, runif(3, min=0.01))  
270 |   #      
271 |   #    }
272 |   #  } else {
273 |   #    
274 |   #    theta <- runif(lenq%/%2, min=0, max=2*pi)
275 |   #    r <- runif(lenq%/%2, min=0.01)
276 |   #    
277 |   #    lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta))
278 |   #    lambda <- c(lambda, Conj(lambda))
279 |   #    
280 |   #  }
281 |   
282 |   F <- companionMatrixFromEigenvalues(seq(1,1,length=lenq), lambda, c, dim)
283 |   
284 |   F[which(Mod(F)<1e-10)] <- 0
285 |   
286 |   if(sum(Mod(Im(F)))> 1e-10) {
287 |     warning("generateRandomStableCompanionMatrix: Companion matrix is not real-valued")
288 |     print(F)
289 |   }
290 |   
291 |   F <- Re(F)
292 |   
293 |   list(F=F, lambda=lambda)
294 | }
295 | 
296 | #' This function can be used to simulate data from a random, stable VAR process.
297 | #' 
298 | #' @title Simulate data from stable VAR process 
299 | #' @param dim the dimension of the VAR (bivariate, trivariate,...)
300 | #' @param order the order of the VAR (how many lags)
301 | #' @param sd the standard deviation of the noise to be included
302 | #' @param length the length of the series to be generated
303 | #' @return a matrix containing the simulated data
304 | #' @references
305 | #' G.N. Boshnakov, B.M. Iqelan (2009). Generation of time series models with given spectral properties. Journal of Time Series Analysis 30(3):349-368. 
306 | #' @export
307 | simulateStableVarProcess <- function(dim=3, order=2, sd=1, length=1000) {
308 |   
309 |   compMat <- generateRandomStableCompanionMatrix(dim=dim, order=order)
310 |   
311 |   A <- list()
312 |   for(i in 1:order) {
313 |     
314 |     A[[i]] <- compMat$F[1:dim, ((i-1)*dim+1):(i*dim)]
315 |     
316 |   }
317 |   #A
318 |   
319 |   l <- length+100
320 |   
321 |   repeat {
322 |     
323 |     #------
324 |     #generate random covariance structure and noise using this covariance structure
325 |     cov <- matrix(runif(dim*dim, min=-1), ncol=dim)
326 |     diag(cov) <- 1
327 |     cov <- cov*t(cov)
328 |     
329 |     L <- try(chol(cov))
330 |     if(class(L) != "try-error") break
331 |   }  
332 |   
333 |   L <- chol(cov)
334 |   
335 |   u <- t(L) %*% matrix(rnorm(l*dim, sd=sd), nrow=dim)
336 |   #------
337 |   
338 |   y <- matrix(0, ncol=l, nrow=dim)
339 |   
340 |   y[,1:order] <- rnorm(dim*order, sd=sd)
341 |   
342 |   for(i in (order+1):l) {
343 |     
344 |     y[,i] <- u[,i-order]
345 |     
346 |     for(j in 1:order) {
347 |       y[,i] <- y[,i] + A[[j]]%*%y[,i-j]
348 |     }
349 |     #y[,i] <- A1%*%y[,i-1] + A2%*%y[,i-2] + u[,i-2]
350 |   }
351 |   
352 |   y[,101:l]
353 | }


--------------------------------------------------------------------------------