├── README.md ├── best_estimator.rdata ├── data ├── data.r ├── synthetic.rdata ├── timeseries.rdata ├── tseries_vcerq.rdata └── tsl_uni_90_mix.rdata ├── descriptive-model.r ├── perfestimation-rw.r ├── perfestimation-samplesize.r ├── perfestimation-synthetic.r ├── perfestimation.Rproj ├── result-analysis-ss.r ├── result-analysis-synthetic.r ├── result-analysis.R ├── series_length.rdata ├── src ├── dynamics.r ├── estimation-procedures.r ├── learning-models.r ├── metrics.r ├── plots.r ├── utils.r └── workflows.r ├── stationarity.r ├── stationarity_tsdl.rdata ├── synthetic ├── creating-synthetic-ds.r ├── fresults-analysis-synth.r └── simulate-ts.r └── synthetic_data_generation ├── creating-synthetic-ds.r └── simulate-ts.r /README.md: -------------------------------------------------------------------------------- 1 | # Time Series Forecasting Performance Estimation 2 | 3 | This repository contains the code and datasets used for experiments conducted in the following paper: 4 | 5 | **Cerqueira, V., Torgo, L., & Mozetič, I. (2020). Evaluating time series forecasting models: An empirical study on performance estimation methods. Machine Learning, 109(11), 1997-2028.** 6 | 7 | ## Citation 8 | 9 | If you use this code or the findings in your research, please cite: 10 | 11 | ```bibtex 12 | @article{cerqueira2020evaluating, 13 | title={Evaluating time series forecasting models: An empirical study on performance estimation methods}, 14 | author={Cerqueira, Vitor and Torgo, Luis and Mozeti{\v{c}}, Igor}, 15 | journal={Machine Learning}, 16 | volume={109}, 17 | number={11}, 18 | pages={1997--2028}, 19 | year={2020}, 20 | publisher={Springer} 21 | } 22 | ``` 23 | 24 | ### Running 25 | 26 | The main script is `perfestimation-rw.r`. 27 | 28 | The script `perfestimation-synthetic.r` is only ran for a specific (dataset, algorithm) pair. You need to change those to get the results for a different pair. 29 | 30 | #### Contact 31 | 32 | Get in touch at vitorc.research@gmail.com 33 | -------------------------------------------------------------------------------- /best_estimator.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/best_estimator.rdata -------------------------------------------------------------------------------- /data/data.r: -------------------------------------------------------------------------------- 1 | library(tsdl) 2 | library(forecast) 3 | 4 | min_len <- 500 5 | tsdl_list <- tsdl[sapply(tsdl, length) > min_len] 6 | #tsdl_list <- tsdl 7 | tsdl_list <- 8 | tsdl_list[!sapply(tsdl_list, 9 | function(x) { 10 | any(is.na(x)) 11 | })] 12 | 13 | tsdl_list[] <- lapply(tsdl_list, head, 4000) 14 | 15 | is_univar <- sapply(tsdl_list, class) == "ts" 16 | 17 | tsdl_list <- tsdl_list[is_univar] 18 | 19 | load("./data/tseries_vcerq.rdata") 20 | 21 | ids2rm <- c(14:21,24,28,35:42,53:61) 22 | 23 | ts_list <- ts_list[-ids2rm] 24 | 25 | tdiff <- 26 | sapply(ts_list, 27 | function(x) { 28 | xt <- 29 | tryCatch( 30 | as.POSIXct(head(rownames(as.data.frame(x)))), 31 | error = function(e) 32 | NA 33 | ) 34 | 35 | difftime(xt[2],xt[1], units = "hours") 36 | }) 37 | 38 | #lapply(ts_list[is.na(tdiff)], head, 3) 39 | tdiff[is.na(tdiff)] <- 1 40 | 41 | FRQs <- tdiff 42 | FRQs[tdiff == 1] <- 24 43 | FRQs[tdiff == .5] <- 48 44 | FRQs[tdiff == 24] <- 365 45 | 46 | max_len <- 4000 47 | ts_list <- 48 | lapply(1:length(ts_list), 49 | function(i) { 50 | x <- ts_list[[i]] 51 | x <- ts(as.vector(x), frequency = FRQs[i]) 52 | x <- head(x, max_len) 53 | x 54 | }) 55 | 56 | ts_list <- c(tsdl_list,ts_list) 57 | length(ts_list) 58 | save(ts_list, file = "data/tsl_uni_90_mix.rdata") 59 | -------------------------------------------------------------------------------- /data/synthetic.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/synthetic.rdata -------------------------------------------------------------------------------- /data/timeseries.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/timeseries.rdata -------------------------------------------------------------------------------- /data/tseries_vcerq.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/tseries_vcerq.rdata -------------------------------------------------------------------------------- /data/tsl_uni_90_mix.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/data/tsl_uni_90_mix.rdata -------------------------------------------------------------------------------- /descriptive-model.r: -------------------------------------------------------------------------------- 1 | load("data/tsl_uni_90_mix.rdata") 2 | load("best_estimator.rdata") 3 | load("series_length.rdata") 4 | load("stationarity_tsdl.rdata") 5 | 6 | source("src/dynamics.r") 7 | 8 | library(rpart) 9 | library(rattle) 10 | library(party) 11 | library(ranger) 12 | 13 | Z <- lapply(ts_list, function(x) dynamics_ts(tail(x,1000))) 14 | names(Z) <- paste0("TS_", seq_along(Z)) 15 | 16 | 17 | Z <- do.call(rbind, Z) 18 | Z$estimator <- best_estimator 19 | #save(Z, file = "metadata.rdata") 20 | #load("metadata.rdata") 21 | colnames(Z) <- c( 22 | "Trend","Qt05","Qt95","IQR","Skewness","Kurtosis", 23 | "MLE","Hurst","N","SerialCorr","estimator" 24 | ) 25 | Z$Stationarity <- as.character(as.integer(is_stationary_2ensemble)) 26 | 27 | Z$estimator <- as.factor(ifelse(grepl("CV",Z$estimator), "CVAL","OOS")) 28 | 29 | library(rpart) 30 | library(rpart.plot) 31 | 32 | m <- rpart(estimator ~., Z, 33 | control = list(cp=.028)) 34 | rpart.plot(m,type=2, extra=2, 35 | clip.right.labs = F, 36 | fallen.leaves=F,cex=0.8, 37 | under = F) 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /perfestimation-rw.r: -------------------------------------------------------------------------------- 1 | load("data/tsl_uni_90_mix.rdata") 2 | 3 | source("src/utils.r") 4 | source("src/estimation-procedures.r") 5 | source("src/workflows.r") 6 | source("src/metrics.r") 7 | source("src/learning-models.r") 8 | 9 | library(tsensembler) 10 | 11 | form <- target~. 12 | nfolds <- 30 13 | 14 | library(parallel) 15 | final_results <- 16 | mclapply(1:length(ts_list), 17 | function(i) { 18 | cat(i, "\n\n") 19 | ds <- ts_list[[i]] 20 | 21 | x <- 22 | workflow( 23 | ds = ds, 24 | form = form, 25 | predictive_algorithm = "rbr", 26 | nfolds = nfolds, 27 | outer_split = .7) 28 | 29 | x 30 | }, mc.cores = 10) 31 | 32 | save(final_results, file = "final_results_rbr.rdata") 33 | 34 | -------------------------------------------------------------------------------- /perfestimation-samplesize.r: -------------------------------------------------------------------------------- 1 | load("data/tsl_uni_90_mix.rdata") 2 | 3 | source("src/utils.r") 4 | source("src/estimation-procedures.r") 5 | source("src/workflows.r") 6 | source("src/metrics.r") 7 | source("src/learning-models.r") 8 | 9 | library(tsensembler) 10 | #unloadNamespace("tsensembler") 11 | 12 | load("series_length.rdata") 13 | 14 | ts_list <- ts_list[len >= 1000] 15 | 16 | form <- target~. 17 | nfolds <- 30 18 | 19 | 20 | library(parallel) 21 | final_results <- 22 | mclapply(1:length(ts_list), 23 | function(i) { 24 | #i<-1 25 | cat(i, "\n\n") 26 | ds <- ts_list[[i]] 27 | 28 | x <- 29 | wf_sample_size( 30 | ds = ds, 31 | form = form, 32 | predictive_algorithm = "rbr", 33 | nfolds = nfolds, 34 | outer_split = .7) 35 | 36 | x 37 | }, mc.cores = 10) 38 | 39 | 40 | save(final_results, file = "final_results_samplesize_rbr.rdata") 41 | 42 | ## 43 | 44 | 45 | -------------------------------------------------------------------------------- /perfestimation-synthetic.r: -------------------------------------------------------------------------------- 1 | load("data/synthetic.rdata") 2 | 3 | source("src/utils.r") 4 | source("src/estimation-procedures.r") 5 | source("src/workflows.r") 6 | source("src/metrics.r") 7 | source("src/learning-models.r") 8 | 9 | library(tsensembler) 10 | 11 | form <- target ~ . 12 | nfolds <- 30 13 | embedded_time_series <- synthetic$TS3 14 | 15 | library(parallel) 16 | final_results <- 17 | mclapply(1:length(embedded_time_series), 18 | function(i) { 19 | cat(i, "\n\n") 20 | ds <- embedded_time_series[[i]] 21 | 22 | x <- 23 | workflow( 24 | ds = ds, 25 | form = form, 26 | predictive_algorithm = "rf", 27 | nfolds = nfolds, 28 | outer_split = .8, 29 | is_embedded = T 30 | ) 31 | 32 | x 33 | }, mc.cores = 3) 34 | 35 | save(final_results, 36 | file = "final_results_synthetic_ts3_rf.rdata") 37 | 38 | -------------------------------------------------------------------------------- /perfestimation.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /result-analysis-ss.r: -------------------------------------------------------------------------------- 1 | load("final_results_samplesize_rbr.rdata") 2 | load("stationarity_tsdl.rdata") 3 | load("series_length.rdata") 4 | 5 | # err_estimation <- 6 | # lapply(final_results[!(is_stationary_2ensemble[len>1000])], 7 | # function(x) { 8 | # x <- do.call(rbind, x) 9 | # rnk <- t(apply(abs(x),1,rank)) 10 | # rnk 11 | # }) 12 | 13 | err_estimation <- 14 | lapply(final_results, 15 | function(x) { 16 | x <- do.call(rbind, x) 17 | rnk <- t(apply(abs(x),1,rank)) 18 | rnk 19 | }) 20 | 21 | err_arr <- simplify2array(err_estimation) 22 | err_ovr <- apply(err_arr,1:2,mean) 23 | 24 | library(tsensembler) 25 | err_ovr <- 26 | roll_mean_matrix(as.data.frame(err_ovr), 2) 27 | 28 | 29 | colnames(err_ovr) <- 30 | c("CV", "CV-Bl", "CV-Mod","CV-hvBl", 31 | "Preq-Bls", "Preq-Sld-Bls", 32 | "Preq-Bls-Gap","Holdout", "Rep-Holdout", 33 | "Preq-Slide","Preq-Grow") 34 | 35 | df <- t(round(err_ovr,2)) 36 | df <- as.data.frame(df) 37 | df$Method <- as.factor(rownames(df)) 38 | rownames(df) <- NULL 39 | colnames(df) <- c(as.character(seq(from=100,to=900,by=100)),"Method") 40 | 41 | library(hrbrthemes) 42 | library(GGally) 43 | library(viridis) 44 | ggparcoord(df, 45 | columns = 1:9, groupColumn = 10, 46 | scale = "globalminmax", 47 | showPoints = TRUE, 48 | alphaLines = 0.9 49 | ) + theme_minimal() + 50 | labs(x="Training Sample Size", 51 | y= "Average Rank") 52 | 53 | -------------------------------------------------------------------------------- /result-analysis-synthetic.r: -------------------------------------------------------------------------------- 1 | load("final_results_synthetic_ts1_lasso.rdata") 2 | load("final_results_synthetic_ts2_lasso.rdata") 3 | load("final_results_synthetic_ts3_lasso.rdata") 4 | load("final_results_synthetic_ts1_rbr.rdata") 5 | load("final_results_synthetic_ts2_rbr.rdata") 6 | load("final_results_synthetic_ts3_rbr.rdata") 7 | load("final_results_synthetic_ts1_rf.rdata") 8 | load("final_results_synthetic_ts2_rf.rdata") 9 | load("final_results_synthetic_ts3_rf.rdata") 10 | 11 | source("src/plots.r") 12 | library(scmamp) 13 | 14 | err_estimation <- 15 | lapply(final_results, 16 | function(x) { 17 | tryCatch(x$err_estimation, 18 | error =function(e) {NULL}) 19 | }) 20 | 21 | err_estimation <- err_estimation[!sapply(err_estimation, is.null)] 22 | 23 | fr <- do.call(rbind, err_estimation) 24 | fr <- as.data.frame(fr) 25 | rownames(fr) <- NULL 26 | 27 | colnames(fr) <- 28 | c("CV", "CV-Bl", "CV-Mod","CV-hvBl", 29 | "Preq-Bls", "Preq-Sld-Bls", 30 | "Preq-Bls-Gap","Holdout", "Rep-Holdout", 31 | "Preq-Slide","Preq-Grow") 32 | 33 | fr_abs <- abs(fr) 34 | fr_abs_rank <- apply(fr_abs, 1, rank) 35 | 36 | avg_rank_plot(avg = rowMeans(fr_abs_rank), 37 | sdev = apply(fr_abs_rank,1, sd)) + 38 | theme(axis.text.x = element_text(angle = 45, 39 | size = 12, 40 | hjust = 1)) 41 | 42 | # 43 | -------------------------------------------------------------------------------- /result-analysis.R: -------------------------------------------------------------------------------- 1 | load("final_results_rbr_all_nd.rdata") 2 | load("final_results_lasso_all_nd.rdata") 3 | load("final_results_rf_all_nd.rdata") 4 | 5 | load("stationarity_tsdl.rdata") 6 | 7 | source("src/plots.r") 8 | library(scmamp) 9 | 10 | err_estimation <- 11 | lapply(final_results, 12 | function(x) { 13 | tryCatch(x$err_estimation, 14 | error =function(e) {NULL}) 15 | }) 16 | 17 | err_estimation <- err_estimation[!sapply(err_estimation, is.null)] 18 | 19 | fr <- do.call(rbind, err_estimation) 20 | fr <- as.data.frame(fr) 21 | rownames(fr) <- NULL 22 | 23 | is_stationary_final <- is_stationary_2ensemble 24 | table(is_stationary_final) 25 | 26 | ## Subsetting ########################################## 27 | fr_final <- fr[!is_stationary_final,] 28 | fr_final <- fr[is_stationary_final,] 29 | fr_final <- fr 30 | 31 | ######################################################## 32 | 33 | colnames(fr_final) <- 34 | c("CV", "CV-Bl", "CV-Mod","CV-hvBl", 35 | "Preq-Bls", "Preq-Sld-Bls", 36 | "Preq-Bls-Gap","Holdout", "Rep-Holdout", 37 | "Preq-Slide","Preq-Grow") 38 | fr_abs <- abs(fr_final) 39 | rownames(fr_abs) <- NULL 40 | 41 | # best_estimator <- colnames(fr_abs)[apply(fr_abs,1,which.min)] 42 | # save(best_estimator, file = "best_estimator.rdata") 43 | #table(best_estimator) 44 | 45 | fr_abs_rank <- apply(fr_abs, 1, rank) 46 | 47 | # bp_dist(t(fr_abs_rank)[!is_stationary_final,]) 48 | # bp_dist(t(fr_abs_rank)[is_stationary_final,]) 49 | # bp_dist(t(fr_abs_rank)) 50 | 51 | avg_rank_plot(avg = rowMeans(fr_abs_rank), 52 | sdev = apply(fr_abs_rank,1, sd)) + 53 | theme(axis.text.x = element_text(angle = 45, 54 | size = 12, 55 | hjust = 1)) 56 | 57 | rm(fr_final) 58 | 59 | baseline <- "Rep-Holdout" 60 | 61 | cID <- which(colnames(fr_abs) %in% baseline) 62 | 63 | PerfDiff <- lapply(as.data.frame(fr_abs), 64 | function(x) x-fr_abs[,cID,drop=T]) 65 | 66 | PerfDiff <- as.data.frame(PerfDiff[-cID]) 67 | 68 | rope <- 2.5 69 | baout <- 70 | lapply(PerfDiff, 71 | function(u) { 72 | bSignedRankTest(u, 73 | rope=c(-rope,rope))$posterior.probabilities 74 | }) 75 | 76 | baout <- lapply(baout,unlist) 77 | baout <- do.call(rbind, baout) 78 | rownames(baout) <- gsub("\\.","-",rownames(baout)) 79 | colnames(baout) <- c("probLeft","probRope","probRight") 80 | proportion_plot(baout) + scale_fill_brewer(palette="Set2") 81 | 82 | 83 | # 84 | colnames(fr) <- colnames(fr_abs) 85 | percdiff_plot_log(fr) 86 | 87 | -------------------------------------------------------------------------------- /series_length.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/series_length.rdata -------------------------------------------------------------------------------- /src/dynamics.r: -------------------------------------------------------------------------------- 1 | replace_inf <- function(df) { 2 | do.call(data.frame, 3 | lapply(df, function(j) { 4 | replace(j, is.infinite(j), NA) 5 | }) 6 | ) 7 | } 8 | 9 | dynamics_ts <- 10 | function(x) { 11 | require(TTR) 12 | 13 | x_std <- as.vector(scale(x)) 14 | 15 | cat("Computing trend ...\n") 16 | ts_trend <- trend(x) 17 | 18 | cat("Computing skewness ...\n") 19 | ts_skew <- moments::skewness(x) 20 | 21 | cat("Computing kurtosis ...\n") 22 | ts_kts <- moments::kurtosis(x) 23 | 24 | cat("Computing maximum lyapunov exponent ...\n") 25 | ts_mle <- max_lyapunov_exp(tail(x,500)) 26 | 27 | cat("Computing hurst ...\n") 28 | ts_hurst <- HURST(x) 29 | 30 | cat("Computing serial correlation ...\n") 31 | ts_serialcorr <- tryCatch(Box.test(x)$p.val, 32 | error=function(e) NA) 33 | 34 | 35 | ts_qt05 <- quantile(x_std, .05) 36 | 37 | ts_qt95 <- quantile(x_std, .95) 38 | 39 | ts_iqr <- IQR(x_std) 40 | 41 | ts_dyns <- 42 | data.frame(ts_trend = ts_trend, 43 | ts_qt05=ts_qt05, 44 | ts_qt95=ts_qt95, 45 | ts_iqr=ts_iqr, 46 | ts_skew = ts_skew, 47 | ts_kts = ts_kts, 48 | ts_mle = ts_mle, 49 | ts_hurst = ts_hurst, 50 | ts_n = length(x), 51 | ts_serialcorr = ts_serialcorr) 52 | 53 | ts_dyns <- replace_inf(ts_dyns) 54 | 55 | #has_na <- DMwR::manyNAs(t(ts_dyns), .4) 56 | 57 | #if (length(has_na) > 0) { 58 | # ts_dyns <- subset(ts_dyns, select = -has_na) 59 | #} 60 | 61 | #ts_dyns <- tsensembler::soft.completion(ts_dyns) 62 | 63 | #nzv_cols <- caret::nearZeroVar(ts_dyns) 64 | #if (length(nzv_cols) > 0L) { 65 | # ts_dyns <- subset(ts_dyns, select = -nzv_cols) 66 | #} 67 | 68 | rownames(ts_dyns) <- NULL 69 | #preproc <- caret::preProcess(dStats) 70 | #dStats <- predict(preproc, dStats) 71 | 72 | dplyr::as_tibble(ts_dyns) 73 | } 74 | 75 | 76 | trend <- 77 | function(x) { 78 | sd(x) / sd(diff(x)[-1]) 79 | } 80 | 81 | max_lyapunov_exp <- 82 | function(x) { 83 | require(nonlinearTseries) 84 | 85 | len <- length(x) 86 | Reduce(max, 87 | nonlinearTseries::divergence( 88 | nonlinearTseries::maxLyapunov( 89 | time.series = x, 90 | min.embedding.dim = ceiling(len / 4), 91 | max.embedding.dim = ceiling(len / 2), 92 | radius = ceiling(len / 6), 93 | do.plot = FALSE 94 | ) 95 | )) 96 | } 97 | 98 | #' Hurst exponent 99 | #' 100 | #' @param x numeric vector 101 | HURST <- 102 | function(x) { 103 | require(Rwave) 104 | 105 | cwtwnoise <- DOG(x, 10, 3, 1, plot = FALSE) 106 | mcwtwnoise <- Mod(cwtwnoise) 107 | mcwtwnoise <- mcwtwnoise * mcwtwnoise 108 | wspwnoise <- tfmean(mcwtwnoise, plot = FALSE) 109 | 110 | hurst.est(wspwnoise, 1:7, 3, plot = FALSE)[[2]] 111 | } 112 | 113 | 114 | r_sma_ema <- 115 | function(x) { 116 | require(TTR) 117 | 118 | if (length(x) > 10) 119 | n <- 5 120 | else 121 | n <- 3 122 | 123 | ts_sma <- SMA(rev(x), n = n) 124 | ts_ema <- EMA(rev(x), n = n) 125 | 126 | ts_sma <- ts_sma[!is.na(ts_sma)] 127 | ts_ema <- ts_ema[!is.na(ts_ema)] 128 | 129 | sema <- ts_sma / ts_ema 130 | sema <- sema[!(is.infinite(sema) | is.na(sema))] 131 | 132 | mean(ts_sma / ts_ema) 133 | } -------------------------------------------------------------------------------- /src/estimation-procedures.r: -------------------------------------------------------------------------------- 1 | #' k-fold cross validation 2 | #' The standard cross validation procedure. 3 | #' 4 | #' @param x data: embedded time series 5 | #' @param nfolds no of folds 6 | #' @param FUN function to apply to each iteration's train and test. Typically 7 | #' \strong{FUN} is a workflow where a predictive model is applied in a training set 8 | #' and the model is evaluated in the test set. 9 | #' @param ... further parameters to \code{FUN} 10 | #' 11 | #' @export 12 | kf_xval <- function(x, nfolds, FUN, shuffle.rows = TRUE, average_results = TRUE, ...) { 13 | if (shuffle.rows) x <- cv.shuffle(x) 14 | f <- cv.folds(x, nfolds) 15 | 16 | cv.res <- list() 17 | for (i in seq_len(nfolds)) { 18 | cat("X VAL iter no. ", i, "\n") 19 | ts.id <- which(f == i) 20 | 21 | train <- x[-ts.id, ] 22 | #print(dim(train)) 23 | test <- x[ ts.id, ] 24 | #print(dim(test)) 25 | cv.res[[i]] <- FUN(train, test, ...) 26 | } 27 | 28 | if(average_results) { 29 | cv.res <- mean(unlist(cv.res, use.names = FALSE)) 30 | } 31 | 32 | cv.res 33 | } 34 | 35 | #' Blocked k-fold cross validation 36 | #' Standard k-fold cross validation without reshuffling rows 37 | #' 38 | #' @inheritParams kf_xval 39 | #' 40 | #' @export 41 | blocked_kf_xval <- function(x, nfolds, FUN, ...) { 42 | kf_xval(x = x, nfolds = nfolds, FUN = FUN, shuffle.rows = FALSE, ...) 43 | } 44 | 45 | #' Modified k-fold Cross Validation 46 | #' Standard CV, but removes rows from training that are 47 | #' dependent with the test set. 48 | #' 49 | #' @inheritParams kf_xval 50 | #' 51 | #' @export 52 | modified_xval <- function(x, nfolds, FUN, average_results = TRUE, ...) { 53 | K <- floor(sqrt(ncol(x)))#inventei aqui 54 | x$aux <- seq_len(nrow(x)) 55 | x <- cv.shuffle(x) 56 | f <- cv.folds(x, nfolds) 57 | 58 | cv.res <- list() 59 | for (i in seq_len(nfolds)) { 60 | ts.id <- which(f == i) 61 | test <- x[ts.id, ] 62 | 63 | depRows <- unique(unlist(lapply(test$aux, function(z) (z-K-1):(z+K-1L)))) 64 | depRows <- depRows[depRows > 0] 65 | 66 | train <- x[-c(ts.id, depRows), ] 67 | if (nrow(train) < 1) { 68 | cv.res[[i]] <- NA#rep(NA_real_, times = 3) 69 | } else { 70 | test$aux <- NULL 71 | train$aux <- NULL 72 | 73 | cv.res[[i]] <- FUN(train, test, ...) 74 | } 75 | } 76 | 77 | if(average_results) { 78 | cv.res <- mean(unlist(cv.res), na.rm = TRUE) 79 | } 80 | 81 | cv.res 82 | } 83 | 84 | #' hv - block Cross Validation 85 | #' Cross without reshuffling rows and removing dependent rows. 86 | #' Since there is no reshuffling, the dependent rows are just 87 | #' the \code{K}-adjacent ones in the adjacent folds. \code{K} denotes 88 | #' the embedding dimension. 89 | #' 90 | #' @inheritParams kf_xval 91 | #' 92 | #' @export 93 | hv.block_xval <- function(x, nfolds, FUN, average_results, ...) { 94 | K <- ncol(x) 95 | f <- cv.folds(x, nfolds) 96 | 97 | cv.res <- list() 98 | seq. <- seq_len(nfolds) 99 | lseq <- seq.[length(seq.)] 100 | for (i in seq.) { 101 | ts.id <- which(f == i) 102 | # upper cut 103 | kcuts <- integer(0L) 104 | if (i + 1L <= lseq) { 105 | upper.fold <- which(f == i + 1L) 106 | upper.cut <- upper.fold[1:K] 107 | if (any(is.na(upper.cut))) { 108 | upper.cut <- upper.cut[!is.na(upper.cut)] 109 | } 110 | 111 | kcuts <- c(kcuts, upper.cut) 112 | } 113 | # lower cut 114 | if (i - 1L >= 1L) { 115 | lower.fold <- which(f == i - 1L) 116 | len.lf <- length(lower.fold) 117 | idx <- (len.lf - K + 1L):len.lf 118 | idx <- idx[idx > 0] 119 | lower.cut <- lower.fold[idx] 120 | kcuts <- c(kcuts, lower.cut) 121 | } 122 | 123 | train <- x[-c(ts.id, kcuts), ] 124 | test <- x[ts.id, ] 125 | 126 | cv.res[[i]] <- FUN(train, test, ...) 127 | } 128 | 129 | if (average_results) { 130 | cv.res <- mean(unlist(cv.res), na.rm = TRUE) 131 | } 132 | 133 | cv.res 134 | } 135 | 136 | #' Sequential Block Evaluation 137 | #' Trains in the the up-to-i folds and tests on i+1 138 | #' 139 | #' Prequential in blocks in a growing window fashion. 140 | #' 141 | #' @inheritParams kf_xval 142 | #' 143 | #' @export 144 | prequential_in_blocks <- function(x, nfolds, FUN, average_results, ...) { 145 | f <- cv.folds(x, nfolds) 146 | 147 | cv.res <- list() 148 | seq. <- seq_len(nfolds) 149 | for (i in seq.[-length(seq.)]) { 150 | tr.id <- which(f %in% seq_len(i)) 151 | ts.id <- which(f == i + 1L) 152 | 153 | train <- x[tr.id, ] 154 | test <- x[ts.id, ] 155 | cv.res[[i]] <- FUN(train, test, ...) 156 | } 157 | 158 | if (average_results) { 159 | cv.res <- mean(unlist(cv.res), na.rm = TRUE) 160 | } 161 | 162 | cv.res 163 | } 164 | 165 | sliding_prequential_in_blocks <- 166 | function(x, nfolds, FUN, average_results, ...) { 167 | nfolds0<-10 168 | f <- cv.folds(x, nfolds0) 169 | 170 | cv.res <- list() 171 | seq. <- seq_len(nfolds0) 172 | for (i in seq.[-length(seq.)]) { 173 | tr.id <- which(f == i)#which(f %in% seq_len(i)) 174 | ts.id <- which(f == i + 1L) 175 | 176 | train <- x[tr.id,] 177 | test <- x[ts.id,] 178 | cv.res[[i]] <- FUN(train, test, ...) 179 | } 180 | 181 | if (average_results) { 182 | cv.res <- mean(unlist(cv.res), na.rm = TRUE) 183 | } 184 | 185 | cv.res 186 | } 187 | 188 | #' Sequential Block Evaluation 189 | #' Trains in the the up-to-i folds and tests on i+2 190 | #' Doesn't test on i+1 to further increase independence 191 | #' 192 | #' @inheritParams kf_xval 193 | #' 194 | #' @export 195 | prequential_in_blocks_gap <- function(x, nfolds, FUN, average_results, ...) { 196 | f <- cv.folds(x, nfolds) 197 | 198 | cv.res <- list() 199 | seq. <- seq_len(nfolds); len <- length(seq.) 200 | 201 | seq. <- seq.[-c(len:(len - 1L))] 202 | for (i in seq.) { 203 | tr.id <- which(f %in% seq_len(i)) 204 | ts.id <- which(f == i + 2L) 205 | 206 | train <- x[tr.id, ] 207 | test <- x[ts.id, ] 208 | cv.res[[i]] <- FUN(train, test, ...) 209 | } 210 | 211 | if (average_results) { 212 | cv.res <- mean(unlist(cv.res), na.rm = TRUE) 213 | } 214 | 215 | cv.res 216 | } 217 | 218 | #' Simple Block Bootstrap 219 | #' Block bootstrap with non-overlapping blocks 220 | #' 221 | #' @param x data 222 | #' @param nblocks no of block to divide \code{x} into 223 | #' @param nreps no of bootstrap repetitions 224 | #' @param FUN function to apply 225 | #' @param ... further params to \code{FUN} 226 | #' 227 | #' @export 228 | bbootstrap <- function(x, nblocks, nreps, FUN, ...) { 229 | K <- ncol(x) 230 | res.boot <- list() 231 | x <- unembed.timeseries(x[-nrow(x), ], x[nrow(x), ])$data 232 | f <- cv.folds(x = x, nfolds = nblocks) 233 | for (i in seq_len(nreps)) { 234 | seq.f <- seq_len(nblocks) 235 | OOB.id <- sample(seq.f, nblocks, replace = TRUE) 236 | train <- rbind_( 237 | lapply(OOB.id, function(z) { 238 | r.id <- which(z == f) 239 | x_z <- embed.timeseries(x[r.id], K) 240 | rownames(x_z) <- NULL 241 | 242 | x_z 243 | }) 244 | ) 245 | ts.id <- setdiff(seq.f, OOB.id) 246 | 247 | if (length(ts.id) < 1) { 248 | res.boot[[i]] <- rep(NA_real_, times = 3) 249 | } else { 250 | test <- rbind_( 251 | lapply(ts.id, function(z) { 252 | r.id <- which(z == f) 253 | x_z <- embed.timeseries(x[r.id], K) 254 | rownames(x_z) <- NULL 255 | 256 | x_z 257 | }) 258 | ) 259 | res.boot[[i]] <- FUN(train, test, ...) 260 | } 261 | } 262 | res.boot 263 | } 264 | 265 | 266 | #' Moving Block Bootstrap 267 | #' Block bootstrap with overlapping blocks 268 | #' 269 | #' @inheritParams bbootstrap 270 | #' 271 | #' @export 272 | mbbootstrap <- function(x, nblocks, nreps, FUN, ...) { 273 | res.boot <- list() 274 | f <- cv.folds(x, nblocks) 275 | 276 | for (i in seq_len(nreps)) { 277 | OOB.id <- sample(seq_len(nblocks), nblocks, replace = TRUE) 278 | tr.id <- unlist(lapply(OOB.id, function(z) which(z == f))) 279 | train <- x[tr.id, ] 280 | test <- x[-unique(tr.id), ] 281 | 282 | if (nrow(test) < 1) { 283 | res.boot[[i]] <- rep(NA_real_, times = 3) 284 | } else { 285 | res.boot[[i]] <- FUN(train, test, ...) 286 | } 287 | } 288 | res.boot 289 | } 290 | 291 | #' OOS Rolling Window 292 | #' Keeps same window with retraining procedures 293 | #' 294 | #' @param train train set 295 | #' @param test test set 296 | #' @param FUN function to apply 297 | #' @param ... further params to \code{FUN} 298 | #' 299 | #' @export 300 | PrequentialSliding <- 301 | function(train, test, FUN, average_results, ...) { 302 | seq. <- seq_len(nrow(test)) 303 | seq.diff <- seq. - 1L 304 | 305 | estresults <- 306 | lapply(seq.diff, function(y) { 307 | cut.y <- seq_len(y) 308 | if (y < 1) { 309 | train.set <- train 310 | test.set <- test 311 | } else { 312 | train.set <- rbind.data.frame(train[-cut.y, ], test[cut.y, ]) 313 | test.set <- test[-cut.y, ] 314 | } 315 | FUN(train.set, test.set, ...) 316 | }) 317 | 318 | if (average_results) { 319 | cv.res <- mean(unlist(estresults), na.rm = TRUE) 320 | } 321 | 322 | cv.res 323 | } 324 | 325 | #' OOS Rolling Origin Update 326 | #' 327 | #' Landmark style retraining 328 | #' 329 | #' @inheritParams oos.rw.updt 330 | #' 331 | #' @export 332 | PrequentialLandmark <- 333 | function(train, test, FUN, average_results, ...) { 334 | seq. <- seq_len(nrow(test)) 335 | seq.diff <- seq. - 1L 336 | 337 | estresults <- 338 | lapply(seq.diff, function(y) { 339 | cut.y <- seq_len(y) 340 | if (y < 1) { 341 | train.set <- train 342 | test.set <- test 343 | } else { 344 | train.set <- rbind.data.frame(train, test[cut.y,]) 345 | test.set <- test[-cut.y,] 346 | } 347 | FUN(train.set, test.set, ...) 348 | }) 349 | 350 | if (average_results) { 351 | cv.res <- mean(unlist(estresults), na.rm = TRUE) 352 | } 353 | 354 | cv.res 355 | } 356 | 357 | 358 | holdout <- 359 | function(x, FUN, ...) { 360 | xp <- partition(x, .7) 361 | 362 | train <- xp$train 363 | test <- xp$test 364 | 365 | FUN(train, test, form) 366 | } 367 | 368 | repeated_holdout <- 369 | function(x, nreps, train_size, test_size, FUN, average_results, ...) { 370 | n <- nrow(x) 371 | tr_size <- as.integer(n * train_size) 372 | ts_size <- as.integer(n * test_size) 373 | selection_range <- (tr_size + 1):(n - ts_size + 1) 374 | origins <- sample(selection_range, nreps) 375 | 376 | mcapprox <- 377 | lapply(origins, function(o) { 378 | train <- x[(o - tr_size):(o - 1),] 379 | test <- x[o:(o + ts_size - 1),] 380 | 381 | FUN(train = train, test = test, form=form) 382 | }) 383 | 384 | if (average_results) { 385 | mcapprox<-mean(unlist(mcapprox), na.rm = TRUE) 386 | } 387 | 388 | mcapprox 389 | } 390 | 391 | -------------------------------------------------------------------------------- /src/learning-models.r: -------------------------------------------------------------------------------- 1 | RF <- 2 | function(form,train,test) { 3 | require(ranger) 4 | 5 | m <- ranger(formula = form, data=train, num.trees = 100) 6 | 7 | predict(m,test)$predictions 8 | } 9 | 10 | RBR <- 11 | function(form,train,test) { 12 | require(Cubist) 13 | 14 | X <- model.matrix(form, train) 15 | Y <- get_y(train, form) 16 | 17 | m <- cubist(X, Y, committees = 10) 18 | 19 | Xts <- model.matrix(form, test) 20 | 21 | predict(m,Xts) 22 | } 23 | 24 | 25 | LASSO <- 26 | function(form,train,test) { 27 | require(glmnet) 28 | 29 | X <- model.matrix(form, train) 30 | Y <- get_y(train, form) 31 | 32 | m.all <- glmnet(X, Y, alpha = 0, family = "gaussian") 33 | m <- glmnet(X, 34 | Y, 35 | alpha = 1, 36 | lambda = min(m.all$lambda), 37 | family = "gaussian") 38 | 39 | Xts <- model.matrix(form, test) 40 | 41 | unname(predict(m,Xts)[,1]) 42 | } 43 | 44 | 45 | 46 | GP <- 47 | function(form,train,test) { 48 | require(kernlab) 49 | 50 | m <- gausspr(form, 51 | train, 52 | type = "regression", 53 | kernel = "rbfdot", 54 | tol = .001) 55 | 56 | predict(m,test)[,1] 57 | } 58 | 59 | CART <- 60 | function(form,train,test) { 61 | require(DMwR) 62 | 63 | m <- rpartXse(form,train) 64 | 65 | unname(predict(m,test)) 66 | } 67 | 68 | CART_loss <- 69 | function(train, test, form, avg=TRUE) { 70 | y <- get_y(test, form) 71 | y_tr <- get_y(train, form) 72 | y_hat <- tryCatch(CART(form, train, test), 73 | error=function(e) { 74 | rep(mean(y_tr), times=length(y)) 75 | }) 76 | 77 | mase_cal(y_tr, y, y_hat, avg) 78 | } 79 | 80 | 81 | 82 | GP_loss <- 83 | function(train, test, form, avg=TRUE) { 84 | y <- get_y(test, form) 85 | y_tr <- get_y(train, form) 86 | y_hat <- GP(form, train, test) 87 | 88 | mase_cal(y_tr, y, y_hat, avg) 89 | } 90 | 91 | 92 | 93 | LASSO_loss <- 94 | function(train, test, form, avg=TRUE) { 95 | y <- get_y(test, form) 96 | y_tr <- get_y(train, form) 97 | y_hat <- 98 | tryCatch(LASSO(form, train, test), 99 | error=function(e) { 100 | rep(mean(y_tr), times=length(y)) 101 | }) 102 | 103 | mase_cal(y_tr, y, y_hat, avg) 104 | } 105 | 106 | RF_loss <- 107 | function(train, test, form, avg=TRUE) { 108 | y <- get_y(test, form) 109 | y_tr <- get_y(train, form) 110 | y_hat <- RF(form, train, test) 111 | 112 | #rmse(y, y_hat) 113 | mase_cal(y_tr, y, y_hat, avg) 114 | } 115 | 116 | RBR_loss <- 117 | function(train, test, form, avg=TRUE) { 118 | y <- get_y(test, form) 119 | y_tr <- get_y(train, form) 120 | y_hat <- RBR(form, train, test) 121 | 122 | mase_cal(y_tr, y, y_hat, avg) 123 | } 124 | 125 | MLP <- 126 | function(form,train,test) { 127 | require(nnet) 128 | m <- nnet(form, train, size=50, linout=T, MaxNWts=2^20, maxit=150) 129 | 130 | unname(predict(m,test))[,1] 131 | } 132 | 133 | MLP_loss <- 134 | function(train, test, form, avg=TRUE) { 135 | y <- get_y(test, form) 136 | y_tr <- get_y(train, form) 137 | y_hat <- MLP(form, train, test) 138 | 139 | mase_cal(y_tr, y, y_hat, avg) 140 | } 141 | 142 | -------------------------------------------------------------------------------- /src/metrics.r: -------------------------------------------------------------------------------- 1 | #' Predictive Accuracy Error 2 | #' 3 | #' @param E_hat estimated error 4 | #' @param E true error 5 | #' 6 | #' @export 7 | pae <- function(E_hat, E) E_hat - E 8 | 9 | #' Mean Predictive Accuracy Error 10 | #' 11 | #' @inheritParams pae 12 | #' 13 | #' @export 14 | mpae <- function(E_hat, E) mean(pae(E_hat, E)) 15 | 16 | #' Mean Absolute Predictive Accuracy Error 17 | #' 18 | #' @inheritParams pae 19 | #' 20 | #' @export 21 | mapae <- function(E_hat, E) mean(abs(pae(E_hat, E))) 22 | 23 | #' rmse 24 | #' 25 | #' Utility function to compute Root Mean Squared Error (RMSE) 26 | #' 27 | #' @inheritParams se 28 | #' 29 | #' @export 30 | rmse <- function(y, y_hat) sqrt(mse(y, y_hat)) 31 | 32 | #' ae 33 | #' 34 | #' Element-wise computation of the absolute error loss function. 35 | #' 36 | #' @inheritParams se 37 | #' 38 | #' @export 39 | ae <- function(y, y_hat) { 40 | stopifnot(length(y) == length(y_hat), 41 | is.numeric(y), 42 | is.numeric(y_hat)) 43 | 44 | abs(y - y_hat) 45 | } 46 | 47 | #' se 48 | #' 49 | #' Utility function to compute pointwise squared error (SE) 50 | #' 51 | #' @param y A numeric vector representing the actual values. 52 | #' @param y_hat A numeric vector representing the forecasted values. 53 | #' 54 | #' @return squared error of forecasted values. 55 | #' 56 | #' @export 57 | se <- function(y, y_hat) { 58 | stopifnot(length(y) == length(y_hat), 59 | is.numeric(y), 60 | is.numeric(y_hat)) 61 | 62 | (y - y_hat) ^ 2 63 | } 64 | 65 | #' mse 66 | #' 67 | #' Utility function to compute mean squared error (MSE) 68 | #' 69 | #' @inheritParams se 70 | #' 71 | #' @export 72 | mse <- function(y, y_hat) mean(se(y, y_hat), na.rm = TRUE) 73 | 74 | #' mae 75 | #' 76 | #' Mean Absolute Error loss function. 77 | #' 78 | #' @inheritParams se 79 | #' 80 | #' @export 81 | mae <- function(y, y_hat) mean(ae(y, y_hat), na.rm = TRUE) 82 | 83 | 84 | 85 | mase_cal <- function(insample, outsample, forecasts, avg=TRUE){ 86 | frq <- 1#frequency(insample) 87 | forecastsNaiveSD <- rep(NA,frq) 88 | for (j in (frq+1):length(insample)){ 89 | forecastsNaiveSD <- c(forecastsNaiveSD, insample[j-frq]) 90 | } 91 | masep<-mean(abs(insample-forecastsNaiveSD),na.rm = TRUE) 92 | 93 | outsample <- as.numeric(outsample) ; forecasts <- as.numeric(forecasts) 94 | 95 | if (avg) { 96 | mase <- mean((abs(outsample-forecasts)))/masep 97 | } else { 98 | mase <- (abs(outsample-forecasts))/masep 99 | } 100 | 101 | return(mase) 102 | } 103 | -------------------------------------------------------------------------------- /src/plots.r: -------------------------------------------------------------------------------- 1 | bp_dist <- 2 | function(ranks) { 3 | if (!is.data.frame(ranks)) { 4 | ranks <- as.data.frame(ranks) 5 | } 6 | 7 | avg.rank <- apply(ranks,2,mean,na.rm=TRUE) 8 | nms.sort <- names(sort(avg.rank)) 9 | 10 | ranks <- ranks[,nms.sort] 11 | ranks <- as.data.frame(ranks) 12 | 13 | x <- melt(ranks) 14 | 15 | p <- ggplot(x, aes(variable, value)) 16 | 17 | p <- p + 18 | geom_boxplot() + 19 | theme_minimal() + 20 | labs(x="", 21 | y="Rank") + 22 | theme(axis.text.x = element_text(size=10, 23 | angle=45)) + 24 | theme(axis.text.y = element_text(size = 10), 25 | axis.title.y = element_text(size = 10)) 26 | 27 | p 28 | } 29 | 30 | proportion_plot <- 31 | function(ds) { 32 | require(reshape2) 33 | require(ggplot2) 34 | ds <- as.data.frame(ds) 35 | ds[ ] <- lapply(ds, unlist) 36 | ds$method <- rownames(ds) 37 | dsm <- melt(ds) 38 | colnames(dsm) <- c("method", "Result","val") 39 | dsm$Result <- gsub("left|probLeft|winLeft","Rep-Holdout loses",dsm$Result) 40 | dsm$Result <- gsub("rope|probRope|winRope","draw",dsm$Result) 41 | dsm$Result <- gsub("right|probRight|winRight","Rep-Holdout wins",dsm$Result) 42 | dsm$Result <- 43 | factor(dsm$Result, levels = c("Rep-Holdout wins", 44 | "draw", 45 | "Rep-Holdout loses")) 46 | 47 | ggplot(dsm, aes(x = method, 48 | y = val, 49 | fill = Result)) + 50 | geom_col(position = "fill") + 51 | ylab("Proportion of probability") + 52 | xlab("") + 53 | theme_minimal() + 54 | theme(axis.text.x = element_text(angle = 30, 55 | size = 11, 56 | hjust = 1), 57 | legend.position = "top") + 58 | theme(axis.text.y = element_text(size = 11), 59 | axis.title.y = element_text(size = 11))# + 60 | #scale_fill_hue(h = c(180, 300)) 61 | #scale_fill_grey() 62 | #scale_fill_manual( values=c("red","green","blue")) 63 | #scale_fill_brewer(palette = "Set1") 64 | } 65 | 66 | percdiff_plot <- 67 | function(fresults) { 68 | require(reshape2) 69 | require(ggplot2) 70 | 71 | log_trans <- function(x) sign(x) * log(abs(x) + 1) 72 | 73 | r <- melt(fresults) 74 | 75 | ggplot(r, aes(x=1,y= value)) + 76 | facet_wrap( ~ variable, 77 | nrow = 1, 78 | scales="fixed") + 79 | geom_boxplot() + 80 | geom_hline(yintercept = 0,col="red") + 81 | theme_minimal() + 82 | labs(x="", 83 | y="Percentual difference to true loss") + 84 | theme(axis.text.x = element_blank()) + 85 | theme(axis.text.y = element_text(size = 14), 86 | axis.title.y = element_text(size = 14)) 87 | } 88 | 89 | 90 | percdiff_plot_log <- 91 | function(fresults) { 92 | require(reshape2) 93 | require(ggplot2) 94 | 95 | log_trans <- function(x) sign(x) * log(abs(x) + 1) 96 | 97 | r <- melt(fresults) 98 | 99 | ggplot(r, aes(x=1,y= log_trans(value))) + 100 | facet_wrap( ~ variable, 101 | nrow = 1, 102 | scales="fixed") + 103 | geom_boxplot() + 104 | geom_hline(yintercept = 0,col="red") + 105 | theme_minimal() + 106 | labs(x="", 107 | y="Percentual difference to true loss") + 108 | theme(axis.text.x = element_blank()) + 109 | theme(axis.text.y = element_text(size = 14), 110 | axis.title.y = element_text(size = 14)) 111 | } 112 | 113 | 114 | avg_rank_plot <- 115 | function(avg, sdev) { 116 | require(reshape2) 117 | require(ggplot2) 118 | 119 | ord0 <- order(avg) 120 | ord <- names(sort(avg)) 121 | 122 | methods <- names(avg) 123 | 124 | ds <- data.frame(avg=avg,sdev=sdev, methods=methods, row.names = NULL) 125 | ds$methods <- factor(ds$methods, levels = ord) 126 | 127 | ## 128 | meths <- as.character(ds$methods) 129 | ids <- grep("^CV", meths) 130 | cols <- rep("#33CCCC",times=length(meths)) 131 | cols[ids] <- "darkorange3" 132 | cols <- cols[ord0] 133 | ## 134 | 135 | #ds <- melt(ds) 136 | 137 | ggplot(data = ds, 138 | aes(x = methods, 139 | y = avg)) + 140 | geom_bar(stat="identity", 141 | fill=cols) + 142 | theme_minimal() + 143 | theme(axis.text.x = element_text(angle = 35, 144 | size = 14, 145 | hjust = 1)) + 146 | theme(axis.text.y = element_text(size = 12), 147 | axis.title.y = element_text(size = 12)) + 148 | #geom_hline(yintercept = 1) + 149 | geom_errorbar(aes(ymin = avg - sdev, 150 | ymax = avg + sdev), 151 | width = .5, 152 | position = position_dodge(.9)) + 153 | labs(x="", 154 | y="Avg. rank & Std dev.", 155 | title = "") 156 | } -------------------------------------------------------------------------------- /src/utils.r: -------------------------------------------------------------------------------- 1 | #' Randomly shuffle the data 2 | #' 3 | #' @param x data 4 | #' 5 | #' @export 6 | cv.shuffle <- function(x) x[sample(NROW(x)), ] 7 | 8 | resample_timeseries <- 9 | function (x, 10 | resamples, 11 | size_estimation, 12 | size_validation) { 13 | n <- nrow(x) 14 | tr_size <- as.integer(n * size_estimation) 15 | ts_size <- as.integer(n * size_validation) 16 | selection_range <- (tr_size + 1):(n - ts_size + 1) 17 | origins <- sample(selection_range, resamples) 18 | 19 | lapply(origins, function(o) { 20 | list(train = x[(o - tr_size):(o - 1), ], test = x[o:(o + ts_size - 1), ]) 21 | }) 22 | } 23 | 24 | #' Create cross validation folds 25 | #' 26 | #' @param x data 27 | #' @param nfolds no of folds 28 | #' 29 | #' @export 30 | cv.folds <- function(x, nfolds) { 31 | cut(seq_len(NROW(x)), breaks = nfolds, labels = FALSE) 32 | } 33 | 34 | #' train test partitioning holdout style 35 | #' 36 | #' @param x data 37 | #' @param hat.ratio estimation ratio 38 | #' 39 | #' @export 40 | partition <- function(x, hat.ratio) { 41 | len <- NROW(x) 42 | 43 | if (class(x)[1] == "data.frame") { 44 | train <- x[ seq_len(hat.ratio * len),] 45 | test <- x[-seq_len(hat.ratio * len),] 46 | } else { 47 | train <- x[ seq_len(hat.ratio * len)] 48 | test <- x[-seq_len(hat.ratio * len)] 49 | } 50 | list(train = train, test = test) 51 | } 52 | 53 | 54 | rbind_l <- function(x) do.call(rbind, x) 55 | 56 | 57 | get_y <- function(test, form) model.response(model.frame(form, test, na.action = NULL)) 58 | 59 | 60 | roll_mean_matrix <- 61 | function(x, lambda) { 62 | require(RcppRoll) 63 | if (class(x) != "data.frame") 64 | stop("x obj class in roll_mean_matrix must be a data.frame") 65 | dim1 <- NROW(x) 66 | 67 | MASE <- 68 | lapply(x, 69 | function(z) { 70 | rollm <- RcppRoll::roll_mean(z, n = lambda) 71 | blanks <- dim1 - length(rollm) 72 | aux <- numeric(blanks) 73 | for (y in seq_along(aux)) { 74 | aux[y] <- RcppRoll::roll_mean(z, n = y)[1] 75 | } 76 | c(aux, rollm) 77 | }) 78 | as.data.frame(MASE) 79 | } 80 | 81 | 82 | as_positive <- 83 | function(y) { 84 | y - min(y) + 1 85 | } 86 | 87 | 88 | estimate_k <- 89 | function(x, m.max=20,tol=.15) { 90 | require(tseriesChaos) 91 | 92 | fn.out <- false.nearest(x, m.max, d=1, t=1) 93 | fn.out <- round(fn.out,4) 94 | fn.out[is.na(fn.out)] <- 0 95 | #plot(fn.out) 96 | 97 | fnp.tol <- fn.out["fraction",] > tol 98 | fnp.tol.sum <- sum(fnp.tol) 99 | 100 | m <- ifelse(fnp.tol.sum < m.max,fnp.tol.sum + 1, m.max) 101 | 102 | m 103 | } 104 | 105 | -------------------------------------------------------------------------------- /src/workflows.r: -------------------------------------------------------------------------------- 1 | # workflow_test_size <- 2 | # function(ds, 3 | # form, 4 | # predictive_algorithm = "rf", 5 | # nfolds, 6 | # outer_split) { 7 | # 8 | # khat <- estimate_k(ds[1:(length(ds) * outer_split)], m.max = 30, tol = .01) 9 | # if (khat < 8) khat <- 8 10 | # 11 | # x <- embed_timeseries(as.numeric(ds), khat+1) 12 | # 13 | # pred_model <- 14 | # switch(predictive_algorithm, 15 | # "rf" = RF_loss, 16 | # "rbr" = RBR_loss, 17 | # "gp" = GP_loss, 18 | # "lasso" = LASSO_loss, 19 | # RF_loss) 20 | # 21 | # ## 70/30 22 | # xp <- partition(x, outer_split) 23 | # 24 | # train <- xp$train 25 | # test <- xp$test 26 | # 27 | # e7030_loss <- 28 | # pred_model(train = train, 29 | # test = test, 30 | # form = form, avg=FALSE) 31 | # 32 | # batch_70_30 <- mean(e7030_loss) 33 | # 34 | # test_0.05_upd <- round(nrow(test) * 0.05,0) 35 | # test_0.1_upd <- round(nrow(test) * 0.1,0) 36 | # test_0.25_upd <- round(nrow(test) * 0.25,0) 37 | # test_onl_upd <- 1 38 | # 39 | # upd_schemes <- c(test_onl_upd, 40 | # test_0.05_upd, 41 | # test_0.1_upd, 42 | # test_0.25_upd) 43 | # 44 | # results_upds <- c() 45 | # for (updsch in upd_schemes) { 46 | # cat(".") 47 | # #updsch <- upd_schemes[2] 48 | # 49 | # s_vec <- split(1:nrow(test), ceiling(seq_along(1:nrow(test)) / updsch)) 50 | # 51 | # err_roll <- c() 52 | # train0 <- train 53 | # for (i in 1:length(s_vec)) { 54 | # 55 | # if (i == 1) { 56 | # err_i <- 57 | # pred_model(train = train, 58 | # test = test[s_vec[[i]],], 59 | # form = form, avg=FALSE) 60 | # 61 | # err_roll <- c(err_roll, err_i) 62 | # } else { 63 | # idx <- unlist(s_vec[seq_len(i-1)], use.names = FALSE) 64 | # traini <- rbind.data.frame(train, test[idx,]) 65 | # 66 | # err_i <- 67 | # pred_model(train = traini, 68 | # test = test[s_vec[[i]],], 69 | # form = form, avg=FALSE) 70 | # 71 | # err_roll <- c(err_roll, err_i) 72 | # } 73 | # 74 | # } 75 | # 76 | # results_upds <- c(results_upds, mean(err_roll)) 77 | # } 78 | # 79 | # results_upds <- c(results_upds, batch_70_30) 80 | # names(results_upds) <- c("online","x5perc","x10perc","x25perc","batch") 81 | # 82 | # results_upds 83 | # } 84 | 85 | 86 | wf_sample_size <- 87 | function(ds, 88 | form, 89 | predictive_algorithm = "lasso", 90 | nfolds, 91 | outer_split) { 92 | 93 | pred_model <- 94 | switch(predictive_algorithm, 95 | "rf" = RF_loss, 96 | "rbr" = RBR_loss, 97 | "mlp" = MLP_loss, 98 | "lasso" = LASSO_loss, 99 | "cart" = CART_loss, 100 | "gp" = GP_loss, 101 | RF_loss) 102 | 103 | library(forecast) 104 | nd <- ndiffs(ds) 105 | if (nd > 0) { 106 | ds <- diff(ds, differences=nd) 107 | } 108 | 109 | khat <- estimate_k(ds[1:(length(ds) * outer_split)], m.max = 30, tol = .01) 110 | if (khat < 8) khat <- 8 111 | 112 | x <- embed_timeseries(as.numeric(ds), khat+1) 113 | x <- head(x, 1000) 114 | 115 | iters <- seq(from = 100, to = 1000, by = 100) 116 | 117 | results <- vector("list", length(iters)-1) 118 | for (i in seq_along(iters)[-length(iters)]) { 119 | it <- iters[i] 120 | 121 | train <- x[1:it,] 122 | test <- x[(it+1):(iters[i+1]),] 123 | 124 | #print(nrow(train)) 125 | #print(nrow(test)) 126 | #cat("\n\n") 127 | true_loss <- 128 | pred_model(train = train, 129 | test = test, 130 | form = form) 131 | 132 | estimated_loss <- 133 | performance_estimation( 134 | train = train, 135 | form = form, 136 | pred_model = pred_model, 137 | nfolds = nfolds 138 | ) 139 | 140 | estimated_loss_on <- 141 | online_methods_pe(train = train, 142 | form = form, 143 | pred_model = pred_model) 144 | 145 | estimated_loss <- 146 | c(estimated_loss, 147 | estimated_loss_on) 148 | 149 | err_estimation <- sapply(estimated_loss, 150 | function(u) { 151 | ((u - true_loss) / true_loss) * 100 152 | }) 153 | 154 | results[[i]] <- err_estimation 155 | } 156 | 157 | results 158 | } 159 | 160 | 161 | workflow <- 162 | function(ds, 163 | form, 164 | predictive_algorithm = "rf", 165 | nfolds, 166 | outer_split, is_embedded=FALSE) { 167 | 168 | if(!is_embedded){ 169 | if (!(predictive_algorithm == "rbr")) { 170 | library(forecast) 171 | nd <- ndiffs(ds) 172 | if (nd > 0) { 173 | ds <- diff(ds, differences=nd) 174 | } 175 | } 176 | # 177 | khat <- estimate_k(ds[1:(length(ds) * outer_split)], m.max = 30, tol = .01) 178 | if (khat < 8) khat <- 8 179 | 180 | x <- embed_timeseries(as.numeric(ds), khat+1) 181 | } else { 182 | x<-ds 183 | } 184 | #head(x) 185 | 186 | xp <- partition(x, outer_split) 187 | 188 | train <- xp$train 189 | test <- xp$test 190 | 191 | pred_model <- 192 | switch(predictive_algorithm, 193 | "rf" = RF_loss, 194 | "rbr" = RBR_loss, 195 | "mlp" = MLP_loss, 196 | "lasso" = LASSO_loss, 197 | "cart" = CART_loss, 198 | "gp" = GP_loss, 199 | RF_loss) 200 | 201 | true_loss <- 202 | pred_model(train = train, 203 | test = test, 204 | form = form) 205 | 206 | estimated_loss <- 207 | performance_estimation( 208 | train = train, 209 | form = form, 210 | pred_model = pred_model, 211 | nfolds = nfolds 212 | ) 213 | 214 | estimated_loss_on <- 215 | online_methods_pe(train = train, 216 | form = form, 217 | pred_model = pred_model) 218 | 219 | estimated_loss <- 220 | c(estimated_loss, 221 | estimated_loss_on) 222 | 223 | err_estimation <- sapply(estimated_loss, 224 | function(u) { 225 | ((u - true_loss) / true_loss) * 100 226 | }) 227 | 228 | list(err_estimation=err_estimation, 229 | est_err = estimated_loss, 230 | err=true_loss) 231 | } 232 | 233 | 234 | performance_estimation <- 235 | function(train, form, pred_model, nfolds) { 236 | cat("Estimating loss using ...\n") 237 | cat("... std. x val ...\n") 238 | std_x_val <- 239 | kf_xval( 240 | x = train, 241 | nfolds = nfolds, 242 | FUN = pred_model, 243 | shuffle.rows = TRUE, 244 | form = form, 245 | average_results = TRUE) 246 | 247 | cat("... blocked x val ...\n") 248 | blocked_x_val <- 249 | blocked_kf_xval( 250 | x = train, 251 | nfolds = nfolds, 252 | FUN = pred_model, 253 | form = form) 254 | 255 | cat("... modified x val ...\n") 256 | mod_x_val <- 257 | modified_xval( 258 | x = train, 259 | nfolds = nfolds, 260 | FUN = pred_model, 261 | average_results = TRUE, 262 | form = form) 263 | 264 | cat("... hv blocked x val ...\n") 265 | hvb_x_val <- 266 | hv.block_xval( 267 | x = train, 268 | nfolds = nfolds, 269 | FUN = pred_model, 270 | average_results = TRUE, 271 | form = form 272 | ) 273 | 274 | cat("... preq blocks ...\n") 275 | preq_b <- 276 | prequential_in_blocks( 277 | x = train, 278 | nfolds = nfolds, 279 | FUN = pred_model, 280 | average_results = TRUE, 281 | form = form 282 | ) 283 | 284 | cat("... sliding preq blocks ...\n") 285 | sl_preq_b <- 286 | sliding_prequential_in_blocks( 287 | x = train, 288 | nfolds = nfolds, 289 | FUN = pred_model, 290 | average_results = TRUE, 291 | form = form 292 | ) 293 | 294 | cat("... preq blocks w gap ...\n") 295 | preq_b_gap <- 296 | prequential_in_blocks_gap( 297 | x = train, 298 | nfolds = nfolds, 299 | FUN = pred_model, 300 | average_results = TRUE, 301 | form = form 302 | ) 303 | 304 | cat("... holdout ...\n") 305 | hout <- 306 | holdout(x = train, 307 | FUN = pred_model, 308 | form = form) 309 | 310 | cat("... repeated holdout ...\n") 311 | rephout <- 312 | repeated_holdout( 313 | x = train, 314 | nreps = nfolds, 315 | train_size = .6, 316 | test_size = .1, 317 | average_results = TRUE, 318 | FUN = pred_model, 319 | form = form 320 | ) 321 | 322 | loss_estimations <- 323 | c(x_val = std_x_val, 324 | b_x_val = blocked_x_val, 325 | m_x_val = mod_x_val, 326 | hvb_x_val = hvb_x_val, 327 | preq_b = preq_b, 328 | sl_preq_b = sl_preq_b, 329 | preq_b_gap = preq_b_gap, 330 | hout = hout, 331 | rephout = rephout) 332 | 333 | loss_estimations 334 | } 335 | 336 | online_methods_pe <- 337 | function(train, form, pred_model) { 338 | cat("Estimating loss using ...\n") 339 | 340 | xp <- partition(train, .8) 341 | 342 | train_ <- xp$train 343 | validation <- xp$test 344 | 345 | cat("... preq lw ...\n") 346 | preq_lw <- 347 | PrequentialLandmark( 348 | train = train_, 349 | test = validation, 350 | FUN = pred_model, 351 | average_results = TRUE, 352 | form=form 353 | ) 354 | 355 | cat("... preq sw ...\n") 356 | preq_sw <- 357 | PrequentialSliding( 358 | train = train_, 359 | test = validation, 360 | FUN = pred_model, 361 | average_results = TRUE, 362 | form=form 363 | ) 364 | 365 | loss_estimations <- 366 | c(preq_lw = preq_lw, 367 | preq_sw = preq_sw) 368 | 369 | loss_estimations 370 | } 371 | -------------------------------------------------------------------------------- /stationarity.r: -------------------------------------------------------------------------------- 1 | load("data/tsl_uni_90_mix.rdata") 2 | 3 | len <- sapply(ts_list, length) 4 | 5 | save(len, file = "series_length.rdata") 6 | 7 | stationarity_test_df <- 8 | function(x) { 9 | cat(".") 10 | library(forecast) 11 | library(urca) 12 | 13 | nd <- ndiffs(x) 14 | if (nd>0) { 15 | x <- diff(x,differences=nd) 16 | } 17 | 18 | x <- as.vector(x) 19 | 20 | xtest <- ur.df(x,type="none",selectlags="AIC",lags=10) 21 | 22 | tr <- unname(xtest@teststat < min(xtest@cval))[1,1] 23 | 24 | st <- ifelse(tr, F, T) 25 | 26 | st 27 | } 28 | 29 | stationarity_test_wave <- 30 | function(x) { 31 | library(locits) 32 | 33 | nd <- ndiffs(x) 34 | if (nd>0) { 35 | x <- diff(x,differences=nd) 36 | } 37 | 38 | xp <- trunc_timeseries(x) 39 | 40 | xtest <- hwtos2(as.vector(xp)) 41 | 42 | nrej <- xtest$nreject 43 | 44 | st <- ifelse(nrej > 0, F, T) 45 | 46 | st 47 | } 48 | 49 | 50 | trunc_timeseries <- 51 | function(x) { 52 | max_power <- 4096 53 | seq_powers <- c(max_power, 54 | max_power / 2, 55 | max_power / 4, 56 | max_power / 8, 57 | max_power / 16) 58 | 59 | len <- length(x) 60 | x <- as.vector(x) 61 | dim_trc <- seq_powers[which(len - seq_powers + 1 > 0)[1]] 62 | 63 | truncd_ts <- x[(len - dim_trc + 1):len] 64 | 65 | truncd_ts 66 | } 67 | 68 | 69 | is_st_wave <- sapply(ts_list, stationarity_test_wave) 70 | is_st_df <- sapply(ts_list, stationarity_test_df) 71 | is_stationary_2ensemble <- is_st_wave & is_st_df 72 | 73 | table(is_st_wave) 74 | table(is_st_df) 75 | table(is_stationary_2ensemble) 76 | 77 | save(is_st_wave, 78 | is_st_df, 79 | is_stationary_2ensemble, 80 | file = "stationarity_tsdl.rdata") 81 | 82 | -------------------------------------------------------------------------------- /stationarity_tsdl.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcerqueira/experiments-performance_estimation/659e079b98f4aff97dcf959391431fc6da4a9d82/stationarity_tsdl.rdata -------------------------------------------------------------------------------- /synthetic/creating-synthetic-ds.r: -------------------------------------------------------------------------------- 1 | source("synthetic/simulate-ts.r") 2 | source("src/utils.r") 3 | 4 | library(forecast) 5 | library(tsensembler) 6 | 7 | mcreps <- 1000; 8 | seq. <- seq_len(mcreps) 9 | ts.len <- 200 10 | 11 | TS1 <- lapply(seq., function(j) { 12 | as_positive( 13 | as.vector( 14 | simulateLinearTS(ts.len, 15 | ar = TRUE, 16 | ma = FALSE, 17 | lags = 3, 18 | maxRoot = 5, 19 | n.start = 300)[["ts"]] 20 | ) 21 | ) 22 | }) 23 | 24 | TS2 <- lapply(seq., function(j) { 25 | as_positive( 26 | as.vector( 27 | simulateLinearTS(ts.len, 28 | ar = FALSE, 29 | ma = TRUE, 30 | lags = 1, 31 | maxRoot = 5, 32 | n.start = 100)[["ts"]] 33 | ) 34 | ) 35 | }) 36 | 37 | data(USAccDeaths) 38 | Y <- as.vector(USAccDeaths) 39 | arima.fit <- Arima(Y, order=c(12,0,0), seasonal = c(1, 0, 0)) 40 | 41 | TS3 <- lapply(seq., function(j) { 42 | as.vector( 43 | simulate(object = arima.fit, 44 | nsim = ts.len) 45 | ) 46 | }) 47 | 48 | TS1 <- lapply(TS1, embed_timeseries, embedding.dimension = 5) 49 | TS2 <- lapply(TS2, embed_timeseries, embedding.dimension = 5) 50 | TS3 <- lapply(TS3, embed_timeseries, embedding.dimension = 5) 51 | 52 | synthetic <- list(TS1=TS1, TS2=TS2,TS3=TS3) 53 | 54 | 55 | plot_df(synthetic$TS1[[180]]$target) 56 | plot_df(synthetic$TS2[[110]]$target) 57 | plot_df(synthetic$TS3[[2]]$target) 58 | 59 | plot_df <- 60 | function(x) { 61 | df <- as.data.frame(x) 62 | df$Time <- 1:nrow(df) 63 | 64 | ggplot(data=df, aes(x=Time, y=x)) + 65 | geom_line() + 66 | theme_minimal() + 67 | theme(axis.title.y = element_blank(), 68 | axis.text = element_text(size=12)) 69 | 70 | } 71 | 72 | 73 | 74 | save(synthetic, file = "data/synthetic.rdata") 75 | -------------------------------------------------------------------------------- /synthetic/fresults-analysis-synth.r: -------------------------------------------------------------------------------- 1 | load("/Users/vcerqueira/Desktop/Results/final_results_synthetic_ts1_preq_rbr.rdata") 2 | final_results2<-final_results 3 | load("final_results_synthetic_ts1_rbr.rdata") 4 | 5 | 6 | source("src/correlated-t-test.r") 7 | source("src/plots.r") 8 | 9 | err_estimation <- 10 | lapply(final_results, 11 | function(x) { 12 | x$err_estimation 13 | }) 14 | 15 | err_estimation2 <- 16 | lapply(final_results2, 17 | function(x) { 18 | x$err_estimation 19 | }) 20 | 21 | for (iter in seq_along(err_estimation)) { 22 | err_estimation[[iter]] <- 23 | c(err_estimation[[iter]], err_estimation2[[iter]]) 24 | } 25 | 26 | nms <- names(final_results[[1]][[1]]) 27 | names(final_results2[[1]][[1]]) 28 | fr <- do.call(rbind.data.frame, err_estimation) 29 | colnames(fr) <- nms 30 | colnames(fr) <- 31 | c("CV", "CV-Bl", "CV-Mod","CV-hvBl", 32 | "Preq-Bls", "Preq-Sld-Bls", 33 | "Preq-Bls-Gap","Holdout", "Rep-Holdout", 34 | "Preq-Grow","Preq-Slide") 35 | 36 | fr_abs <- abs(fr) 37 | 38 | fr_abs_rank <- apply(fr_abs, 1, rank) 39 | avg <- rowMeans(fr_abs_rank) 40 | sdev <- apply(fr_abs_rank,1, sd) 41 | 42 | avg_rank_plot <- 43 | function(avg, sdev) { 44 | require(reshape2) 45 | require(ggplot2) 46 | 47 | ord <- names(sort(avg)) 48 | 49 | methods <- names(avg) 50 | 51 | ds <- 52 | data.frame( 53 | avg = avg, 54 | sdev = sdev, 55 | methods = methods, 56 | row.names = NULL 57 | ) 58 | ds$methods <- factor(ds$methods, levels = ord) 59 | 60 | #ds <- melt(ds) 61 | 62 | ggplot(data = ds, 63 | aes(x = methods, 64 | y = avg)) + 65 | geom_bar(stat = "identity", 66 | fill = "#33CCCC") + 67 | theme_minimal() + 68 | theme(axis.text.x = element_text( 69 | angle = 45, 70 | size = 14, 71 | hjust = 1 72 | )) + 73 | theme(axis.text.y = element_text(size = 12), 74 | axis.title.y = element_text(size = 12)) + 75 | #geom_hline(yintercept = 1) + 76 | geom_errorbar(aes(ymin = avg - sdev, 77 | ymax = avg + sdev), 78 | width = .5, 79 | position = position_dodge(.9)) + 80 | labs(x = "", 81 | y = "Avg. rank & Std dev.", 82 | title = "") 83 | } 84 | 85 | avg_rank_plot(avg,sdev) 86 | percdiff_plot_log(fr) 87 | 88 | colMeans(fr) 89 | apply(fr,2, sd) 90 | 91 | 92 | -------------------------------------------------------------------------------- /synthetic/simulate-ts.r: -------------------------------------------------------------------------------- 1 | # FROM https://github.com/cbergmeir/tsExpKit/blob/master/R/simulateTS.R 2 | 3 | # The coefficients that are generated are chosen in a way that the resulting 4 | # processes are stationary (AR) and invertible (MA). 5 | # 6 | # TODO: function also necessary (and present) in tsDyn 7 | # For detailed explanation see the INS paper. 8 | # 9 | # @title Generate random coefficients for an ARMA model. 10 | # @param lags the number of lags 11 | # @param maxRoot the root of the characteristic polynomial is chosen between 1.1 and \code{maxRoot}. 12 | # @export 13 | generateRandomARMAParameters <- function(lags, maxRoot) { 14 | 15 | if(maxRoot <= 1.1) stop("maxRoot has to be bigger than 1.1") 16 | 17 | l <- lags# + 1 18 | s <- sign(runif(l,-1,1)) 19 | polyRoots <- s*runif(l,1.1,maxRoot) 20 | 21 | #calculate coefficients 22 | coeff <- 1 23 | for(root in polyRoots) coeff <- c(0,coeff) - c(root*coeff,0) 24 | 25 | nCoeff <- coeff / coeff[1] 26 | params <- - nCoeff[2:length(nCoeff)] 27 | 28 | return(params) 29 | } 30 | 31 | ##old version for parameter generation 32 | #generateRandomARMAParameters <- function(lags) { 33 | # rVals <- rnorm(lags) 34 | # values <- -sort(-abs(rVals))/(max(abs(rVals))+0.05) 35 | # factors <- rep(c(1, -1),floor(lags/2)) 36 | # if(lags %% 2 == 1) factors <- c(factors, 1) 37 | # params <- factors * values 38 | # return(params) 39 | #} 40 | 41 | ##old version for non-linear time series simulation 42 | #simulateNonlinearTS <- function(length, lags=4, seed=1) { 43 | # 44 | # set.seed(seed) 45 | # 46 | # maxlags <- lags + 10 47 | # model <- "nonlin" 48 | # 49 | # w <- rnorm(length+maxlags, sd = 0.01) 50 | # 51 | # lts <- simulateLinearTS(length+maxlags, lags=lags, seed=seed)[["ts"]] 52 | # 53 | # z <- rep(0, length+maxlags) 54 | # for(t in (maxlags-5):(length+maxlags)) 55 | # z[t] <- 0.7 * exp (-z[t - 1] / length) + 0.4 * cos (z[t - 2]) + 0.25 * atan(z[t - 3]) + w[t] +lts[t] 56 | # 57 | # ts <- z[maxlags:(length+maxlags-1)] 58 | # 59 | # return(list(lags=lags, model=model, seed=seed, ts=ts)) 60 | # 61 | #} 62 | 63 | #' This function can be used to generate pure AR, pure MA, and ARMA series. The AR part will be 64 | #' stationary and the MA part invertible. Therefore, the coefficients are not given directly, 65 | #' but a value \code{maxRoot} which controls the interval from which the roots for the 66 | #' characteristic polynomials are chosen. So, all the roots of the characteristic polynomials 67 | #' are real-valued. For a detailed explanation see the referenced literature. 68 | #' 69 | #' @title Generate AR, MA, and ARMA series 70 | #' @param length the length of the series to be generated 71 | #' @param ar if \code{TRUE}, series has an AR part 72 | #' @param ma if \code{TRUE}, series has an MA part 73 | #' @param lags the number of lags 74 | #' @param seed a seed used for random number generation 75 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 76 | #' @param n.start burn-in period. if NA, calculated automatically by arima.sim 77 | #' @param ... parameters passed to arima.sim 78 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series 79 | #' @references 80 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213. 81 | #' @export 82 | simulateLinearTS <- function (length, ar=TRUE, ma=TRUE, lags=4, seed=1, maxRoot=5.0, n.start = NA, ...) { 83 | 84 | set.seed(seed) 85 | 86 | params <- generateRandomARMAParameters(lags, maxRoot) 87 | 88 | #print(params) 89 | 90 | if(ar && !ma) { 91 | model <- "AR" 92 | paramsAr <- params 93 | paramsMa <- 0 94 | ts <- arima.sim(model=list(ar=params),n=length, n.start = n.start, ...) 95 | } else if(!ar && ma) { 96 | model <- "MA" 97 | paramsMa <- params 98 | paramsAr <- 0 99 | ts <- arima.sim(model=list(ma=params),n=length, n.start = n.start, ...) 100 | } else if(ar && ma) { 101 | model <- "ARMA" 102 | paramsMa <- params 103 | paramsAr <- generateRandomARMAParameters(lags, maxRoot) 104 | ts <- arima.sim(model=list(ar=paramsAr, ma=params),n=length, n.start = n.start) 105 | } 106 | 107 | return(list(paramsAr=paramsAr, paramsMa=paramsMa, lags=lags, model=model, seed=seed, maxRoot=maxRoot, ts=ts)) 108 | } 109 | 110 | #' This function can be used to generate nonlinear time series. 111 | #' It is similar to the function \code{\link{simulateLinearTS}}, but applies nonlinear functions to certain lags. 112 | #' The nonlinear functions currently used are: cos, sin, tanh, atan, and exp(-x/10000). 113 | #' For a detailed explanation see the referenced literature. 114 | #' 115 | #' @title Generate nonlinear time series 116 | #' @param length the length of the series to be generated 117 | #' @param lags the number of lags 118 | #' @param seed a seed used for random number generation 119 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 120 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series 121 | #' @references 122 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213. 123 | #' @export 124 | #' @export 125 | simulateNonlinearTS <- function(length, lags=4, seed=1, maxRoot=5.0) { 126 | 127 | # length=10000 128 | # lags=20 129 | # seed=3 130 | # maxRoot=5.0 131 | 132 | x <- NULL 133 | 134 | randNonlinX <- function(type, x) { 135 | nonlinX <- switch(type, cos(x), sin(x), tanh(x), atan(x), exp(-x/10000))#, cosh(x), sinh(x))#,1/(x*x), 1/(x*x*x))#x,tan(x)), 136 | return(nonlinX) 137 | } 138 | 139 | set.seed(seed) 140 | 141 | params <- generateRandomARMAParameters(lags, maxRoot) 142 | #print(params) 143 | 144 | x[1:lags] <- rnorm (lags, sd = 0.5) 145 | 146 | type <- ceiling(runif(lags,min=0, max=5)) 147 | 148 | #print(type) 149 | 150 | #type[1:lags] <- 1 151 | 152 | for (i in ((lags+1):(length+2*lags))) { 153 | 154 | x[i] <- rnorm (1, sd = 0.5) 155 | 156 | for (j in 1:lags) { 157 | x[i] <- x[i] + params[j]*randNonlinX(type[j], x[i-j]) 158 | } 159 | } 160 | 161 | model <- "nonlin" 162 | ts <- x[(2*lags+1):(length+2*lags)] 163 | 164 | return(list(lags=lags, model=model, seed=seed, ts=ts)) 165 | 166 | } 167 | 168 | ##example code 169 | #par(mfrow = c(4, 1)) 170 | # 171 | #ts <- simulateNonlinearTS(1000, seed=3, lags=15) 172 | #ts 173 | #plot(ts$ts, type="l") 174 | #acf(ts$ts) 175 | #tnnTest(ts$ts) 176 | # 177 | #ts <- simulateNonlinearTS(1000, seed=3, lags=9) 178 | #ts 179 | #plot(ts$ts, type="l") 180 | #acf(ts$ts) 181 | 182 | 183 | 184 | ##----------------------- 185 | #VAR 186 | 187 | mcevec <- function(lambda, c, n, d) { 188 | 189 | v <- vector() 190 | v[1:n] <- 0 191 | v[(n-d+1):n] <- c 192 | 193 | for(j in (n-d):1) { 194 | v[j] <- lambda*v[j+d] 195 | } 196 | 197 | v 198 | } 199 | 200 | #mcgenevec <- function(lambda, c, w, n, d) { 201 | # 202 | # v <- vector() 203 | # v[1:n] <- 0 204 | # v[(n-d+1):n] <- c 205 | # 206 | # for(j in (n-d):1) { 207 | # v[j] <- lambda*v[j+d] + w[j+d] 208 | # } 209 | # 210 | # v 211 | #} 212 | 213 | companionMatrixFromEigenvalues <- function(q, lambda, c, d) { 214 | 215 | nchain <- length(q) 216 | n <- sum(q) 217 | 218 | #v <- matrix(0, nrow=n, ncol=n) 219 | 220 | v <- matrix(0, nrow=n, ncol=n) 221 | 222 | i <- 0 223 | for(k in 1:nchain) { 224 | v[, i+1] <- mcevec(lambda[k], c[, i+1], n, d) 225 | 226 | # if(q[k] > 1) { 227 | # for(j in 2:q[k]) { 228 | # mcgenevec(lambda[k], c[, i+1], v[,i+j-1], n, d) 229 | # } 230 | # } 231 | i <- i+q[k] 232 | } 233 | 234 | X <- v 235 | #X 236 | 237 | F <- X%*%diag(lambda) %*% solve(X) 238 | F 239 | } 240 | 241 | 242 | generateRandomStableCompanionMatrix <- function(dim = 2, order = 2) { 243 | 244 | lenq <- order*dim 245 | 246 | # c <- matrix(runif(lenq), nrow=order, ncol=lenq) 247 | # c[which(c>0.5)] <- 1 248 | # c[which(c<=0.5)] <- 0 249 | 250 | c <- matrix(1-diag(dim),nrow=dim,ncol=lenq) 251 | 252 | lambda <- runif(lenq, min=0.01) 253 | 254 | #TODO: generation of complex eigenvalues results in complex values in F. This shouldn't be the case. 255 | # ##generate eigenvalues within unit circle. 256 | # 257 | # if((lenq%%2) == 1) { 258 | # 259 | # if(lenq==1 || lenq==3) { 260 | # lambda <- runif(lenq, min=0.01) 261 | # } else { 262 | # 263 | # theta <- runif((lenq%/%2) - 1, min=0, max=2*pi) 264 | # r <- runif((lenq%/%2) - 1, min=0.01) 265 | # 266 | # lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta)) 267 | # lambda <- c(lambda, Conj(lambda)) 268 | # 269 | # lambda <- c(lambda, runif(3, min=0.01)) 270 | # 271 | # } 272 | # } else { 273 | # 274 | # theta <- runif(lenq%/%2, min=0, max=2*pi) 275 | # r <- runif(lenq%/%2, min=0.01) 276 | # 277 | # lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta)) 278 | # lambda <- c(lambda, Conj(lambda)) 279 | # 280 | # } 281 | 282 | F <- companionMatrixFromEigenvalues(seq(1,1,length=lenq), lambda, c, dim) 283 | 284 | F[which(Mod(F)<1e-10)] <- 0 285 | 286 | if(sum(Mod(Im(F)))> 1e-10) { 287 | warning("generateRandomStableCompanionMatrix: Companion matrix is not real-valued") 288 | print(F) 289 | } 290 | 291 | F <- Re(F) 292 | 293 | list(F=F, lambda=lambda) 294 | } 295 | 296 | #' This function can be used to simulate data from a random, stable VAR process. 297 | #' 298 | #' @title Simulate data from stable VAR process 299 | #' @param dim the dimension of the VAR (bivariate, trivariate,...) 300 | #' @param order the order of the VAR (how many lags) 301 | #' @param sd the standard deviation of the noise to be included 302 | #' @param length the length of the series to be generated 303 | #' @return a matrix containing the simulated data 304 | #' @references 305 | #' G.N. Boshnakov, B.M. Iqelan (2009). Generation of time series models with given spectral properties. Journal of Time Series Analysis 30(3):349-368. 306 | #' @export 307 | simulateStableVarProcess <- function(dim=3, order=2, sd=1, length=1000) { 308 | 309 | compMat <- generateRandomStableCompanionMatrix(dim=dim, order=order) 310 | 311 | A <- list() 312 | for(i in 1:order) { 313 | 314 | A[[i]] <- compMat$F[1:dim, ((i-1)*dim+1):(i*dim)] 315 | 316 | } 317 | #A 318 | 319 | l <- length+100 320 | 321 | repeat { 322 | 323 | #------ 324 | #generate random covariance structure and noise using this covariance structure 325 | cov <- matrix(runif(dim*dim, min=-1), ncol=dim) 326 | diag(cov) <- 1 327 | cov <- cov*t(cov) 328 | 329 | L <- try(chol(cov)) 330 | if(class(L) != "try-error") break 331 | } 332 | 333 | L <- chol(cov) 334 | 335 | u <- t(L) %*% matrix(rnorm(l*dim, sd=sd), nrow=dim) 336 | #------ 337 | 338 | y <- matrix(0, ncol=l, nrow=dim) 339 | 340 | y[,1:order] <- rnorm(dim*order, sd=sd) 341 | 342 | for(i in (order+1):l) { 343 | 344 | y[,i] <- u[,i-order] 345 | 346 | for(j in 1:order) { 347 | y[,i] <- y[,i] + A[[j]]%*%y[,i-j] 348 | } 349 | #y[,i] <- A1%*%y[,i-1] + A2%*%y[,i-2] + u[,i-2] 350 | } 351 | 352 | y[,101:l] 353 | } -------------------------------------------------------------------------------- /synthetic_data_generation/creating-synthetic-ds.r: -------------------------------------------------------------------------------- 1 | source("simulate-ts.r") 2 | source("../src/utils.r") 3 | 4 | library(forecast) 5 | library(tsensembler) 6 | 7 | mcreps <- 1000; 8 | seq. <- seq_len(mcreps) 9 | ts.len <- 200 10 | 11 | TS1 <- lapply(seq., function(j) { 12 | as_positive( 13 | as.vector( 14 | simulateLinearTS(ts.len, 15 | ar = TRUE, 16 | ma = FALSE, 17 | lags = 3, 18 | maxRoot = 5, 19 | n.start = 300)[["ts"]] 20 | ) 21 | ) 22 | }) 23 | 24 | TS2 <- lapply(seq., function(j) { 25 | as_positive( 26 | as.vector( 27 | simulateLinearTS(ts.len, 28 | ar = FALSE, 29 | ma = TRUE, 30 | lags = 1, 31 | maxRoot = 5, 32 | n.start = 100)[["ts"]] 33 | ) 34 | ) 35 | }) 36 | 37 | data(USAccDeaths) 38 | Y <- as.vector(USAccDeaths) 39 | arima.fit <- Arima(Y, order=c(12,0,0), seasonal = c(1, 0, 0)) 40 | 41 | TS3 <- lapply(seq., function(j) { 42 | as.vector( 43 | simulate(object = arima.fit, 44 | nsim = ts.len) 45 | ) 46 | }) 47 | 48 | TS1 <- lapply(TS1, embed_timeseries, embedding.dimension = 5) 49 | TS2 <- lapply(TS2, embed_timeseries, embedding.dimension = 5) 50 | TS3 <- lapply(TS3, embed_timeseries, embedding.dimension = 5) 51 | 52 | synthetic <- list(TS1=TS1, TS2=TS2,TS3=TS3) 53 | 54 | 55 | save(synthetic, file = "data/synthetic.rdata") 56 | -------------------------------------------------------------------------------- /synthetic_data_generation/simulate-ts.r: -------------------------------------------------------------------------------- 1 | # FROM https://github.com/cbergmeir/tsExpKit/blob/master/R/simulateTS.R 2 | 3 | # The coefficients that are generated are chosen in a way that the resulting 4 | # processes are stationary (AR) and invertible (MA). 5 | # 6 | # TODO: function also necessary (and present) in tsDyn 7 | # For detailed explanation see the INS paper. 8 | # 9 | # @title Generate random coefficients for an ARMA model. 10 | # @param lags the number of lags 11 | # @param maxRoot the root of the characteristic polynomial is chosen between 1.1 and \code{maxRoot}. 12 | # @export 13 | generateRandomARMAParameters <- function(lags, maxRoot) { 14 | 15 | if(maxRoot <= 1.1) stop("maxRoot has to be bigger than 1.1") 16 | 17 | l <- lags# + 1 18 | s <- sign(runif(l,-1,1)) 19 | polyRoots <- s*runif(l,1.1,maxRoot) 20 | 21 | #calculate coefficients 22 | coeff <- 1 23 | for(root in polyRoots) coeff <- c(0,coeff) - c(root*coeff,0) 24 | 25 | nCoeff <- coeff / coeff[1] 26 | params <- - nCoeff[2:length(nCoeff)] 27 | 28 | return(params) 29 | } 30 | 31 | ##old version for parameter generation 32 | #generateRandomARMAParameters <- function(lags) { 33 | # rVals <- rnorm(lags) 34 | # values <- -sort(-abs(rVals))/(max(abs(rVals))+0.05) 35 | # factors <- rep(c(1, -1),floor(lags/2)) 36 | # if(lags %% 2 == 1) factors <- c(factors, 1) 37 | # params <- factors * values 38 | # return(params) 39 | #} 40 | 41 | ##old version for non-linear time series simulation 42 | #simulateNonlinearTS <- function(length, lags=4, seed=1) { 43 | # 44 | # set.seed(seed) 45 | # 46 | # maxlags <- lags + 10 47 | # model <- "nonlin" 48 | # 49 | # w <- rnorm(length+maxlags, sd = 0.01) 50 | # 51 | # lts <- simulateLinearTS(length+maxlags, lags=lags, seed=seed)[["ts"]] 52 | # 53 | # z <- rep(0, length+maxlags) 54 | # for(t in (maxlags-5):(length+maxlags)) 55 | # z[t] <- 0.7 * exp (-z[t - 1] / length) + 0.4 * cos (z[t - 2]) + 0.25 * atan(z[t - 3]) + w[t] +lts[t] 56 | # 57 | # ts <- z[maxlags:(length+maxlags-1)] 58 | # 59 | # return(list(lags=lags, model=model, seed=seed, ts=ts)) 60 | # 61 | #} 62 | 63 | #' This function can be used to generate pure AR, pure MA, and ARMA series. The AR part will be 64 | #' stationary and the MA part invertible. Therefore, the coefficients are not given directly, 65 | #' but a value \code{maxRoot} which controls the interval from which the roots for the 66 | #' characteristic polynomials are chosen. So, all the roots of the characteristic polynomials 67 | #' are real-valued. For a detailed explanation see the referenced literature. 68 | #' 69 | #' @title Generate AR, MA, and ARMA series 70 | #' @param length the length of the series to be generated 71 | #' @param ar if \code{TRUE}, series has an AR part 72 | #' @param ma if \code{TRUE}, series has an MA part 73 | #' @param lags the number of lags 74 | #' @param seed a seed used for random number generation 75 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 76 | #' @param n.start burn-in period. if NA, calculated automatically by arima.sim 77 | #' @param ... parameters passed to arima.sim 78 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series 79 | #' @references 80 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213. 81 | #' @export 82 | simulateLinearTS <- function (length, ar=TRUE, ma=TRUE, lags=4, seed=1, maxRoot=5.0, n.start = NA, ...) { 83 | 84 | set.seed(seed) 85 | 86 | params <- generateRandomARMAParameters(lags, maxRoot) 87 | 88 | #print(params) 89 | 90 | if(ar && !ma) { 91 | model <- "AR" 92 | paramsAr <- params 93 | paramsMa <- 0 94 | ts <- arima.sim(model=list(ar=params),n=length, n.start = n.start, ...) 95 | } else if(!ar && ma) { 96 | model <- "MA" 97 | paramsMa <- params 98 | paramsAr <- 0 99 | ts <- arima.sim(model=list(ma=params),n=length, n.start = n.start, ...) 100 | } else if(ar && ma) { 101 | model <- "ARMA" 102 | paramsMa <- params 103 | paramsAr <- generateRandomARMAParameters(lags, maxRoot) 104 | ts <- arima.sim(model=list(ar=paramsAr, ma=params),n=length, n.start = n.start) 105 | } 106 | 107 | return(list(paramsAr=paramsAr, paramsMa=paramsMa, lags=lags, model=model, seed=seed, maxRoot=maxRoot, ts=ts)) 108 | } 109 | 110 | #' This function can be used to generate nonlinear time series. 111 | #' It is similar to the function \code{\link{simulateLinearTS}}, but applies nonlinear functions to certain lags. 112 | #' The nonlinear functions currently used are: cos, sin, tanh, atan, and exp(-x/10000). 113 | #' For a detailed explanation see the referenced literature. 114 | #' 115 | #' @title Generate nonlinear time series 116 | #' @param length the length of the series to be generated 117 | #' @param lags the number of lags 118 | #' @param seed a seed used for random number generation 119 | #' @param maxRoot the roots of the characteristic polynomials are chosen between 1.1 and \code{maxRoot} 120 | #' @return a list containing all the parameters, and a member \code{ts} with the generated series 121 | #' @references 122 | #' C. Bergmeir, J.M. Benítez, On the Use of Cross-validation for Time Series Predictor Evaluation. Information Sciences 191 (2012) 192-213. 123 | #' @export 124 | #' @export 125 | simulateNonlinearTS <- function(length, lags=4, seed=1, maxRoot=5.0) { 126 | 127 | # length=10000 128 | # lags=20 129 | # seed=3 130 | # maxRoot=5.0 131 | 132 | x <- NULL 133 | 134 | randNonlinX <- function(type, x) { 135 | nonlinX <- switch(type, cos(x), sin(x), tanh(x), atan(x), exp(-x/10000))#, cosh(x), sinh(x))#,1/(x*x), 1/(x*x*x))#x,tan(x)), 136 | return(nonlinX) 137 | } 138 | 139 | set.seed(seed) 140 | 141 | params <- generateRandomARMAParameters(lags, maxRoot) 142 | #print(params) 143 | 144 | x[1:lags] <- rnorm (lags, sd = 0.5) 145 | 146 | type <- ceiling(runif(lags,min=0, max=5)) 147 | 148 | #print(type) 149 | 150 | #type[1:lags] <- 1 151 | 152 | for (i in ((lags+1):(length+2*lags))) { 153 | 154 | x[i] <- rnorm (1, sd = 0.5) 155 | 156 | for (j in 1:lags) { 157 | x[i] <- x[i] + params[j]*randNonlinX(type[j], x[i-j]) 158 | } 159 | } 160 | 161 | model <- "nonlin" 162 | ts <- x[(2*lags+1):(length+2*lags)] 163 | 164 | return(list(lags=lags, model=model, seed=seed, ts=ts)) 165 | 166 | } 167 | 168 | ##example code 169 | #par(mfrow = c(4, 1)) 170 | # 171 | #ts <- simulateNonlinearTS(1000, seed=3, lags=15) 172 | #ts 173 | #plot(ts$ts, type="l") 174 | #acf(ts$ts) 175 | #tnnTest(ts$ts) 176 | # 177 | #ts <- simulateNonlinearTS(1000, seed=3, lags=9) 178 | #ts 179 | #plot(ts$ts, type="l") 180 | #acf(ts$ts) 181 | 182 | 183 | 184 | ##----------------------- 185 | #VAR 186 | 187 | mcevec <- function(lambda, c, n, d) { 188 | 189 | v <- vector() 190 | v[1:n] <- 0 191 | v[(n-d+1):n] <- c 192 | 193 | for(j in (n-d):1) { 194 | v[j] <- lambda*v[j+d] 195 | } 196 | 197 | v 198 | } 199 | 200 | #mcgenevec <- function(lambda, c, w, n, d) { 201 | # 202 | # v <- vector() 203 | # v[1:n] <- 0 204 | # v[(n-d+1):n] <- c 205 | # 206 | # for(j in (n-d):1) { 207 | # v[j] <- lambda*v[j+d] + w[j+d] 208 | # } 209 | # 210 | # v 211 | #} 212 | 213 | companionMatrixFromEigenvalues <- function(q, lambda, c, d) { 214 | 215 | nchain <- length(q) 216 | n <- sum(q) 217 | 218 | #v <- matrix(0, nrow=n, ncol=n) 219 | 220 | v <- matrix(0, nrow=n, ncol=n) 221 | 222 | i <- 0 223 | for(k in 1:nchain) { 224 | v[, i+1] <- mcevec(lambda[k], c[, i+1], n, d) 225 | 226 | # if(q[k] > 1) { 227 | # for(j in 2:q[k]) { 228 | # mcgenevec(lambda[k], c[, i+1], v[,i+j-1], n, d) 229 | # } 230 | # } 231 | i <- i+q[k] 232 | } 233 | 234 | X <- v 235 | #X 236 | 237 | F <- X%*%diag(lambda) %*% solve(X) 238 | F 239 | } 240 | 241 | 242 | generateRandomStableCompanionMatrix <- function(dim = 2, order = 2) { 243 | 244 | lenq <- order*dim 245 | 246 | # c <- matrix(runif(lenq), nrow=order, ncol=lenq) 247 | # c[which(c>0.5)] <- 1 248 | # c[which(c<=0.5)] <- 0 249 | 250 | c <- matrix(1-diag(dim),nrow=dim,ncol=lenq) 251 | 252 | lambda <- runif(lenq, min=0.01) 253 | 254 | #TODO: generation of complex eigenvalues results in complex values in F. This shouldn't be the case. 255 | # ##generate eigenvalues within unit circle. 256 | # 257 | # if((lenq%%2) == 1) { 258 | # 259 | # if(lenq==1 || lenq==3) { 260 | # lambda <- runif(lenq, min=0.01) 261 | # } else { 262 | # 263 | # theta <- runif((lenq%/%2) - 1, min=0, max=2*pi) 264 | # r <- runif((lenq%/%2) - 1, min=0.01) 265 | # 266 | # lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta)) 267 | # lambda <- c(lambda, Conj(lambda)) 268 | # 269 | # lambda <- c(lambda, runif(3, min=0.01)) 270 | # 271 | # } 272 | # } else { 273 | # 274 | # theta <- runif(lenq%/%2, min=0, max=2*pi) 275 | # r <- runif(lenq%/%2, min=0.01) 276 | # 277 | # lambda <- complex(real=r*cos(theta), imaginary=r*sin(theta)) 278 | # lambda <- c(lambda, Conj(lambda)) 279 | # 280 | # } 281 | 282 | F <- companionMatrixFromEigenvalues(seq(1,1,length=lenq), lambda, c, dim) 283 | 284 | F[which(Mod(F)<1e-10)] <- 0 285 | 286 | if(sum(Mod(Im(F)))> 1e-10) { 287 | warning("generateRandomStableCompanionMatrix: Companion matrix is not real-valued") 288 | print(F) 289 | } 290 | 291 | F <- Re(F) 292 | 293 | list(F=F, lambda=lambda) 294 | } 295 | 296 | #' This function can be used to simulate data from a random, stable VAR process. 297 | #' 298 | #' @title Simulate data from stable VAR process 299 | #' @param dim the dimension of the VAR (bivariate, trivariate,...) 300 | #' @param order the order of the VAR (how many lags) 301 | #' @param sd the standard deviation of the noise to be included 302 | #' @param length the length of the series to be generated 303 | #' @return a matrix containing the simulated data 304 | #' @references 305 | #' G.N. Boshnakov, B.M. Iqelan (2009). Generation of time series models with given spectral properties. Journal of Time Series Analysis 30(3):349-368. 306 | #' @export 307 | simulateStableVarProcess <- function(dim=3, order=2, sd=1, length=1000) { 308 | 309 | compMat <- generateRandomStableCompanionMatrix(dim=dim, order=order) 310 | 311 | A <- list() 312 | for(i in 1:order) { 313 | 314 | A[[i]] <- compMat$F[1:dim, ((i-1)*dim+1):(i*dim)] 315 | 316 | } 317 | #A 318 | 319 | l <- length+100 320 | 321 | repeat { 322 | 323 | #------ 324 | #generate random covariance structure and noise using this covariance structure 325 | cov <- matrix(runif(dim*dim, min=-1), ncol=dim) 326 | diag(cov) <- 1 327 | cov <- cov*t(cov) 328 | 329 | L <- try(chol(cov)) 330 | if(class(L) != "try-error") break 331 | } 332 | 333 | L <- chol(cov) 334 | 335 | u <- t(L) %*% matrix(rnorm(l*dim, sd=sd), nrow=dim) 336 | #------ 337 | 338 | y <- matrix(0, ncol=l, nrow=dim) 339 | 340 | y[,1:order] <- rnorm(dim*order, sd=sd) 341 | 342 | for(i in (order+1):l) { 343 | 344 | y[,i] <- u[,i-order] 345 | 346 | for(j in 1:order) { 347 | y[,i] <- y[,i] + A[[j]]%*%y[,i-j] 348 | } 349 | #y[,i] <- A1%*%y[,i-1] + A2%*%y[,i-2] + u[,i-2] 350 | } 351 | 352 | y[,101:l] 353 | } --------------------------------------------------------------------------------