├── .gitignore
├── data_parameters.r
├── README.md
├── job_2009.sbatch
├── job_2013.sbatch
├── job_2013_a.sbatch
├── data_load.r
├── fit.r
├── plot_functions.r
├── find_dl.r
├── data_processing.r
├── utils.R
├── eda.r
├── extract_speed.R
├── paper_plots.r
├── var_eda.R
├── dl_out_plot.r
└── find_var.r

/.gitignore:
--------------------------------------------------------------------------------
cl_runs/
msk/
models
*.sbatch
.DS_Store
.Rhistory
.find_dl.r.swp
Rplots.pdf
--------------------------------------------------------------------------------
/data_parameters.r:
--------------------------------------------------------------------------------
max.lags = 12
horizon = 7
train_days = readRDS(gzcon(url('https://app.box.com/shared/static/rvtrajgihbznvglc3qgq36lmdqkqf2o3.rds')))
test_days = readRDS(gzcon(url('https://app.box.com/shared/static/88n6wwvlseuz0iwr0gz7752pnmr6mfjg.rds')))
sensor_col = 11
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# dl-traffic
Code for the "Deep Learning for Short-Term Traffic Flow Prediction" paper (https://arxiv.org/abs/1604.04527)

This repository contains all the scripts that were used for the empirical results presented in the paper.

All of the data is stored on Box, and URLs are used in the code.

The original (un-cleaned) data is stored at https://app.box.com/s/jgnto4zn1008965vuztjr9z2ignpj70m. The extract_speed.R script reads the raw SQLite files and saves them to the RDS files used later in the analysis.
--------------------------------------------------------------------------------
/job_2009.sbatch:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --job-name=2009
#SBATCH --output=cl_runs/2009_%A_%a.out
#SBATCH --error=cl_runs/2009_%A_%a.out
#SBATCH --time=36:00:00
#SBATCH --partition=sandyb
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --array=2-21

module load java
module load R

# Print this sub-job's task ID
echo "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID

Rscript find_dl.r ../../data/2009/gcm21_i_tf_20.rds 2000 $SLURM_ARRAY_TASK_ID "use only d.train"


--------------------------------------------------------------------------------
/job_2013.sbatch:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --job-name=2013
#SBATCH --output=cl_runs/2013_%A_%a.out
#SBATCH --error=cl_runs/2013_%A_%a.out
#SBATCH --time=36:00:00
#SBATCH --partition=sandyb
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --array=1-16

module load java
module load R

# Print this sub-job's task ID
echo "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID

Rscript find_dl.r data/2013/gcm21_i_m_8_n.rds 250 11 "selected days; with lasso; depth=10; hidden=200"

--------------------------------------------------------------------------------
/job_2013_a.sbatch:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --job-name=2013
#SBATCH --output=cl_runs/2013_%A_%a.out
#SBATCH --error=cl_runs/2013_%A_%a.out
#SBATCH --time=36:00:00
#SBATCH --partition=sandyb
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --array=2-21
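# Each array task presumably fits an ensemble for one sensor: find_dl.r reads
# the sensor column as its third argument, so the task IDs 2-21 map to the 20
# sensor columns (inferred from the Rscript call below; not documented).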

module load java
module load R

# Print this sub-job's task ID
echo "My SLURM_ARRAY_TASK_ID: " $SLURM_ARRAY_TASK_ID

Rscript find_dl.r ../../data/2013/gcm21_i_tf_20.rds 2000 $SLURM_ARRAY_TASK_ID "cluster with time"


--------------------------------------------------------------------------------
/data_load.r:
--------------------------------------------------------------------------------
# gcm21_i_m_8.rds = https://app.box.com/shared/static/bt69dviydld3o3k87max5od5zhnihbf6.rds
combo = 'm_8_lasso'
d = readRDS(gzcon(url('https://app.box.com/shared/static/bt69dviydld3o3k87max5od5zhnihbf6.rds')))
sz = dim(d)
ld = lagged_data(d, max.lags)
days = yday(ld$dt)
udays = unique(days)
train_days = readRDS(gzcon(url('https://app.box.com/shared/static/rvtrajgihbznvglc3qgq36lmdqkqf2o3.rds')))
test_days = readRDS(gzcon(url('https://app.box.com/shared/static/88n6wwvlseuz0iwr0gz7752pnmr6mfjg.rds')))
test_rows = which(days %in% test_days)
train_rows = which(days %in% train_days)
ld.test = ld[test_rows,]
ld.train = ld[train_rows,]

# load predicted data
y_dl = readRDS("fit_data/y_dl_m_8_lasso.rds")
y_dl_t = readRDS("fit_data/y_dl_t_m_8_lasso.rds")

y_lars = readRDS("fit_data/y_lars.rds")
y_lars_t = readRDS("fit_data/y_t_lars.rds")
--------------------------------------------------------------------------------
/fit.r:
--------------------------------------------------------------------------------

fit_lars = function(i, first_predictor, last_predictor, ld.train)
{
  m2 = lars(x=as.matrix(ld.train[,first_predictor:last_predictor]), y=ld.train[,i], type=c("lasso"))
  a = summary(m2)
  c = coef(m2, s=which.min(a$Cp), mode="step")
  return(c)
}

fit_arima = function(i, training_set_sz, predictors_indeces)
{
  print(i)
  m = auto.arima(ld[1:training_set_sz,i], xreg=as.matrix(ld[1:training_set_sz,predictors_indeces]))
  return(m)
}
fit_dl = function(ld.hex, i, first_predictor, last_predictor)
{
  ld.dl = h2o.deeplearning(x=first_predictor:last_predictor, y = i, training_frame = ld.hex)
  return(ld.dl)
}

best_dl = function(d.train, d.test, sensor_col, predictors, ensemble_size)
{
  best_err = .Machine$double.xmax
  m = NULL # best model found so far
  # models = vector("list", ensemble_size)
  for(i in 1:ensemble_size)
  {
    if (i %% 10 == 0)
      print(i)
    # rand_activation <- c("Tanh", "Rectifier", "TanhWithDropout", "Maxout")[sample(1:4,1)]
    rand_activation <- c("Tanh", "Rectifier")[sample(1:2,1)]
    rand_numlayers <- sample(1:10,1)
    rand_hidden <- c(sample(1:200,rand_numlayers,T))
    rand_l1 <- runif(1, 1e-6, 1e-4)
    # a candidate that errors or warns gets the worst possible score, so it is
    # never selected
    err = tryCatch(
      {
        dlmodel <- h2o.deeplearning(x=predictors,
                                    y = sensor_col, training_frame = d.train,
                                    validation_frame = d.test,
                                    hidden=rand_hidden,
                                    activation=rand_activation,
                                    l1=rand_l1,
                                    max_w2=10,
                                    epochs = 0.1)
        dlmodel@model$validation_metrics@metrics$mean_residual_deviance
      }, warning = function(war) {
        return(.Machine$double.xmax)
      }, error = function(e) {
        return(.Machine$double.xmax)
      }
    )
    # models <- c(models, dlmodel)
    # models[[i]] <- dlmodel
    if (err < best_err) {
      best_err <- err
      m <- dlmodel
    }
  }
  # select the best model based on the mean residual deviance on the test set
  # best_err = .Machine$double.xmax
  # for (i in 1:length(models)) {
  #   err <- models[[i]]@model$validation_metrics@metrics$mean_residual_deviance
  #   if (err < best_err) {
  #     best_err <- err
  #     best_model <- models[[i]]
  #   }
  # }
  # m = best_model
  return(m)
}
--------------------------------------------------------------------------------
/plot_functions.r:
--------------------------------------------------------------------------------
fcast_data_plot = function(day, days_v, fcast, data, model_name)
{
  rows = which(days_v == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  par(mar = c(0,0,0,0))
  x = data[forward_rows, 1]
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  # par(mar = c(4.5,4,2,0))
  lwd = 4
  plot(x, 2.23694*data[forward_rows, sensor_col], type='l', lty=1, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ylim=c(5,80), xaxt='n', yaxt = 'n', ann=FALSE, bty="n") # names(d)[sensor_col]
  lines(x, 2.23694*fcast[forward_rows], type='l', lty=2, col=2, lwd=lwd) # there is no time column
  lines(x, 2.23694*data[rows[1:(n_rows - horizon)], sensor_col], lty=4, col=4, lwd=lwd)
  legend("bottomleft", legend = c('data', model_name, 'const'), lty=c(1,2,4), col=c(1,2,4), lwd=3)
  nme = paste(sensor_col, toString(day), 'dl', combo, sep='_')
  abline(55, 0, col='green', lwd=3, lty=4)
  abline(v=8, col="orange", lty=3, lwd=4)
}
qqplot.data <- function (vec) # argument: vector of numbers
{
  # following four lines from base R's qqline()
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]

  d <- data.frame(resids = vec)

  ggplot(d, aes(sample = resids)) + stat_qq() + geom_abline(slope = slope, intercept = int) + theme_bw(base_size = 36)
}

res_get = function(day, days_v, fcast, data)
{
  rows = which(days_v == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  x = data[forward_rows, 1]
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  data_ = 2.23694*data[forward_rows, sensor_col]
  fcast_ = 2.23694*fcast[forward_rows]
  res = data_ - fcast_
  return(res)
}

res_plot = function(day, days_v, fcast, data, model_name)
{
  rows = which(days_v == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  x = data[forward_rows, 1]
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  data_ = 2.23694*data[forward_rows, sensor_col]
  fcast_ = 2.23694*fcast[forward_rows]
  par(mar = c(2,2,4,0), mfrow=c(2,2))
  lwd = 4
  res = data_ - fcast_
  plot(x, res, type='p', lty=1, pch=16, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ann=T, bty="n")
  qqnorm(res)
  qqline(res)
  acf(res)
  plot(x, data_, type='p', pch=16, cex=0.4, lty=1, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ylim=c(5,80), ann=FALSE, bty="n")
  lines(x, fcast_, type='p', pch=16, cex=0.4, col=2, lwd=lwd)
}
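# A usage sketch (hypothetical day number; mirrors the calls in paper_plots.r,
# which sources this file and supplies days/fcast/data):
#   days = lubridate::yday(ld.test$dt)
#   fcast_data_plot(283, days, y_dl, ld.test, 'dl')
#   res_plot(283, days, y_dl, ld.test, 'dl')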
--------------------------------------------------------------------------------
/find_dl.r:
--------------------------------------------------------------------------------
library(lubridate)
library(h2o)
source('~/Dropbox/utils.R')
setwd('~/Dropbox/papers/dl-traffic/')
source('src/fit.r')
source('src/data_processing.r')
source('src/data_parameters.r')


localH2O = h2o.init(max_mem_size = "8g", nthreads=-1)
# path = paste('../../data', year, sprintf("gcm21_i_%s_%d.rds",'m', 6), sep='/')
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 4) # the script expects four arguments
{
  path = 'data/2013/gcm21_i_m_8_n.rds'
  ensemble_size = 1
  sensor_col = 11
  notes = "default parameters"
} else {
  path = args[1]
  ensemble_size = as.integer(args[2])
  sensor_col = as.integer(args[3])
  notes = args[4]
}

d = readRDS(path)
sz = dim(d)
ld = lagged_data(d, max.lags)
ld.hex = converth2o(d = ld, conn = localH2O) # as.h2o(ld)
udays = unique(yday(ld$dt))
days = yday(ld$dt)
train_rows = which(days %in% train_days)
test_rows = which(days %in% test_days)
first_predictor = 1 + (sz[2]-1)*horizon+1 # horizon*5 min forecast
last_predictor = dim(ld)[2]
names(ld)[first_predictor:last_predictor]
pred_cols = c(1,first_predictor:last_predictor)
lasso_a = readRDS(gzcon(url('https://app.box.com/shared/static/9r4pc149wz2y6whxmdu67c0jf0pgm8it.rds')))
to_keep = which(lasso_a[sensor_col,] > 0.001) + 1
pred_cols = pred_cols[to_keep]

d.train = ld.hex[train_rows,]
ld.train = ld[train_rows,]
d.test = ld.hex[test_rows,]
ld.test = ld[test_rows,]


# m = best_dl(d.train, d.test, sensor_col, pred_cols, 50) # leftover fixed-size run
m = best_dl(d.train, d.test, sensor_col, pred_cols, ensemble_size)
h2o.saveModel(m, path = paste(normalizePath('~'), "/Dropbox/papers/dl-traffic/models/", sep=''), force = T)
line = paste(path, ensemble_size, m@model_id, sensor_col, notes, sep='\t')
write(line, 'src/models', append=TRUE)


for (i in 1:10)
{
  dlmodel <- h2o.deeplearning(x=pred_cols,
                              y = sensor_col,
                              training_frame = d.train,
                              validation_frame = d.test,
                              hidden = c(200,200),
                              activation="Tanh",
                              adaptive_rate = F,
                              l1 = 0.000966,
                              nesterov_accelerated_gradient = T,
                              epochs = 0.2
                              )

  print(dlmodel)
}




# hyper_params <- list(
#   hidden=list(c(32,32,32),c(64,64)),
#   input_dropout_ratio=c(0,0.05),
#   rate=c(0.01,0.02),
#   rate_annealing=c(1e-8,1e-7,1e-6),
#   activation=c("Tanh", "Rectifier")
# )
# hyper_params
# dl.grid <- h2o.grid(
#   "deeplearning",
#   model_id="dl_grid",
#   training_frame=d.train_n,
#   x=pred_cols,
#   y=sensor_col,
#   epochs=0.1,
#   score_validation_samples=10000, ## downsample validation set for faster scoring
#   score_duty_cycle=0.025,         ## don't score more than 2.5% of the wall time
#   adaptive_rate=F,                ## manually tuned learning rate
#   momentum_start=0.5,             ## manually tuned momentum
#   momentum_stable=0.9,
#   momentum_ramp=1e7,
#   l1=1e-5,
#   l2=1e-5,
#   max_w2=10,                      ## can help improve stability for Rectifier
#   hyper_params=hyper_params
# )
# dl.grid
--------------------------------------------------------------------------------
/data_processing.r:
--------------------------------------------------------------------------------
library(plyr)
# appends the speed series from one sensor database as a new column of all_speed
append_speed_column = function(db, column_name, all_speed)
{
  print(paste("Adding: ", column_name, sep = ''))
  con = dbConnect(RSQLite::SQLite(), dbname=db)
  d = dbGetQuery(con, query)
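  # `query` is a global built by get_query() below; extract_speed.R sets it
  # before calling this function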
  # convert dt column to POSIX time
  d$dt = parse_date_time(d$dt, "ymd hms")
  d <- rename(d, c("Speed" = column_name))
  nd = join(all_speed, d, by='dt', type = "left", match='first')
  print(dim(nd))
  return(nd)
}

get_query = function(year)
{
  query = paste("select Speed, datetime(LastUpdateTime - LastUpdateTime%300, 'unixepoch','localtime') as dt",
                " from sensordata",
                " where",
                " strftime('%Y', LastUpdateTime, 'unixepoch','localtime') = '",
                toString(year), "'",
                " order by dt;", sep = "")
  return(query)
}

# creates one lagged column
slide = function(df, vname, new.colname, n, lag)
{
  df[,new.colname] = c(rep(NA, lag), df[1:(n-lag),vname])
  return(df)
}

lagged_data = function(d, max.lags)
{
  sz = dim(d)
  nvec = names(d)
  ld = d # lagged data
  for (lag in 1:max.lags)
  {
    for (i in 2:sz[2])
    {
      new.colname = paste(nvec[i], toString(lag), sep = "_")
      ld = slide(ld, nvec[i], new.colname, sz[1], lag)
      # print(dim(ld))
    }
  }
  # remove first max.lags rows with NAs
  ld = ld[(max.lags+1):sz[1],]
  return(ld)
}
# removes rows for certain days/years
remove_days = function(d, years_keep, days_keep = c(2,3,4,5,6))
{
  days = lubridate::wday(d$dt)
  years = lubridate::year(d$dt)
  return(d[which(days %in% days_keep & years %in% years_keep),])
}
# load from rds file
load_data = function(data_dir, year, smoothed = 'm')
{
  return(readRDS(paste0(data_dir, '/gcm_21_', year, '_', smoothed, '.rds')))
}
# save to rds file
save_data = function(data_dir, d, year, smoothed)
{
  saveRDS(d, paste0(data_dir, '/gcm_21_', year, '_', smoothed, '.rds'))
}
# get set of parameters to be used for data preparations
get_data_prep_parameters = function(set_id = 1)
{
  gcm.data.params = list()
  year = 2009
  horizon = 7
  max.lags = 12
  if (set_id == 1)
  {
    gcm.data.params$year = year
    gcm.data.params$horizon = horizon
    gcm.data.params$max.lags = max.lags
  }
  return(gcm.data.params)
}

converth2o = function(d, conn)
{
  # encode the timestamp as minutes so h2o gets a numeric predictor
  min0 = 1440*lubridate::yday(d$dt) + 60*lubridate::hour(d$dt) + lubridate::minute(d$dt)
  temp = rename(d, c('dt' = 'min0'))
  temp$min0 = min0
  return(as.h2o(temp))
}

median_filter = function(x, window = 3, sides = 1)
{
  n = length(x)
  y = x
  if (sides == 1)
  {
    for (i in window:n)
    {
      y[i] = as.double(quantile(x[(i-window+1):i], probs = 0.5, na.rm = TRUE))
    }
  }
  if (sides == 2)
  {
    for (i in window:(n - window))
    {
      y[i] = as.double(quantile(x[(i-window+1):(i + window - 1)], probs = 0.5, na.rm = TRUE))
    }
  }
  return(y)
}

mean_filter = function(x, window = 3)
{
  n = length(x)
  y = x
  for (i in window:n)
  {
    y[i] = mean(x[(i-window+1):i])
  }
  return(y)
}

# library(h2o)
# library(plyr)
--------------------------------------------------------------------------------
/utils.R:
--------------------------------------------------------------------------------
utils = new.env()

utils$img_w = 500
utils$img_h = 750/2
utils$pdf_w = 10.02
utils$pdf_h = 7.27
utils$par.def = par()

resetPar <- function() {
  dev.new()
  op <- par(no.readonly = TRUE)
  dev.off()
  op
}
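# typical use: par(resetPar()) restores the default graphics parameters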
# saves the current contents of the active device to a file
utils$save_png = function(path = './', name, lwidth = utils$img_w, lheight = utils$img_h)
{
  dev.copy(png, paste(path, paste('/', name, ".png", sep=""), sep=""), width = lwidth, height = lheight);
  dev.off();
}

utils$save_pdf = function(path = './', name, lwidth = utils$pdf_w, lheight = utils$pdf_h)
{
  print(paste(path, paste(name, ".pdf", sep=""), sep=""))
  dev.copy(pdf, paste(path, paste(name, ".pdf", sep=""), sep=""), width = lwidth, height = lheight);
  dev.off();
}

utils$fortify.ts <- function(x){
  time <- as.numeric(time(x))
  if(is.matrix(x)){
    df <- as.data.frame(x)
  }
  else {
    x <- as.numeric(x)
    df <- as.data.frame(x)
  }
  df$time <- time
  return(df)
}

utils$fortify.acf <- function(acf){
  n = length(acf$acf)
  range = 1:n
  if (acf$type=="correlation")
    range = 2:n
  type <- switch(acf$type, correlation = "ACF", covariance = "ACF (cov)",
                 partial = "Partial ACF")
  data.frame(acf = acf$acf[range], lag = acf$lag[range], type = type, n.used = acf$n.used)
}

utils$acf_plot <- function(df, ...){
  clim <- qnorm((1 + 0.975)/2)/sqrt(df$n.used)
  qplot(lag, ymin = 0, ymax = acf, data = df, geom = "linerange") +
    facet_grid(type ~ .) +
    geom_hline(yintercept = 0, colour = "grey50", size = 0.5) +
    geom_hline(yintercept = c(-clim, clim), linetype = "dashed", colour = "grey50", size = 0.5)
}

utils$examine_corr <- function(x, ...){
  # the utils$ prefix is needed because these helpers live in the utils environment
  acfs <- rbind(utils$fortify.acf(acf(x, plot = FALSE, ...)), utils$fortify.acf(pacf(x, plot = FALSE, ...)))
  print(utils$acf_plot(acfs))
  invisible(acfs)
}



# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
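# Example (hypothetical ggplot objects p1, p2, p3):
#   utils$multiplot(p1, p2, p3, layout = matrix(c(1,2,3,3), nrow=2, byrow=TRUE))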
utils$multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
--------------------------------------------------------------------------------
/eda.r:
--------------------------------------------------------------------------------
source('~/Dropbox/utils.R')
# source('c:/Users/vsokolov/Dropbox/utils.R')
library(ggplot2)
setwd('~/Dropbox/papers/dl-traffic/')
# setwd('C:/Users/vsokolov/Google Drive/papers/dl-traffic/')
source('src/data_processing.r')
source('src/data_parameters.r')
# Functions ---------------------------------------------------------------

plot_by_day = function(data, col)
{
  days = lubridate::yday(data$dt)
  for (day in unique(days))
  {
    rows = which(days == day)
    print(day)
    day_data = data[rows, col]
    plot(day_data, type='l', main=day)
    utils$save_png(path = 'tmp_fig', toString(day))
  }
}

day_data = function(data, day, col)
{
  days = lubridate::yday(data$dt)
  rows = which(days == day)
  day_data = data[rows, col]
}

plot_day = function(data, day, col)
{
  days = lubridate::yday(data$dt)
  rows = which(days == day)
  day_data = data[rows, col]
  x = seq(0, 24, length.out = length(day_data))
  plot_speed(x, day_data)
}
plot_speed = function(x, day_data, scale = 1)
{
  qplot(x, day_data, geom='point', ylab = 'Speed [m/s]', xlab='Time [hour]', size=I(scale*2), colour = I("red")) +
    scale_x_continuous(limits=c(min(x),max(x)), expand = c(0, 0)) +
    theme_bw(base_size = scale*16)
}

print_days = function(data)
{
  print(unique(cbind(lubridate::year(data$dt), lubridate::month(data$dt), lubridate::day(data$dt), lubridate::wday(data$dt), lubridate::yday(data$dt))))
}

plot_model = function()
{
  days = yday(ld.test$dt)
  for (day in unique(days))
  {
    rows = which(days == day)
    n_rows = length(rows)
    forward_rows = rows[(1+horizon):n_rows]
    par(mar = c(3.5,4,2,0))
    x = ld.test[forward_rows, 1]
    x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
    par(mar = c(4.5,4,2,0))
    plot(x, ld.test[forward_rows, sensor_col], type='l', lty=1, col=1, lwd=4, xlab='time', ylab='speed [m/s]', main=toString(day), ylim=c(2,40))
    lines(x, y_dl[forward_rows], type='l', lty=2, col=2, lwd=4) # there is no time column
    lines(x, y_dl_corr[forward_rows], type='l', lty=3, col=3, lwd=4)
    lines(x, ld.test[rows[1:(n_rows - horizon)], sensor_col], lty=4, col=4, lwd=4)
    legend("bottomleft", legend = c('data', 'dl', 'dl+ets', 'const'), lty=1:4, col=1:4, lwd=3)
    # save_pdf(path='~/Dropbox/papers/bayes-traffic/fig/forecast/', name = paste(nvec[sensor_col],toString(day),'dl', sep='_'))
  }
}
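# The RDS files below are local copies of the Box data; per the note in
# dl_out_plot.r, data/2013/gcm21_i.rds is also available at
# https://app.box.com/shared/static/xvbdzd7sg1sfzfy25pe0a936eidgxrqd.rds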
legend("bottomleft", legend = c('data', 'dl', 'dl+ets', 'const'), lty=1:4, col=1:4, lwd=3) 67 | # save_pdf(path='~/Dropbox/papers/bayes-traffic/fig/forecast/', name = paste(nvec[sensor_col],toString(day),'dl', sep='_')) 68 | } 69 | } 70 | 71 | # Some plots ------------------------------ 72 | d = readRDS('data/2013/gcm21_i.rds'); 73 | ld = lagged_data( d, max.lags) 74 | 75 | s11 = ld[,c(1,11)] 76 | s11$wday = lubridate::wday(s11$dt) 77 | s11$yday = lubridate::yday(s11$dt) 78 | s11$min0 = 60*lubridate::hour(s11$dt) + lubridate::minute(s11$dt) 79 | as = ddply(s11[which(s11$yday < 345 & s11$yday> 200),], "min0", summarise, avg_speed = mean(N6043, na.rm = TRUE))$avg_speed 80 | plot(as) 81 | 82 | dim(s11[which(s11$yday < 345 & s11$yday> 200),]) 83 | print_days(d) 84 | d = readRDS('data/2013/gcm21_i.rds') 85 | d.trend = readRDS('data/2013/gcm21_i_tf_20.rds') 86 | plot_by_day(d,11) 87 | 88 | plot_day(d,295,11) 89 | utils$save_pdf(path = 'paper/fig/',name = 'day_295', lheight = utils$pdf_h/2) 90 | 91 | d_day = day_data(d,295,11) 92 | p = plot_day(d.trend,295,11) 93 | show(p) 94 | p + geom_line(aes(y = d_day)) + theme_bw(base_size = 24) 95 | utils$save_pdf(path = 'paper/fig/',name = 'day_295_tf',lheight = utils$pdf_h*0.7) 96 | 97 | library(genlasso) 98 | d_day.fused.lasso = trendfilter(d_day, ord = 1) 99 | d_day.cv = cv.trendfilter(d_day.fused.lasso) 100 | lambda_ind = which(d_day.fused.lasso$lambda == d_day.cv$lambda.min) 101 | day_data = d_day.fused.lasso$fit[,lambda_ind] 102 | x = seq(0,24,length.out = length(day_data)) 103 | p = plot_speed(x, day_data) 104 | show(p) 105 | p + geom_line(aes(y = d_day)) + theme_bw(base_size = 24) 106 | utils$save_pdf(path = 'paper/fig/',name = 'day_295_fl',lheight = utils$pdf_h*0.7) 107 | 108 | get_day = function(s) 109 | { 110 | unlist(strsplit(s,split = " "))[1] 111 | } 112 | get_time = function(s) 113 | { 114 | time = unlist(strsplit(s,split = " "))[2] 115 | time = unlist(strsplit(time,split = ":")) 116 | time = 60*as.integer(time[1]) + as.integer(time[2]) 117 | } 118 | 119 | plot(ld$N6040[1:n], ld$N6040_10[1:n], pch=20) 120 | save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "var_relation") 121 | 122 | 123 | # lagged data ------------------------------------------------------------- 124 | 125 | test_days = readRDS('data/test_days.rds') 126 | train_days = readRDS('data/train_days.rds') 127 | test_rows = which(days %in% test_days) 128 | train_rows = which(days %in% train_days) 129 | ld.test = ld[test_rows,] 130 | ld.train = ld[train_rows,] 131 | library(MASS) 132 | n=50 133 | f = kde2d(ld[9:n,11], ld[1:(n-8),20], n = 50) 134 | persp(f, xlab = 'data', ylab = 'lagged data', zlab = 'density', theta=20, phi = 20, box = T, cex.lab=1.5, col="maroon") 135 | utils$save_pdf('paper/fig/','ccf') 136 | 137 | 138 | f = kde2d(ld[9:n,11], ld[1:(n-8),11], n = 50) 139 | persp(f, xlab = 'data', ylab = 'lagged data', zlab = 'density', theta=20, phi = 20, box = T, cex.lab=1.5, col="maroon") 140 | utils$save_pdf('paper/fig/','acf') 141 | -------------------------------------------------------------------------------- /extract_speed.R: -------------------------------------------------------------------------------- 1 | # install.packages("RSQLite") 2 | # more about RSQLite: http://blog.rstudio.org/2014/10/25/rsqlite-1-0-0/ 3 | library(RSQLite) 4 | library(lubridate) 5 | library(plyr) 6 | source('~/Dropbox/utils.R') 7 | setwd('~/Dropbox/papers/dl-traffic/') 8 | source('src/data_processing.r') 9 | 10 | year = 2013 11 | smooth = 'm' 12 | # will pretend that everythign is in UTC time 
# will pretend that everything is in the UTC time zone
start = paste(year, '01', '01', sep='/')
end = paste(year, '12', '31', sep='/')
all_times = seq(ymd(start), ymd(end), by = "5 min")
all_speed = data.frame(dt = all_times)

query = get_query(year)

# Main Loop ---------------------------------------------------------------
path = paste('data', year, 'gcm21.rds', sep='/')
if (!file.exists(path))
{
  for (i in 6034:6053)
  {
    db = paste("/Users/vsokolov/data/gcm/neighbour/IL-TESTTSC-STEVENSON-N-", toString(i), ".sqlite", sep="")
    column_name = paste("N", toString(i), sep="")
    all_speed = append_speed_column(db, column_name, all_speed)
  }
  saveRDS(all_speed, path)
}


# interpolate missing data ------------------------------------------------
all_speed = readRDS(path)
path = paste('data', year, 'gcm21_i.rds', sep='/')
if (!file.exists(path))
{
  print("Do interpolation")
  library(zoo)
  sz = dim(all_speed)
  # interpolation in time (by columns)
  # all_speed[,2:sz[2]] = na.approx(all_speed[,2:sz[2]], rule=2)

  # interpolation in space (by rows)
  # find first non-empty row
  rs = rowSums(is.na(all_speed))
  for (i in 1:sz[1])
  {
    if (rs[i] < (sz[2]-1))
    {
      first_non_na_row = i
      break
    }
  }
  # remove the leading all-NA rows (keep the first valid row)
  if (first_non_na_row > 1)
    all_speed = all_speed[-(1:(first_non_na_row-1)),]
  # update sz
  sz = dim(all_speed)
  # update rs
  rs = rowSums(is.na(all_speed))
  last_working = 1
  for (i in 2:sz[1])
  {
    if (i %% 1000 == 0)
      print(i)
    # print(last_working)
    if (rs[i] == 0)
      next
    if (rs[i] == (sz[2]-1)) # all NAs
      all_speed[i,2:sz[2]] = na.approx(as.double(all_speed[last_working,2:sz[2]]), rule=2)
    else
    {
      last_working = i
      all_speed[i,2:sz[2]] = na.approx(as.double(all_speed[i,2:sz[2]]), rule=2)
    }
  }
  saveRDS(all_speed, path)
}


# Smoothing ---------------------------------------------------------------


all_speed = readRDS(path)
# days = lubridate::yday(all_speed$dt)
# d = all_speed[which(days == 282),11]
# plot(d)
# lines(l1tf(d, lambda = 50), col = 1, type='l')
# lines(l1tf(d, lambda = 10), col = 2, type='l')
# lines(l1tf(d, lambda = 1), col = 3, type='l')
install.packages("devtools")
library(devtools)
install_github("hadley/l1tf")
library(l1tf)

# parameters of the smoother
smooth_param = 8
if (smooth_param > 0)
{
  path = paste('data', year, sprintf("gcm21_i_%s_%d.rds", smooth, smooth_param), sep='/')
} else
{
  path = paste('data', year, sprintf("gcm21_i_%s.rds", smooth), sep='/')
}

if (!file.exists(path))
{
  print("Do Smoothing")
  sz = dim(all_speed)
  days = lubridate::yday(all_speed$dt)
  nvec = names(all_speed)
  for (index in 2:sz[2])
  {
    print(index)
    for (day in unique(days))
    {
      # print(day)
      rows = which(days == day)
      dd = all_speed[rows, c(1,index)]
      if (length(dd[,2]) < smooth_param + 1) # 'window' was undefined here
      {
        print(length(dd))
        print(c('Skipping day ', toString(day)))
        next
      }
      # dd$trend = as.double(filter(dd[,2], c(log(10:1))/15.10441, side=1))
      # dd.ets = ets(dd[,2], model = 'ZZZ', damped = T)
      # dd$trend = fitted(dd.ets)
      if (smooth == 'm')
        dd$trend = median_filter(dd[,2], smooth_param, sides=1)
      if (smooth == 'tf')
      {
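        # l1tf fits an l1 trend filter (a piecewise-linear trend); a larger
        # lambda gives a smoother fit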
        dd$trend = l1tf(dd[,2], lambda = smooth_param)
      }

      if (smooth == 'tfs')
      {
        lhor = 24
        dd$trend = l1tf(dd[,2], lambda = smooth_param)
        nmeas = length(dd$trend)
        for (i in lhor:nmeas)
        {
          dd$trend[i] = l1tf(dd[(i-lhor+1):i,2], lambda = smooth_param*2)[lhor]
        }
      }

      if (smooth == 'l')
      {
        dd$mins = 60*lubridate::hour(dd$dt) + lubridate::minute(dd$dt)
        dd_trend = loess(dd[,nvec[index]] ~ dd[,'mins'], span = 0.2)
        dd$trend = predict(dd_trend, newdata = dd)
      }
      # dd$trend = mean_filter(dd[,2],6)
      if (runif(1) < 0.05)
      {
        plot(dd[,2], type='l')
        lines(dd$trend, lty = 2, col = 2)
        # utils$save_png('~/Dropbox/papers/bayes-traffic/fig/2013/', paste(nvec[index], toString(day), sep='_'))
      }
      # replace data with the smoothed data
      all_speed[which(days==day),index] = dd$trend
      # plot(dted$trend)
      # lines(dd$trend)
    }
  }
  saveRDS(all_speed, path)
}

# normalize the data
all_speed = readRDS(path)
path_name = unlist(strsplit(path, "\\."))[1]
path = paste(path_name, "_n.rds", sep='')
if (!file.exists(path))
{
  n = dim(all_speed)[[2]]
  for (i in 2:n)
  {
    all_speed[,i] = (all_speed[,i] - mean(all_speed[,i], na.rm=T))/sd(all_speed[,i], na.rm=T)
  }
  saveRDS(all_speed, path)
}

print("Done!")

--------------------------------------------------------------------------------
/paper_plots.r:
--------------------------------------------------------------------------------
setwd('~/Dropbox/papers/dl-traffic/')
library(lubridate)
source('src/data_parameters.r')
source('src/data_processing.r')
source('src/data_load.r')
source('src/plot_functions.r')
source('~/Dropbox/utils.R')


# Scatter plots
s = sample(dim(d)[1], 2000, replace = F)
qplot(d$N6038[s], d$N6039[s], size=I(.5)) + xlab("Speed at N6038 [m/s]") + ylab("Speed at N6039 [m/s]") + theme_grey(base_size = 24)
utils$save_pdf("fig", "scatter_1")
qplot(d$N6035[s], d$N6042[s], size=I(.5)) + xlab("Speed at N6035 [m/s]") + ylab("Speed at N6042 [m/s]") + theme_grey(base_size = 24)
utils$save_pdf("fig", "scatter_2")



# bears is 283 (10/10/2013, Thursday)
# weather is 345 (12/11/2013, Wednesday)
data_name = "train" # {train, test}
model_name = "dl"   # {dl, lars}

if (data_name == "train") {
  data = ld.train
  # the y_lars matrices have no dt column, hence the column offset
  if (model_name=="dl") fcast = y_dl_t else fcast = y_lars_t[,(sensor_col-1)]
  # analysis_days = c(84, 203, 184, 24, 59, 46)
  # analysis_days = sample(unique(days), size = 50, replace = F)
  analysis_days = c(210)
} else {
  data = ld.test
  if (model_name=="dl") fcast = y_dl else fcast = y_lars[,(sensor_col-1)]
  analysis_days = c(280, 283, 345)
}
days = lubridate::yday(data$dt)


for (day in analysis_days)
{
  # fcast_data_plot(day, days, fcast, data, model_name)
  res_plot(day, days, fcast, data, model_name)
}

# par(resetPar())
# analyse day = 210
# rows = which(days == 210)
# n_rows = length(rows)
# forward_rows = rows[(1+horizon):n_rows]
# x = data[forward_rows, 1]
# x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
# day_210_dl = 2.23694*data[forward_rows, sensor_col]
# fcast_210_dl = 2.23694*fcast[forward_rows]
# day_210_lars = 2.23694*data[forward_rows, sensor_col]
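# (the factor 2.23694 converts m/s to mi/h throughout these scripts)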
# fcast_210_lars = 2.23694*fcast[forward_rows]


# run the block above with model_name = "lars" before computing res_lars, and
# with model_name = "dl" before computing res_dl
res_lars = res_get(210, days, fcast, data)


res_dl = res_get(210, days, fcast, data)
shapiro.test(res_dl)
shapiro.test(res_lars)
library(nortest)
ad.test(res_dl)
ad.test(res_lars)

m = mean(res_dl)
s = sd(res_dl)
ks.test(res_dl, "pnorm", mean = m, sd = s)

m = mean(res_lars)
s = sd(res_lars)
ks.test(res_lars, "pnorm", mean = m, sd = s)

qqplot.data(res_dl)
qqplot.data(res_lars)

library(ggplot2)
x = seq(0, 24, length.out = 281)
qplot(x, res_dl)
qplot(x, res_lars)

qpacf = function(d)
{
  bacf <- pacf(d, plot = FALSE)[1:20] # partial ACF, matching the function name
  bacfdf <- with(bacf, data.frame(lag, acf))
  ggplot(data = bacfdf, mapping = aes(x = lag, y = acf)) +
    geom_hline(aes(yintercept = 0)) +
    geom_segment(mapping = aes(xend = lag, yend = 0)) + xlab("Lag") + theme_bw(base_size = 36)
}
qplot(res_lars) + xlab("Residual") + theme_bw(base_size = 36)
utils$save_pdf(path = 'paper/fig/', name = "res_lars")
qpacf(res_lars)
utils$save_pdf(path = 'paper/fig/', name = "res_lars_pacf")
qplot(x, res_lars) + theme_bw(base_size = 36) + xlab("Time [h]") + ylab("Residual [mi/h]")
utils$save_pdf(path = 'paper/fig/', name = "res_lars_1")
mean(res_lars)

qplot(res_dl) + xlab("Residual") + theme_bw(base_size = 36)
utils$save_pdf(path = 'paper/fig/', name = "res_dl")
qpacf(res_dl)
utils$save_pdf(path = 'paper/fig/', name = "res_dl_pacf")
qplot(x, res_dl) + theme_bw(base_size = 36) + xlab("Time [h]") + ylab("Residual [mi/h]")
utils$save_pdf(path = 'paper/fig/', name = "res_dl_1")
Box.test(res_dl, lag = 10, fitdf = 0)
library(tseries)
adf.test(res_dl, k=12)
mean(res_dl)
library(fNonlinear)
library(lmtest) # bgtest and bptest live in lmtest

n = length(res_lars)
bgtest(res_lars[1:(n-1)] ~ res_lars[2:n], type="F", order = 3)
Box.test(res_lars, lag = 10, fitdf = 0)
bptest(res_lars[1:(n-3)] ~ res_lars[2:(n-2)] + res_lars[3:(n-1)])
wnnTest(res_lars)
bdsTest(res_lars)
runsTest(res_lars)

adf.test(res_lars, k=12)


n = length(res_dl)
bgtest(res_dl[1:(n-1)] ~ res_dl[2:n], type="F", order = 5)
Box.test(res_dl, lag = 10, fitdf = 0)
bptest(res_dl[1:(n-3)] ~ res_dl[2:(n-2)] + res_dl[3:(n-1)])
wnnTest(res_dl)
bdsTest(res_dl)
runsTest(res_dl)
adf.test(res_dl, k=12)


par(mfrow=c(1,1))
# for (day in c(283, 345))
for (day in c(283))
{
  rows = which(days == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  data_day = data[forward_rows,]
  fcast_day = fcast[forward_rows]

  fcast_dl = fcast[forward_rows]
  plot(data_day[,sensor_col], type='p', main=toString(day))
  lines(fcast_day)
  # utils$save_pdf('paper/fig/', name = "heat_dl_bears")
  speed_heat(day, data_day) # speed_heat() is defined in var_eda.R
  utils$save_pdf('paper/fig/', name = "heat_data_bears")
  data_day[,sensor_col] = fcast_dl
  speed_heat(day, data_day)
  utils$save_pdf('paper/fig/', name = "heat_fcast_dl_bears")
}



# lars findings. Any relation to a traffic model?
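# A below is the lasso coefficient matrix estimated in find_var.r (it can also
# be read from the Box URL given there); run that script first so A,
# first_predictor, and last_predictor exist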
betas = A[sensor_col,]
m = matrix(betas, nrow=20, ncol = 6)
print(m)
print(rowSums(m))
plot(rowSums(m))
names(betas) = names(ld.train)[first_predictor:last_predictor]
mileposts = cumsum(c(0, 566, 883, 1364, 1168, 875, 1210, 1153, 1186, 1173, 1203, 1242, 1086, 1122, 745, 1190, 913, 1193, 1404, 988))/1609.34

graphics::image(y = seq(7,12), x = mileposts, z = m, ylab = "Time [h]", xlab = "Mile post", yaxt="n", col = heat.colors(22,1), cex.lab=2, cex.axis=2)

par(mar = c(6,6,1,2))
library(fields)
image.plot(y = seq(7*5, 12*5, by = 5), x = 1:20, z = m, add=F, ylab = "Time Lag [min]", xlab = "Mile post", col = topo.colors(50))
utils$save_pdf('./paper/fig/', 'betas', lheight = utils$pdf_h/2)


library(vars)
varm = VAR(d[-1,-1], p=6, type="both")
irfd = irf(varm, response = "N6043", boot = F, n.ahead = 12, runs = 4)
# irfd = irf(varm, response = "N6053", boot = F, n.ahead = 10, runs = 4)
# irfd = irf(varm, response = "N6034", boot = F, n.ahead = 10, runs = 4)
par(mar=c(4,4,1,1))

first_sensor = 1
last_sensor = 10
s = first_sensor
plot(seq(35,60,5), irfd$irf[s][[1]][7:12], type="o", xlab = "Lag [min]", ylab = "Response", col=s, lty=s, ylim=c(-0.05,.76), lwd=2, pch=s)
nm = c(names(irfd$irf[s])[1])
for (s in (first_sensor+1):last_sensor)
{
  irf_s = irfd$irf[s] # do not overwrite the data frame d here
  nm = c(nm, names(irf_s)[1])
  lines(seq(35,60,5), irf_s[[1]][7:12], type="o", col=s, lty=s, lwd=2, pch=s)
}
legend(45, 0.8, legend = nm, col=first_sensor:last_sensor, lty=first_sensor:last_sensor, bty="n", pch=first_sensor:last_sensor)

View(irfd$irf)
(as.double(unlist(irfd$irf)))
irfm = matrix(unlist(irfd$irf), nrow = 20, byrow = T)

rownames(irfm) = names(d[,-1])
mileposts = cumsum(c(0, 566, 883, 1364, 1168, 875, 1210, 1153, 1186, 1173, 1203, 1242, 1086, 1122, 745, 1190, 913, 1193, 1404, 988))/1609.34
library(fields)
image.plot(y = seq(7*5, 12*5, by = 5), x = 1:20, z = irfm[,7:12], add=F, ylab = "Time Lag [min]", xlab = "Mile post", col = topo.colors(50))




first_sensor = 2
last_sensor = 10
yd = yday(d$dt)
which_day = 31
s = first_sensor
plot(d[yd==which_day,first_sensor], type='o', ylim=c(5,30), lty=first_sensor, col=first_sensor, pch=first_sensor, cex=0.3)
nm = c(names(d)[s])
for (s in (first_sensor+1):last_sensor)
{
  plot(d[yd==which_day,first_sensor], type='o', ylim=c(5,30), lty=first_sensor, col=first_sensor, pch=first_sensor, cex=0.3)
  lines(d[yd==which_day,s], type='o', lty=s, col=s, pch=s, cex=0.3)
  nm = c(nm, names(d)[s])
}

legend(0, 20, legend = nm, col=first_sensor:last_sensor, lty=first_sensor:last_sensor, bty="n", pch=first_sensor:last_sensor)


a43 = matrix(A[10,], ncol = 6, byrow = F)

rownames(a43) = names(d[,-1])
colnames(a43) = seq(35,60,5)
--------------------------------------------------------------------------------
/var_eda.R:
--------------------------------------------------------------------------------
library(ggplot2)
library(lubridate)
library(plyr) # for ddply below
# source(url("https://www.dropbox.com/s/leckggnen53sd7a/utils.R?dl=1"))
source('~/Dropbox/utils.R')
setwd('~/Google Drive/papers/dl-traffic/')
fig_path = 'paper/fig/'
all_speed = readRDS('data/2009/gcm21_i.rds')
rs = rowSums(is.na(all_speed))
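# rs counts missing sensor readings per 5-minute row; the barplot below shows
# how often a given number of sensors malfunctions simultaneously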
barplot(table(rs)/length(rs), col = 6, ylab="Percent of Time", xlab="Number of Malfunctioning Sensors")
# utils$save_pdf(fig_path, name = 'malfunction_sensor')

# all_speed = readRDS('/Users/vsokolov/Google Drive/papers/bayes-traffic/all_wed_speed_interpolted.rds')

# morning_speed = all_speed[which(hour(all_speed$dt)>4 & hour(all_speed$dt)<13),]

# find strange days:
all_speed$daymin = lubridate::minute(all_speed$dt) + 60*lubridate::hour(all_speed$dt)
all_speed$wday = lubridate::wday(all_speed$dt)
avg_speed = ddply(all_speed, "daymin", summarise, avg_speed = mean(N6043, na.rm = FALSE))
avg__wd_speed = ddply(all_speed, .(daymin, wday), summarise, avg_speed = mean(N6043, na.rm = FALSE))
plot(avg_speed)

plot(avg__wd_speed$avg_speed[avg__wd_speed$wday==3])

plot_by_day = function(data, col)
{
  days = lubridate::yday(data$dt)
  udays = unique(days)
  n = length(udays)
  for (i in 1:(n-5))
  {
    rows = which(days == udays[i])
    rows_next = which(days == udays[i+5])
    ts = lubridate::round_date(data[rows[1], 1], unit = 'day')
    ts_next = lubridate::round_date(data[rows_next[1], 1], unit = 'day')

    plot(data[rows, col], type='l', main = c(ts, ts_next))
    lines(data[rows_next, col], type='l')
  }
}


# choose data set to work with
d = all_speed
day_step = 24*12
# d = morning_speed
# day_step = (13-4-1)*12-1
sz = dim(d)




# lag = 24
# # get speed data for period 5am-noon (8 hours)

# n = dim(morning_speed)[1]
# for (i in 2:21)
# {
#   plot(all_speed[lag:n,i], morning_speed[1:(n-lag+1),i], main = toString(i), xlab="Current", ylab="Previous")
# }
#


speed_heat = function(y_day, data)
{
  s = data[which(lubridate::yday(data$dt) == y_day), 2:sz[2]]
  ny = dim(s)[1]
  data.m = data.matrix(s)
  # Take cumsum of distances between the sensors, which I measured in QGIS
  mileposts = cumsum(c(0, 566, 883, 1364, 1168, 875, 1210, 1153, 1186, 1173, 1203, 1242, 1086, 1122, 745, 1190, 913, 1193, 1404, 988))/1609.34
  par(mar = c(5,4,1,1), mai = c(1,1,0.2,0.2))
  image(y = seq(1,24,length.out = ny), x = mileposts, z = t(data.m), ylab = "Time [h]", xlab = "Mile post", yaxt="n", col = heat.colors(22,1), cex.lab=2, cex.axis=2)
  axis(side = 2, at = seq(0,23,by=2), labels = seq(0,23,by=2), cex.axis=2)
}

speed_heat(7, d)
speed_heat(7, d1) # NOTE: d1 is not defined in this script
speed_heat(21, d)
speed_heat(28, d)
speed_heat(35, d)
speed_heat(42, d)
speed_heat(49, d)
utils$save_pdf(fig_path, 'heat_plot_interpolated', lheight = utils$pdf_h*0.7)
speed_heat(49, d1)
utils$save_pdf(fig_path, 'heat_plot_interpolated')
speed_heat(56, d)
speed_heat(63, d)
speed_heat(70, d)

# possible solution to deal with missing data
# temp = na.omit(morning_speed[,20])


# create new data frame with lagged variables


slide = function(df, vname, new.colname, n, lag)
{
  # print(c(lag, n, vname, new.colname))
  df[,new.colname] = c(rep(NA, lag), df[1:(n-lag),vname])
  return(df)
}

n = dim(morning_speed)[1]
m = dim(morning_speed)[2]

# interpolate missing data
library(zoo)
for (i in 2:m)
{
  morning_speed[,i] = na.approx(morning_speed[,i], rule=2)
}
max.lags = 12
nvec = names(all_speed)
m = length(nvec)

lagged_morning_speed = morning_speed
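# build the lagged design matrix column by column (the same idea as
# lagged_data() in data_processing.r)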
for (i in 2:m)
{
  for (lag in 1:max.lags)
  {
    new.colname = paste(nvec[i], toString(lag), sep = "_")
    lagged_morning_speed = slide(lagged_morning_speed, nvec[i], new.colname, n, lag)
  }
}

n = dim(d)[2]
nvec = names(d)
for (i in 2:(n-1))
{
  for (j in (i+1):n)
  {
    if (j==i)
      next
    ccf(d[,i], d[,j], lag.max = 20, main = paste(toString(i), toString(j)))
    utils$save_png('~/Dropbox/papers/bayes-traffic/fig/ccf/', name = paste(nvec[i], nvec[j], sep = '_'))
  }
}

m = dim(d)[1]
x = seq(from=0, to=24, length.out = day_step)
for (i in seq(from=1, to = m, by = day_step))
{
  par(mfrow=c(3,1), mar=c(2,2,0,0))
  plot(x, d[i:(i+day_step-1),2], type='l')
  plot(x, d[i:(i+day_step-1),20], type='l')
  plot(x, d[i:(i+day_step-1),21], type='l')
}

sm = loess(x ~ d[i:(i+day_step-1),2], span = 0.8)
pl = predict(sm, newdata = d[i:(i+day_step-1),2])
plot(x, pl, type='l', ylim=c(5,35))
lines(x, d[i:(i+day_step-1),2], type='l', lty=2, col=2)


qplot(x, d[i:(i+day_step-1),2]) + geom_smooth(method = "loess", size = 1)



# Smoothing
print(unique(days))
n_days = length(unique(days))
nvec = names(d)
index = 10
for (day in unique(days))
{
  all_days = yday(all_speed$dt)
  # index = 2
  dd = all_speed[which(all_days==day), c(1,index)]
  dd$mins = 60*hour(dd$dt) + minute(dd$dt)
  dd_trend = loess(dd[,nvec[index]] ~ dd[,'mins'], span = 10)
  dd$trend = predict(dd_trend, newdata = dd)
  plot(dd[,nvec[index]])
  lines(dd$trend)
}


# mock-up plots for presentation ------------------------------------------


true_data = c(seq(2,30,by=2), rep(30,4), 28, 26, 24, 22) + rnorm(23)
n = length(true_data)
plot(true_data, type='l', xlim=c(0,n+10), xlab="time", ylab='speed', lwd=4, col=6)
last = true_data[n]
lines(n:(n+5), rep(last,6), col=2, lwd=4)
lines(n:(n+5), seq(from=last, to=last-3, length.out = 6), col=3, lwd=4)
lines(n:(n+5), seq(from=last, to=last-6, length.out = 6) + c(0,rnorm(5)), col=6, lwd=4)
abline(v=n, lty=2)
abline(v=n+5, lty=2)
text(21, 10, labels = "now")
legend('topleft', legend = c("Constant","Model","Data"), lwd = c(4,4,4), col=c(2,3,6))
utils$save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "forecast_compare")

plot(true_data, type='l', xlim=c(0,n+10), xlab="time", ylab='speed', lwd=4, col=6)
lines(n:(n+5), seq(from=last, to=last-3, length.out = 6), col=3, lwd=4)
abline(v=n, lty=2)
abline(v=n+5, lty=2)
text(21, 10, labels = "now")
utils$save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "knn_1")
plot(true_data, type='l', xlim=c(0,n+10), xlab="time", ylab='speed', lwd=4, col=6)
lines(n:(n+5), seq(from=last, to=last-4, length.out = 6), col=3, lwd=4)
abline(v=n, lty=2)
abline(v=n+5, lty=2)
text(21, 10, labels = "now")
utils$save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "knn_2")
plot(true_data, type='l', xlim=c(0,n+10), xlab="time", ylab='speed', lwd=4, col=6)
lines(n:(n+5), seq(from=last, to=last-2, length.out = 6), col=3, lwd=4)
abline(v=n, lty=2)
abline(v=n+5, lty=2)
text(21, 10, labels = "now")
utils$save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "knn_3")

x = seq(5,30, length.out = 300)
y = x + rnorm(300, sd=4)
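# mock-up scatter of y_t against y_{t+1}, illustrating the lag-1
# autoregressive relationship on the slides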
plot(x, y, pch=20, xlab=expression("y"[t]), ylab=expression("y"[t+1]))
abline(1, 1, lwd=3, col=2)
utils$save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "auto_regression")


library(lubridate)
library(forecast)
days = yday(all_speed$dt)
day_rows = which(days == 21)
one_day = all_speed[day_rows,]
one_sensor = one_day[,9]
par(mar = c(5.1, 5, 4.1, 2.1))
plot(seq(0, 12.5, length.out = 150), one_sensor[1:150], ylab='Speed [m/s]', xlab = 'Time [hour]', pch=20, cex.lab=2.5, cex.axis=2.5)
fit = ets(y = one_sensor, model='ANN', alpha = 0.1)
lines(seq(0, 12.5, length.out = 150), fitted.values(fit)[1:150], col=3, lwd=7)
utils$save_pdf(fig_path, name = "ets", lheight = utils$pdf_h*0.8)

library(FBN)
plot(seq(0, 12.5, length.out = 150), one_sensor[1:150], ylab='Speed [m/s]', xlab = 'Time [hour]', pch=20, cex.lab=2.5, cex.axis=2.5)
lines(seq(0, 12.5, length.out = 150), medianFilter(inputData = one_sensor, windowSize = 10)[1:150], col=3, lwd=7)
utils$save_pdf(fig_path, name = "median_filter", lheight = utils$pdf_h*0.8)

mins = 60*hour(one_day$dt) + minute(one_day$dt)
d_trend = loess(one_sensor ~ mins, span = 0.1)
plot(seq(0, 12.5, length.out = 150), one_sensor[1:150], ylab='Speed [m/s]', xlab = 'Time [hour]', pch=20, cex.lab=2.5, cex.axis=2.5)
lines(seq(0, 12.5, length.out = 150), predict(d_trend, newdata = mins)[1:150], col=3, lwd=7)
utils$save_pdf(fig_path, name = "trend_filter", lheight = utils$pdf_h*0.8)


plot(ld$N6040[1:n], ld$N6040_10[1:n], pch=20)
utils$save_pdf('~/Dropbox/slides/pf-traffic/fig/', name = "var_relation")


--------------------------------------------------------------------------------
/dl_out_plot.r:
--------------------------------------------------------------------------------
library(lubridate)
library(h2o)
source('~/Dropbox/utils.R')
setwd('~/Dropbox/papers/dl-traffic')
source('src/data_parameters.r')
source('src/data_processing.r')


# Functions ---------------------------------------------------------------


# get_days = function(file_name)
# {
#   return(as.integer(strsplit(file_name,"[.]")[[1]][1]))
# }
# test_days = unlist(lapply(list.files('tmp_fig/test/'), get_days))
# train_days = unlist(lapply(list.files('tmp_fig/train/'), get_days))

load_model = function(name)
{
  m = h2o.loadModel(paste(normalizePath('~'), "/Dropbox/papers/dl-traffic/models/", name, sep=''))
  # m = h2o.loadModel(path = "~/Dropbox/papers/dl-traffic/models/", name = name_)
}


# Prepare Data ------------------------------------------------------------

localH2O = h2o.init(max_mem_size = "2g", nthreads=-1)
# combo = 'i'
# d = readRDS(gzcon(url('https://app.box.com/shared/static/xvbdzd7sg1sfzfy25pe0a936eidgxrqd.rds')));
# m = load_model('DeepLearning_model_R_1450113775656_1980')
combo = 'm_8_lasso'

# d = readRDS(gzcon(url('https://app.box.com/shared/static/bt69dviydld3o3k87max5od5zhnihbf6.rds')));
# m = load_model('DeepLearning_model_R_1449675910414_4090')
m = load_model('DeepLearning_model_R_1449673978399_9545') # with lasso

source('src/data_load.r')
# d = readRDS('data/2013/gcm21_i_tf_20.rds');
# m = load_model('DeepLearning_model_R_1449510681559_260')
# m = load_model('DeepLearning_model_R_1449641475140_995') # with lasso

# l1 = 0.000966;
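# The DeepLearning_model_R_* IDs refer to h2o models that find_dl.r saved
# under models/ via h2o.saveModel; the one loaded above appears to have been
# picked manually from the logged runs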
# d = readRDS('data/2013/gcm21_i_tfs_15.rds');
# m = load_model('DeepLearning_model_R_1449675875413_7960')
# m = load_model('DeepLearning_model_R_1449675008782_605') # with lasso
# data/2013/gcm21_i.rds = https://app.box.com/shared/static/xvbdzd7sg1sfzfy25pe0a936eidgxrqd.rds

ld.hex = converth2o(d = ld, conn = localH2O) # as.h2o(ld)
d.test = ld.hex[test_rows,]
d.train = ld.hex[train_rows,]


# Choose Model ------------------------------------------------------------


# models = c('DeepLearning_model_R_1450988591754_555',
#            'DeepLearning_model_R_1450988589597_1055',
#            'DeepLearning_model_R_1450988591826_115',
#            'DeepLearning_model_R_1450988591812_850',
#            'DeepLearning_model_R_1450988591795_590',
#            'DeepLearning_model_R_1450988588982_1165',
#            'DeepLearning_model_R_1450988588915_270',
#            'DeepLearning_model_R_1450988591812_765',
#            'DeepLearning_model_R_1450988591832_925',
#            'DeepLearning_model_R_1450988591836_550',
#            'DeepLearning_model_R_1450988591813_1130',
#            'DeepLearning_model_R_1450988589595_85',
#            'DeepLearning_model_R_1450988591823_930',
#            'DeepLearning_model_R_1450988591855_1175',
#            'DeepLearning_model_R_1450988588948_1170',
#            'DeepLearning_model_R_1450989100335_385'
#            )
#
# # choose best model
# best_err = .Machine$double.xmax
# for (i in 1:length(models)) {
#   m = load_model(models[i])
#   err <- m@model$validation_metrics@metrics$mean_residual_deviance
#   print(m@model_id)
#   print(err)
#   print(m@model$validation_metrics@metrics$r2)
#   print(m@parameters$hidden)
#   print(m@allparameters$activation)
#   if (err < best_err) {
#     best_err <- err
#     best_model <- m
#     # print(m@model_id)
#   }
# }
# m = best_model
# sensor_col = m@parameters$y

# Analyse forecasts -------------------------------------------------------
pr <- h2o.predict(m, newdata = d.test)
y_dl = as.vector(as.data.frame(pr)$predict)
saveRDS(y_dl, "fit_data/y_dl_m_8_lasso.rds")

y_dl = readRDS("fit_data/y_dl_m_8_lasso.rds")

pr <- h2o.predict(m, newdata = d.train)
y_dl_t = as.vector(as.data.frame(pr)$predict)
saveRDS(y_dl_t, "fit_data/y_dl_t_m_8_lasso.rds")

y_dl_t = readRDS("fit_data/y_dl_t_m_8_lasso.rds")


# metrics() is defined in find_var.r
metrics(as.vector(d.test[,sensor_col]), y_dl)
metrics(as.vector(d.train[,sensor_col]), y_dl_t)

# days = yday(ld.test$dt)


# Try smoothing -----------------------------------------------------------


# residual corrected forecast
# res = ld.test[, sensor_col] - y_dl
# y_dl_corr = y_dl
# sz = dim(ld.test)
# n.test = length(y_dl)
# y_dl_corr = y_dl
# # library(forecast)
# ets_horizon = 20
# l1t_horizon = 30
# corr = rep(0, n.test-horizon-1)
# # for (i in ets_horizon:(n.test-horizon-1))


# for (i in l1t_horizon:(n.test-horizon-1))
# {
#   if (i %% 1000 == 0)
#     print(i)
#   res.trend = l1tf(res[(i-l1t_horizon + 1):i], 0.01) # estimate trend in residuals
#   # forecast using the trend
#   a = sum(rep(0.25,4)*res.trend[(l1t_horizon-4+1):l1t_horizon]) # - res.trend[l1t_horizon-1]
#   corr[i] = a
#   y_dl_corr[i+2+1] = y_dl_corr[i+2+1] + a
# }

# for (i in ets_horizon:(n.test-horizon-1))
# {
#   if (i %% 1000 == 0)
#     print(i)
#   res.ets = holt(res[(i-ets_horizon + 1):i], initial = 'simple', damped = T, h = horizon+1)
#   # y_dl_corr[i+horizon+1] = res.ets$mean[horizon+1] + y_dl_corr[i+horizon+1]
#   y_dl_corr[i+2+1] = res.ets$mean[2+1] + y_dl_corr[i+2+1]
# }
# mean(abs(ld.test[, sensor_col] - y_dl_corr)/ld.test[, sensor_col])
# mean(abs(ld.test[, sensor_col] - y_dl)/ld.test[, sensor_col])
# sqrt(sum((ld.test[, sensor_col] - y_dl_corr)^2))
# sqrt(sum((ld.test[, sensor_col] - y_dl)^2))
# n = length(ld.test[, sensor_col])
# y_naive = ld.test[1:(n-horizon), sensor_col]
# mean(abs(ld.test[(horizon+1):n, sensor_col] - y_naive)/ld.test[(horizon+1):n, sensor_col])
#
# plot(y_dl_corr - ld.test[, sensor_col])
# qqnorm(y_dl_corr - ld.test[, sensor_col])
# qqline(y_dl_corr - ld.test[, sensor_col])
# res_corr = ld.test[, sensor_col] - y_dl_corr
# plot(density(ld.test[, sensor_col] - y_dl))
# plot(density(ld.test[, sensor_col] - y_dl_corr))



# Plot --------------------------------------------------------------------


days = yday(ld.test$dt)
# s11 = ld[,c(1,11)]
# s11$wday = lubridate::wday(s11$dt)
# s11$yday = lubridate::yday(s11$dt)
# s11$min0 = 60*lubridate::hour(s11$dt) + lubridate::minute(s11$dt)
# as = ddply(s11[which(s11$yday < 345 & s11$yday > 200),], "min0", summarise, avg_speed = mean(N6043, na.rm = TRUE))$avg_speed
# bears is 283 (10/10/2013, Thursday)
# weather is 345 (12/11/2013, Wednesday)
# for (day in unique(days))
for (day in c(280, 283, 345))
{
  rows = which(days == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  par(mar = c(0,0,0,0))
  x = ld.test[forward_rows, 1]
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  # par(mar = c(4.5,4,2,0))
  lwd = 4
  plot(x, 2.23694*ld.test[forward_rows, sensor_col], type='l', lty=1, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ylim=c(5,80), xaxt='n', yaxt = 'n', ann=FALSE, bty="n")
  lines(x, 2.23694*y_dl[forward_rows], type='l', lty=2, col=2, lwd=lwd) # there is no time column
  # lines(x, y_dl_corr[forward_rows], type='l', lty=3, col=3, lwd=4)
  # lines(x, as[(1+horizon):n_rows], type='l', lty=3, col=3, lwd=lwd)
  lines(x, 2.23694*ld.test[rows[1:(n_rows - horizon)], sensor_col], lty=4, col=4, lwd=lwd)
  # legend("bottomleft", legend = c('data', 'dl', 'dl_corr','const'), lty=c(1,2,3,4), col=c(1,2,3,4), lwd=3)
  # legend("bottomleft", legend = c('data', 'dl', 'const'), lty=c(1,2,4), col=c(1,2,4), lwd=3)
  nme = paste(sensor_col, toString(day), 'dl', combo, sep='_')
  abline(55, 0, col='green', lwd=3, lty=4)
  abline(v=8, col="orange", lty=3, lwd=4)
  # utils$save_pdf(path='paper/fig/', name = nme)

  # plot residuals over time
  res = 2.23694*ld.test[forward_rows, sensor_col] - 2.23694*y_dl[forward_rows]
  par(mar = c(2,3,0,0))
  plot(x, 2.23694*ld.test[forward_rows, sensor_col], type='l', col="black", lwd=lwd, ylim=c(0,67), cex.axis=2.5)
  lines(x, abs(res), type='p', col="red", lwd=lwd, lty=4, yaxt="n", pch=16, cex=1)
  utils$save_pdf(path='paper/fig/', name = paste(nme, "res", sep="_"))
}


# plot residuals
library(tseries) # for adf.test
days = yday(ld.train$dt) # this loop uses training data, so index by ld.train days
# for (day in c(84, 203, 184, 24, 59, 46))
for (day in c(46))
{
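  # residual diagnostics on a training day (day numbers are day-of-year),
  # using the cached training-set predictions y_dl_t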
# plot residuals
library(tseries) # for adf.test below
days = yday(ld.train$dt) # in-sample residuals, so index days on the training set
# for (day in c(84, 203, 184, 24, 59, 46))
for (day in c(46))
{
  rows = which(days == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  x = ld.train[forward_rows, 1]
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  data = 2.23694*ld.train[forward_rows, sensor_col]
  fcast = 2.23694*y_dl_t[forward_rows]
  # par(mar = c(4.5,4,2,0))
  par(mar = c(2,2,0,0), mfrow=c(1,1))
  lwd = 4
  res = data - fcast
  plot(x, 100*res/data, type='p', lty=1, col=1, lwd=lwd, xlab='time', ylab='residual [%]', ann=T, bty="n", main="") # names(d)[sensor_col]
  qqnorm(res)
  qqline(res)
  acf(res)
  plot(x, data, type='l', lty=1, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ylim=c(5,80), xaxt='n', yaxt='n', ann=FALSE, bty="n")
  lines(x, fcast, type='l', lty=2, col=2, lwd=lwd)
  # utils$save_pdf(path='paper/fig/', name = nme)
  print(adf.test(res, alternative = "stationary"))
}

--------------------------------------------------------------------------------
/find_var.r:
--------------------------------------------------------------------------------
library(lubridate)
library(lars)
library(SparseM)
library(tseries) # for adf.test
# install.packages('lars')
# install.packages('SparseM')
source('~/Dropbox/utils.R')
setwd('~/Dropbox/papers/dl-traffic/')
source('src/fit.r')
source('src/data_processing.r')
source('src/data_parameters.r')
source('src/data_load.r')

combo = 'm_8_var'
# path = 'data/2013/gcm21_i_tf_20.rds'
# path = 'data/2013/gcm21_i_m_8.rds'
# 'data/2013/gcm21_i_tfs_15.rds' = "https://app.box.com/shared/static/bt69dviydld3o3k87max5od5zhnihbf6.rds"

# needs to be called only once!
#source('src/data_load.r')

first_predictor = 1 + (sz[2]-1)*horizon + 1 # horizon*5 min forecast
last_predictor = dim(ld)[2]
pred_names = names(ld)[first_predictor:last_predictor]



### LARS FIT ###
A = matrix(0, ncol = length(first_predictor:last_predictor), nrow = length(2:dim(d)[2]))
for (i in 2:dim(d)[2])
{
  A[i-1,] = fit_lars(i, first_predictor, last_predictor, ld.train)
}
# saveRDS(A, 'data/Lasso_A.rds')
# A = readRDS('data/Lasso_A.rds')
A = readRDS(gzcon(url('https://app.box.com/shared/static/9r4pc149wz2y6whxmdu67c0jf0pgm8it.rds')))
par(mar=c(2,2,.5,.5), mai = c(0.6,0.6,0.2,0.2), mfrow=c(1,1))
# image(as.matrix.csr(A), col=heat.colors(40,1), ann=F, cex.axis=1.5)
image(as.matrix.csr(A), col=c("red","white"), ann=F, cex.axis=1.5)
mtext(side = 1, text = "column", line = 2, cex=1.5)
mtext(side = 2, text = "row", line = 2, cex=1.5)
utils$save_pdf('paper/fig/', name = 'lasso_sparsity', lheight = utils$pdf_h/2)
# share of zero (sparse) entries
sum(A < 0.000001)/(dim(A)[1]*dim(A)[2])
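# fit_lars() is defined in src/fit.r (sourced above, not shown in this file).
# A minimal sketch of what a per-column lasso fit with the lars package could
# look like -- the fixed L1-norm fraction `frac` is an illustrative assumption
# (fit.r may well select it by cross-validation instead):
fit_lars_sketch = function(target_col, first_predictor, last_predictor, train, frac = 0.5)
{
  X = as.matrix(train[, first_predictor:last_predictor]) # lagged predictors
  y = as.double(train[, target_col])                     # one sensor's speed
  fit = lars::lars(X, y, type = "lasso")
  # coefficient vector on the lasso path at a fraction of the full L1 norm
  as.double(coef(fit, s = frac, mode = "fraction"))
}
# drop-in usage, analogous to the loop above:
# A[i-1,] = fit_lars_sketch(i, first_predictor, last_predictor, ld.train)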
### Predictions ###
# n = 4884
n = dim(ld.test)[1]
y_lars = matrix(0, n, dim(d)[2]-1)
for (i in 1:n)
{
  x = as.double(ld.test[i, first_predictor:last_predictor])
  y_lars[i,] = A %*% x
}

n1 = dim(ld.train)[1]
y_lars1 = matrix(0, n1, dim(d)[2]-1)
for (i in 1:n1)
{
  x = as.double(ld.train[i, first_predictor:last_predictor])
  y_lars1[i,] = A %*% x
}


saveRDS(y_lars, "fit_data/y_lars.rds")
saveRDS(y_lars1, "fit_data/y_t_lars.rds")

y_lars1 = readRDS("fit_data/y_t_lars.rds")
y_lars = readRDS("fit_data/y_lars.rds")
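# The two loops above apply A row by row; the same predictions come from a
# single matrix product (a sketch, assuming the predictor columns are numeric):
X_test  = as.matrix(ld.test[, first_predictor:last_predictor])
X_train = as.matrix(ld.train[, first_predictor:last_predictor])
y_lars  = X_test %*% t(A)  # n x (dim(d)[2]-1), identical to the loop result
y_lars1 = X_train %*% t(A)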
metrics = function(data, model)
{
  n = length(data)
  n1 = length(model)
  if (n1 != n)
  {
    print("Dimension mismatch")
    return()
  }
  davg = mean(data)
  sstot = sum( (data - davg)^2 )
  ssmod = sum( (model - davg)^2 )
  ssres = sum( (model - data)^2 )
  rsq = 1 - ssres/sstot
  mse = ssres/n
  return(list(rsq = rsq, mse = mse))
}
metrics(ld.train[,11], y_lars1[,10])
metrics(ld.test[,11], y_lars[,10])


days = yday(ld.test$dt)
# Bears game: day 283 (10/10/2013, Thursday)
# weather event: day 345 (12/11/2013, Wednesday)
# for (day in unique(days))
# for (day in c(280, 283, 305, 340, 331, 345))
for (day in c(280, 283, 345))
{
  rows = which(days == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  x = ld.test[forward_rows, 1]
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  par(mar = c(0,0,0,0))
  lwd = 3
  plot(x, 2.23694*ld.test[forward_rows, sensor_col], type='l', lty=1, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ylim=c(5,80), xaxt='n', yaxt='n', ann=FALSE, bty="n") # names(d)[sensor_col]
  lines(x, 2.23694*y_lars[forward_rows, (sensor_col-1)], type='l', lty=2, col=2, lwd=lwd) # y_lars has no time column
  lines(x, 2.23694*ld.test[rows[1:(n_rows - horizon)], sensor_col], lty=4, col=4, lwd=lwd)
  abline(55, 0, col='green', lwd=3, lty=4)
  abline(v=8, col="orange", lty=3, lwd=4)
  # legend("bottomleft", legend = c('data', 'dl', 'const'), lty=c(1,2,4), col=c(1,2,4), lwd=3)
  utils$save_pdf(path='paper/fig/', name = paste(sensor_col, toString(day), 'dl', combo, sep='_'), lwidth = utils$pdf_w, lheight = utils$pdf_h)

  # plot absolute residuals over time
  res = 2.23694*ld.test[forward_rows, sensor_col] - 2.23694*y_lars[forward_rows, (sensor_col-1)]
  par(mar = c(2,3,0,0))
  plot(x, 2.23694*ld.test[forward_rows, sensor_col], type='l', col="black", lwd=lwd, ylim=c(0,67), cex.axis=2.5)
  lines(x, abs(res), type='p', col="red", lwd=lwd, lty=4, yaxt="n", pch=16, cex=1)
  utils$save_pdf(path='paper/fig/', name = paste(sensor_col, toString(day), 'dl', combo, "res", sep='_'))
}


# plot residuals
days = yday(ld.train$dt)
for (day in c(84, 203, 184, 24, 59, 46))
{
  rows = which(days == day)
  n_rows = length(rows)
  forward_rows = rows[(1+horizon):n_rows]
  x = ld.train[forward_rows, 1] # in-sample residuals, so use the training set
  x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
  data = 2.23694*ld.train[forward_rows, sensor_col]
  fcast = 2.23694*y_lars1[forward_rows, (sensor_col-1)]
  par(mar = c(1,0,4,0), mfrow=c(2,2))
  lwd = 4
  res = data - fcast
  plot(x, res, type='p', lty=1, col=1, lwd=lwd, xlab='time', ylab='residual [mi/h]', main=toString(day), xaxt='n', yaxt='n', ann=T, bty="n")
  qqnorm(res)
  qqline(res)
  acf(res)
  plot(x, data, type='l', lty=1, col=1, lwd=lwd, xlab='time', ylab='speed [mi/h]', main=toString(day), ylim=c(5,80), xaxt='n', yaxt='n', ann=FALSE, bty="n")
  lines(x, fcast, type='l', lty=2, col=2, lwd=lwd)
  print(adf.test(res, alternative = "stationary"))

  # utils$save_pdf(path='paper/fig/', name = paste(sensor_col, toString(day), 'dl', combo, sep='_'), lwidth = utils$pdf_w, lheight = utils$pdf_h)
}




#
#
# # for (sensor_col in 2:sz[2])
# # {
# for (day in unique(days))
# {
#   # print(day)
#   rows = which(days == day)
#   n_rows = length(rows)
#   # print(n_rows)
#   # rows = rows[]
#   forward_rows = rows[(1+horizon):n_rows]
#   par(mar = c(3.5,4,2,0))
#   x = ld[forward_rows, 1]
#   x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
#   par(mar=c(4,4,4,0))
#   plot(x, ld[forward_rows, sensor_col], type='l', lty=1, col=1, lwd=2, xlab='time', ylab='speed [m/s]', main=toString(day), ylim=c(2,30)) # names(d)[sensor_col]
#   lines(x, y_lars[forward_rows, sensor_col-1], type='l', lty=2, col=2, lwd=2) # y_lars has no time column
#   # lines(x, y_dl[forward_rows, sensor_col-1], type='l', lty=3, col=3, lwd=2)
#   lines(x, ld[rows[1:(n_rows - horizon)], sensor_col], lty=3, col=3, lwd=2)
#   legend("bottomleft", legend = c('data', 'var', 'const'), lty=1:3, col=1:3, lwd=3)
#   # save_pdf(path='~/Dropbox/papers/bayes-traffic/fig/forecast/', name = paste(nvec[sensor_col], toString(day), sep='_'))
# }
# # }
#
#
# # playground --------------------------------------------------------------
#
# # plot train data
# # days = yday(ld.train$dt)
# # for (day in unique(days))
# # {
# #   rows = which(days == day)
# #   n_rows = length(rows)
# #   par(mar = c(3.5,4,2,0))
# #   x = ld.train[rows, 1]
# #   x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
# #   par(mar = c(4.5,4,2,0))
# #   plot(x, ld.train[rows, sensor_col], type='l', lty=1, col=1, lwd=4, xlab='time', ylab='speed [m/s]', main=toString(day), ylim=c(2,40)) # names(d)[sensor_col]
# # }
# # # plot test data
# # days = yday(ld.test$dt)
# # for (day in unique(days))
# # {
# #   rows = which(days == day)
# #   n_rows = length(rows)
# #   par(mar = c(3.5,4,2,0))
# #   x = ld.test[rows, 1]
# #   x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
# #   par(mar = c(4.5,4,2,0))
# #   plot(x, ld.test[rows, sensor_col], type='l', lty=1, col=1, lwd=4, xlab='time', ylab='speed [m/s]', main=toString(day), ylim=c(2,40)) # names(d)[sensor_col]
# # }
#
#
#
# m = h2o.loadModel(paste(normalizePath('~'),"/Dropbox/papers/bayes-traffic/models/2009/DeepLearning_model_R_1444555786512_8743", sep=''), conn=localH2O)
#
#
# pr <- h2o.predict(m, newdata = d.test)
# y_dl = as.vector(as.data.frame(pr)$predict)
# days = yday(ld.test$dt)
#
#
# # residual corrected forecast
# res = ld.test[, sensor_col] - y_dl
# sz = dim(ld.test)
# library(forecast)
# n.test = length(y_dl)
# y_dl_corr = y_dl
# for (i in 20:(n.test-horizon-1))
# {
#   if (i %% 1000 == 0)
#     print(i)
#   res.ets = holt(res[(i-19):i], initial = 'optimal', damped = T, h = horizon+1)
#   y_dl_corr[i+horizon+1] = res.ets$mean[horizon+1] + y_dl_corr[i+horizon+1]
# }
# # saveRDS(y_dl_corr, file = '~/Dropbox/papers/bayes-traffic/data/y_dl_corr.rds')
# # y_dl_corr = readRDS('~/Dropbox/papers/bayes-traffic/data/y_dl_corr.rds')
# 100*mean(abs(ld.test[, sensor_col] - y_dl_corr)/ld.test[, sensor_col])
# 100*mean(abs(ld.test[, sensor_col] - y_dl)/ld.test[, sensor_col])
# sum((ld.test[, sensor_col] - y_dl_corr)^2)
# sum((ld.test[, sensor_col] - y_dl)^2)
#
# # plot(y_dl_corr - ld.test[, sensor_col])
# # qqnorm(y_dl_corr - ld.test[, sensor_col])
# # qqline(y_dl_corr - ld.test[, sensor_col])
# #
# # res_corr = ld.test[, sensor_col] - y_dl_corr
# # plot(density(ld.test[, sensor_col] - y_dl))
# # plot(density(ld.test[, sensor_col] - y_dl_corr))
#
#
#
# days = yday(ld.test$dt)
# for (day in unique(days))
# {
#   rows = which(days == day)
#   n_rows = length(rows)
#   forward_rows = rows[(1+horizon):n_rows]
#   par(mar = c(3.5,4,2,0))
#   x = ld.test[forward_rows, 1]
#   x = (60*lubridate::hour(x) + lubridate::minute(x))/60.0
#   par(mar = c(4.5,4,2,0))
#   plot(x, ld.test[forward_rows, sensor_col], type='l', lty=1, col=1, lwd=4, xlab='time', ylab='speed [m/s]', main=toString(day), ylim=c(2,40)) # names(d)[sensor_col]
#   lines(x, y_dl[forward_rows], type='l', lty=2, col=2, lwd=4) # y_dl has no time column
#   lines(x, y_dl_corr[forward_rows], type='l', lty=3, col=3, lwd=4) # y_dl_corr has no time column
#   lines(x, ld.test[rows[1:(n_rows - horizon)], sensor_col], lty=4, col=4, lwd=4)
#   # lines(x, 0.3*ld.test[rows[1:(n_rows - horizon)], sensor_col] + 0.7*y_dl[forward_rows], lty=4, col=4, lwd=2)
#   legend("bottomleft", legend = c('data', 'dl', 'dl+ets', 'const'), lty=1:4, col=1:4, lwd=3)
#   utils$save_pdf(path='paper/fig/forecast/', name = paste(sensor_col, toString(day), 'dl', combo, sep='_'))
# }
#
#
#
# # Demo for the paper ------------------------------------------------------
#
#
# demo_day = 77
# rows = which(days == demo_day)
# n_rows = length(rows)
# demo = data.frame(dt = ld.test[rows,1], data = ld.test[rows,sensor_col], fc = y_dl_corr[rows])
# demo$min0 = lubridate::minute(demo$dt) + 60*lubridate::hour(demo$dt)
# demo$hour0 = seq(0,24, length.out = n_rows)
# demo$rows = rows
# plot(demo$hour0, demo$data, type='l')
# lines(demo$hour0, demo$fc)
#
# plot(density(demo$data - demo$fc))
#
#
# vikings_rows = which(lubridate::month(d$dt) == 12 & lubridate::day(d$dt) == 28)
# packers_rows = which(lubridate::month(d$dt) == 12 & lubridate::day(d$dt) == 13)
# ice_rows = which(lubridate::month(d$dt) == 1 & lubridate::day(d$dt) == 16)
# ice_rows1 = which(lubridate::month(d$dt) == 1 & lubridate::day(d$dt) == 9)
#
# yday(d[ice_rows[10],1])
#
# plot(d[ice_rows, 11])
# plot(d[ice_rows1, 11])
# show(m)
# pr <- h2o.predict(m, newdata = d.train)
# in_sample = as.vector(as.data.frame(pr)$predict)
# in_sample_res = in_sample - ld.train[,sensor_col]
#
# library(forecast)
# res.m = auto.arima(in_sample_res)
#
# train_rows = 600:700 # note: shadows the global train_rows/test_rows above
# test_rows = 701:710
# plot(train_rows, in_sample_res[train_rows], xlim = c(min(train_rows),max(test_rows)), ylim = c(-4,3.5))
# fc.ses = ses(in_sample_res[train_rows], h = 10, alpha = 0.2, initial = 'simple')
# fc.holt = holt(in_sample_res[train_rows], h = 10, alpha = 0.5, beta = 0.5, initial = 'simple', damped = T)
# print(in_sample_res[test_rows])
# lines(test_rows, in_sample_res[test_rows], col=2, lwd=5)
# lines(test_rows, fc.ses$mean, col = 3, lwd = 5)
# lines(test_rows, fc.holt$mean, col = 4, lwd = 5)
#
#
#
#
#
# res = d.test[forward_rows, sensor_col] - y_dl[forward_rows]
# plot(x, res, ylab='Residual', xlab='Time')
# save_pdf('~/Dropbox/papers/bayes-traffic/fig/', 'dl_error')
# acf(res)
# qqnorm(res)
# qqline(res)
# fit = auto.arima(res)
# plot(residuals(fit))
# qqnorm(residuals(fit))
# test_rec_error <- as.data.frame(h2o.anomaly(d.test, m))



--------------------------------------------------------------------------------