├── .gitignore ├── Chapter10.rmd ├── Chapter11.rmd ├── Chapter12.rmd ├── Chapter2.rmd ├── Chapter3.rmd ├── Chapter5.rmd ├── Chapter6.rmd ├── Chapter7.rmd ├── Chapter8.rmd ├── Chapter9.rmd └── readme.rmd /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.xlsx -------------------------------------------------------------------------------- /Chapter10.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 10 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | library(hts) 7 | 8 | ``` 9 | 10 | 1. Write out the S matrices for the Australian tourism hierarchy and the Australian prison grouped structure. Use the smatrix command to verify your answers. 11 | 12 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question1} 13 | 14 | # Australian tourism visitor nights data 15 | # - used development version of fpp package to use visnight data(2018/2/22). Run below 3 rows of codes without #s to download it. 16 | # install.packages("devtools") 17 | # library(devtools) 18 | # devtools::install_github("robjhyndman/fpp") 19 | str(visnights) 20 | head(visnights) 21 | 22 | # make a hierachical time series using visnights time series matrix. 23 | # I'm going to aggregate data using the position of characters. The first three characters of each column name of visnights will be the categories at the first level of the hierarchy (States). The following five characters will be the bottom-level categories (Zones). 24 | tourism.hts <- hts(visnights, characters = c(3, 5)) 25 | str(tourism.hts) 26 | # There are 3 levels. And there are 6 nodes for level 1 and 20 nodes for level 2. It means that the summing matrix S is 27 x 20 matrix. 27 | # S = 28 | # |1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1| <- level 0 29 | # |1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0| <- row 2-7 30 | # |0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0| (level 1) 31 | # |0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0| 32 | # |0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0| 33 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0| 34 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1| 35 | # |1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0| <- row 8-27 36 | # |0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0| (level 2) 37 | # |0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0| 38 | # |0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0| 39 | # |0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0| 40 | # |0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0| 41 | # |0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0| 42 | # |0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0| 43 | # |0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0| 44 | # |0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0| 45 | # |0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0| 46 | # |0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0| 47 | # |0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0| 48 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0| 49 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0| 50 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0| 51 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0| 52 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0| 53 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0| 54 | # |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1| 55 | 56 | # calculate S matrix using smatrix function. 57 | smatrix(tourism.hts) 58 | 59 | # Australian prisoner numbers data 60 | str(prison) 61 | head(prison) 62 | 63 | # make a grouped time series using prison time series matrix. It will be a grouped time series, not hierarchical time series. It is because the gender and the legal status are different attributes. 
They make them different hierarchies, but need to be combined at the top level. 64 | # I'm going to aggregate data using the position of characters again. The first three characters of each column name of prison will be the categories states categories. The 4th character will be the gender categories, and the following 9 characters will be the legal status categories. 65 | prison.gts <- gts(prison, characters = c(3, 1, 9)) 66 | str(prison.gts) 67 | 68 | # There are 3 levels. And there are 32 nodes for level 2(bottom-level). G1, G2, G3, G4, G5, G6 are level 1 and there are 48 nodes. It means that the summing matrix S is 81 x 32 matrix. 69 | # S = 70 | # | 1111 1111 1111 1111 1111 1111 1111 1111 | <- level 0 71 | # | 1111 0000 0000 0000 0000 0000 0000 0000 | <- level 1 72 | # | 0000 1111 0000 0000 0000 0000 0000 0000 | (G1) 73 | # | 0000 0000 1111 0000 0000 0000 0000 0000 | 74 | # | 0000 0000 0000 1111 0000 0000 0000 0000 | 75 | # | 0000 0000 0000 0000 1111 0000 0000 0000 | 76 | # | 0000 0000 0000 0000 0000 1111 0000 0000 | 77 | # | 0000 0000 0000 0000 0000 0000 1111 0000 | 78 | # | 0000 0000 0000 0000 0000 0000 0000 1111 | 79 | # | 1100 1100 1100 1100 1100 1100 1100 1100 | <- level 1 80 | # | 0011 0011 0011 0011 0011 0011 0011 0011 | (G2) 81 | # | 1010 1010 1010 1010 1010 1010 1010 1010 | <- level 1 82 | # | 0101 0101 0101 0101 0101 0101 0101 0101 | (G3) 83 | # | 1100 0000 0000 0000 0000 0000 0000 0000 | <- level 1 84 | # | 0011 0000 0000 0000 0000 0000 0000 0000 | (G4) 85 | # | 0000 1100 0000 0000 0000 0000 0000 0000 | 86 | # | 0000 0011 0000 0000 0000 0000 0000 0000 | 87 | # | 0000 0000 1100 0000 0000 0000 0000 0000 | 88 | # | 0000 0000 0011 0000 0000 0000 0000 0000 | 89 | # | 0000 0000 0000 1100 0000 0000 0000 0000 | 90 | # | 0000 0000 0000 0011 0000 0000 0000 0000 | 91 | # | 0000 0000 0000 0000 1100 0000 0000 0000 | 92 | # | 0000 0000 0000 0000 0011 0000 0000 0000 | 93 | # | 0000 0000 0000 0000 0000 1100 0000 0000 | 94 | # | 0000 0000 0000 0000 0000 0011 0000 0000 | 95 | # | 0000 0000 0000 0000 0000 0000 1100 0000 | 96 | # | 0000 0000 0000 0000 0000 0000 0011 0000 | 97 | # | 0000 0000 0000 0000 0000 0000 0000 1100 | 98 | # | 0000 0000 0000 0000 0000 0000 0000 0011 | 99 | # | 1010 0000 0000 0000 0000 0000 0000 0000 | <- level 1 100 | # | 0101 0000 0000 0000 0000 0000 0000 0000 | (G5) 101 | # | 0000 1010 0000 0000 0000 0000 0000 0000 | 102 | # | 0000 0101 0000 0000 0000 0000 0000 0000 | 103 | # | 0000 0000 1010 0000 0000 0000 0000 0000 | 104 | # | 0000 0000 0101 0000 0000 0000 0000 0000 | 105 | # | 0000 0000 0000 1010 0000 0000 0000 0000 | 106 | # | 0000 0000 0000 0101 0000 0000 0000 0000 | 107 | # | 0000 0000 0000 0000 1010 0000 0000 0000 | 108 | # | 0000 0000 0000 0000 0101 0000 0000 0000 | 109 | # | 0000 0000 0000 0000 0000 1010 0000 0000 | 110 | # | 0000 0000 0000 0000 0000 0101 0000 0000 | 111 | # | 0000 0000 0000 0000 0000 0000 1010 0000 | 112 | # | 0000 0000 0000 0000 0000 0000 0101 0000 | 113 | # | 0000 0000 0000 0000 0000 0000 0000 1010 | 114 | # | 0000 0000 0000 0000 0000 0000 0000 0101 | 115 | # | 1000 1000 1000 1000 1000 1000 1000 1000 | <- level 1 116 | # | 0100 0100 0100 0100 0100 0100 0100 0100 | (G6) 117 | # | 0010 0010 0010 0010 0010 0010 0010 0010 | 118 | # | 0001 0001 0001 0001 0001 0001 0001 0001 | 119 | # | 1000 0000 0000 0000 0000 0000 0000 0000 | <- level 2 120 | # | 0100 0000 0000 0000 0000 0000 0000 0000 | 121 | # | 0010 0000 0000 0000 0000 0000 0000 0000 | 122 | # | 0001 0000 0000 0000 0000 0000 0000 0000 | 123 | # | 0000 1000 0000 0000 0000 0000 0000 0000 | 124 
| # | 0000 0100 0000 0000 0000 0000 0000 0000 | 125 | # | 0000 0010 0000 0000 0000 0000 0000 0000 | 126 | # | 0000 0001 0000 0000 0000 0000 0000 0000 | 127 | # | 0000 0000 1000 0000 0000 0000 0000 0000 | 128 | # | 0000 0000 0100 0000 0000 0000 0000 0000 | 129 | # | 0000 0000 0010 0000 0000 0000 0000 0000 | 130 | # | 0000 0000 0001 0000 0000 0000 0000 0000 | 131 | # | 0000 0000 0000 1000 0000 0000 0000 0000 | 132 | # | 0000 0000 0000 0100 0000 0000 0000 0000 | 133 | # | 0000 0000 0000 0010 0000 0000 0000 0000 | 134 | # | 0000 0000 0000 0001 0000 0000 0000 0000 | 135 | # | 0000 0000 0000 0000 1000 0000 0000 0000 | 136 | # | 0000 0000 0000 0000 0100 0000 0000 0000 | 137 | # | 0000 0000 0000 0000 0010 0000 0000 0000 | 138 | # | 0000 0000 0000 0000 0001 0000 0000 0000 | 139 | # | 0000 0000 0000 0000 0000 1000 0000 0000 | 140 | # | 0000 0000 0000 0000 0000 0100 0000 0000 | 141 | # | 0000 0000 0000 0000 0000 0010 0000 0000 | 142 | # | 0000 0000 0000 0000 0000 0001 0000 0000 | 143 | # | 0000 0000 0000 0000 0000 0000 1000 0000 | 144 | # | 0000 0000 0000 0000 0000 0000 0100 0000 | 145 | # | 0000 0000 0000 0000 0000 0000 0010 0000 | 146 | # | 0000 0000 0000 0000 0000 0000 0001 0000 | 147 | # | 0000 0000 0000 0000 0000 0000 0000 1000 | 148 | # | 0000 0000 0000 0000 0000 0000 0000 0100 | 149 | # | 0000 0000 0000 0000 0000 0000 0000 0010 | 150 | # | 0000 0000 0000 0000 0000 0000 0000 0001 | 151 | 152 | # calculate S matrix using smatrix function. 153 | smatrix(prison.gts) 154 | 155 | ``` 156 | 157 | 158 | 3. Generate 8-step-ahead bottom-up forecasts using ARIMA models for the visnights Australian domestic tourism data. Plot the coherent forecasts by level and comment on their nature. Are you satisfied with these forecasts? 159 | 160 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question3} 161 | 162 | # Forecast 163 | fc_visnights_arima <- forecast( 164 | tourism.hts, h = 8, method = "bu", fmethod = "arima" 165 | ) 166 | 167 | # Plot the coherent forecasts by level. 168 | plot(fc_visnights_arima, levels = 0, color_lab = TRUE) 169 | title(main = "Total visitor nights") 170 | # show strong seasonality. 171 | 172 | plot(fc_visnights_arima, levels = 1, color_lab = TRUE) 173 | title(main = "Grouped by State") 174 | # On average, the number of visitors to New South Wales were biggest. However, in 3rd quarters, the number of visitors to Queensland were bigger than the one for NSW. 175 | # The number of visitors to NSW, VIC, WAU show strong seasonality, but the numbers of WAU and Other had weaker seasonality. 176 | 177 | # level 2 coherent forecasts are equal to the base forecasts. 178 | plot(fc_visnights_arima, levels = 2, color_lab = TRUE) 179 | title(main = "Grouped by Zone in each state") 180 | # The number of visitors to Queensland Metro area were the biggest at all times. 181 | 182 | # I'm satisfied with the forecasts. 183 | 184 | ``` 185 | 186 | 4. Model the aggregate series for Australian domestic tourism data visnights using an ARIMA model. Comment on the model. Generate and plot 8-step-ahead forecasts from the ARIMA model and compare these with the bottom-up forecasts generated in question 3 for the aggregate level. 187 | 188 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question4} 189 | str(visnights) 190 | head(visnights) 191 | 192 | # aggregate visnights time series matrix. 
193 | visnights_total <- rowSums(visnights) 194 | visnights_total.ts <- ts(visnights_total, 195 | start = 1998, 196 | frequency = 4) 197 | str(visnights_total.ts) 198 | 199 | # model the aggregated data using ARIMA model and forecast. 200 | visnights_total_autoarima <- auto.arima( 201 | visnights_total.ts 202 | ) 203 | 204 | autoplot(visnights_total.ts) + 205 | autolayer(visnights_total_autoarima$fitted) 206 | # It looked like the model didn't fit well to the aggregated data. 207 | 208 | fc_visnights_total_autoarima <- forecast( 209 | visnights_total_autoarima, h = 8 210 | ) 211 | 212 | fc_visnights_total_autoarima$model 213 | # ARIMA(0, 1, 1)(0, 1, 1)[4] model was chosen. 214 | 215 | # get ts object of bottom-up forecasts. 216 | visnights_total_bu.ts <- ts( 217 | rowSums(fc_visnights_arima$bts), 218 | start = 2017, 219 | frequency = 4) 220 | 221 | autoplot(fc_visnights_total_autoarima) + 222 | autolayer(visnights_total_bu.ts) 223 | # The forecasts have increasing trend with strong seasonality. Bottom-up forecasts had non-increasing trend that the values were lower. Whether the lower level data are considered made such difference in forecasting. 224 | 225 | ``` 226 | 227 | 5. Generate 8-step-ahead optimally reconciled coherent forecasts using ARIMA base forecasts for the visnights Australian domestic tourism data. Plot the coherent forecasts by level and comment on their nature. How and why are these different to the bottom-up forecasts generated in question 3 above. 228 | 229 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question5} 230 | 231 | # I want to do optimally reconciled coherent forecasts using MinT estimator. I need to decide (base forecast) covariance model for the estimator. 232 | str(visnights) 233 | str(smatrix(tourism.hts)) 234 | # The number of time points(T) is 76 and the number of rows(m) in S matrix is 27. T >= m. Therefore I'm going to use sample covariance. 235 | 236 | # forecast 237 | fc_visnights_arima_opt <- forecast( 238 | tourism.hts, h = 8, 239 | method = "comb", weights = "mint", covariance = "sam", 240 | fmethod = "arima" 241 | ) 242 | 243 | # Plot the coherent forecasts by level. 244 | plot(fc_visnights_arima_opt, levels = 0, col = "red") 245 | par(new = TRUE, xpd = TRUE) 246 | plot(fc_visnights_arima, levels = 0, col = "blue") 247 | title(main = "Total number of visitors") 248 | legend("bottomright", legend = c("Optimal", "Bottom-up"), title = "Coherent forecast", col = c("red", "blue"), lty = c(1, 1), bty = "n", cex = 0.5) 249 | # Top level coherent forecasts show that the optimally reconciled method yielded higher forecast values than the bottom-up method. But the values aren't bigger than the forecasts of aggregated data's ARIMA model. 250 | # It happened because the top level forecasts of aggregated data's ARIMA model affected the optimal values smaller while the aggregated value from bottom-level forecasts affected the values bigger. 251 | 252 | plot(fc_visnights_arima_opt, levels = 1, color_lab = TRUE) 253 | plot(fc_visnights_arima, levels = 1, color_lab = TRUE) 254 | # Level 1 coherent forecasts show that the optimally reconciled method yielded higher forecast values especially for peaks than the bottom-up method. 255 | # It looked like the top-level forecasts affected more than the bottom-level forecasts especially to the seasonal peak forecast values. 
256 | 257 | plot(fc_visnights_arima_opt, levels = 2, color_lab = TRUE) 258 | plot(fc_visnights_arima, levels = 2, color_lab = TRUE) 259 | # For some categories, the forecasts became bigger when optimally reconciled method was used while for the others, the forecasts became smaller. 260 | # The increase and decrease were different by each category because of the affiliation differences. 261 | 262 | ``` 263 | 264 | 6. Define as a test-set the last two years of the visnights Australian domestic tourism data. Generate, bottom-up, top-down and optimally reconciled forecasts for this period and compare their forecasts accuracy. 265 | 266 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question6} 267 | 268 | tourism.hts.train <- window(tourism.hts, end=c(2014,4)) 269 | tourism.hts.test <- window(tourism.hts, start=2015) 270 | 271 | fc_visnights_ets_opt = forecast( 272 | tourism.hts.train, h = 8, 273 | method = "comb", weights = "wls", fmethod = "ets" 274 | ) 275 | fc_visnights_ets_bu = forecast( 276 | tourism.hts.train, h = 8, 277 | method = "bu", fmethod = "ets" 278 | ) 279 | 280 | tab <- matrix(NA, ncol = 4, nrow = 4) 281 | rownames(tab) <- c("Total", "State", "Bottom", "All series") 282 | colnames(tab) <- c("Bottom-up MAPE", "Bottom-up MASE", "Optimal MAPE", "Optimal MASE") 283 | # I'll use MAPE and MASE as evaluation method. 284 | 285 | tab[1,] <- c( 286 | accuracy.gts( 287 | fc_visnights_ets_bu, 288 | tourism.hts.test, 289 | levels = 0 290 | )[c("MAPE","MASE"),"Total"], 291 | accuracy.gts( 292 | fc_visnights_ets_opt, 293 | tourism.hts.test, 294 | levels = 0 295 | )[c("MAPE","MASE"),"Total"] 296 | ) 297 | 298 | j=2 299 | for(i in c(1:2)){ 300 | tab[j,] <- c( 301 | mean(accuracy.gts( 302 | fc_visnights_ets_bu, 303 | tourism.hts.test, 304 | levels = i)["MAPE",]), 305 | mean(accuracy.gts( 306 | fc_visnights_ets_bu, 307 | tourism.hts.test, 308 | levels = i)["MASE",]), 309 | mean(accuracy.gts( 310 | fc_visnights_ets_opt, 311 | tourism.hts.test, 312 | levels = i)["MAPE",]), 313 | mean(accuracy.gts( 314 | fc_visnights_ets_opt, 315 | tourism.hts.test, 316 | levels = i)["MASE",]) 317 | ) 318 | j=j+1 319 | } 320 | 321 | tab[4,] <- c( 322 | mean(accuracy.gts( 323 | fc_visnights_ets_bu, 324 | tourism.hts.test 325 | )["MAPE",]), 326 | mean(accuracy.gts( 327 | fc_visnights_ets_bu, 328 | tourism.hts.test 329 | )["MASE",]), 330 | mean(accuracy.gts( 331 | fc_visnights_ets_opt, 332 | tourism.hts.test 333 | )["MAPE",]), 334 | mean(accuracy.gts( 335 | fc_visnights_ets_opt, 336 | tourism.hts.test 337 | )["MASE",]) 338 | ) 339 | 340 | knitr::kable(tab, digits=2, booktabs=TRUE) 341 | # all evaluation results show that forecasts of optiamlly reconciled method were more accurate than the bottom-up forecasts. 342 | 343 | plot(fc_visnights_ets_opt, levels = 0) 344 | plot(fc_visnights_ets_bu, levels = 0) 345 | plot(tourism.hts, level = 0) 346 | # can't see increasing trend in the forecasts of optimally reconciled method and the bottom-up forecasts. Bottom-up forecasts are lowest. 347 | 348 | ``` 349 | 350 | ### Question 2 isn't related with coding that I didn't include it in here. -------------------------------------------------------------------------------- /Chapter11.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 11 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | library(vars) 7 | library(xlsx) 8 | 9 | # load foreach and doParallel packages to do parallel computing. 
10 | # I can do parallel computation using %dopar% binary operator in foreach package with parallel backend in doParallel package. I can get result far faster than when I used loop. 11 | # If I use %do% instead of %dopar%, I can't use parallel computation even if I don't need to designate '.packages' option in foreach function. 12 | # '.packages' option specifies the required R package to be loaded to use the function that I want to use repeatedly. 13 | # https://www.r-statistics.com/tag/r-parallel-computation/ 14 | # This R version can't use doSMP package. Therefore I chose to use foreach and doParallel packages. 15 | library(foreach) 16 | library(doParallel) 17 | workers <- makeCluster(4) # My computer has 4 cores 18 | registerDoParallel(workers) 19 | 20 | ``` 21 | 22 | 1. Use the tbats() function to model your retail time series. 23 | 24 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question1} 25 | 26 | # a. Check the residuals and produce forecasts. 27 | retail <- read.xlsx("retail.xlsx", 28 | sheetIndex = 1, 29 | startRow = 2) 30 | retail.ts <- ts(retail[,"A3349873A"], 31 | frequency=12, 32 | start=c(1982,4)) 33 | 34 | retail_tbats <- tbats(retail.ts) 35 | checkresiduals(retail_tbats) 36 | # The residuals aren't like white noise. And they are right skewed. 37 | 38 | fc_retail_tbats <- forecast(retail_tbats, h = 36) 39 | autoplot(fc_retail_tbats) 40 | 41 | # test accuracy using future data. 42 | retail.new <- read.xlsx("8501011.xlsx", 43 | sheetName = "Data1", 44 | startRow = 10) 45 | retail.new.ts <- ts(retail.new[, "A3349873A"], 46 | start = c(1982, 4), 47 | frequency = 12) 48 | retail.new.test <- subset( 49 | retail.new.ts, 50 | start = length(retail.ts) + 1 51 | ) 52 | 53 | accuracy(fc_retail_tbats, retail.new.test) 54 | # TBATS model was worse than Holt-Winters' or dynamic regression model. It was better than ARIMA model or seasonal naive method. 55 | 56 | # b. Does this completely automated approach work for these data? 57 | # Judging from the forecasts shown in plot, it looked like that TBATS model worked for these data well. 58 | 59 | # c. Have you saved any degrees of freedom by using Fourier terms rather than seasonal differencing? 60 | retail_tbats 61 | # TBATS(0.126, {5, 0}, 1, {<12, 5>}) 62 | # TBATS(lambda, {p, q}, damping, {}) 63 | # alpha - level(lt) smoothing parameter 64 | # beta - trend(bt) smoothing parameter 65 | # gamma1 - seasonal level(sj,t) smoothing parameter 66 | # gamma2 - seasonal growth(s*j,t) smoothing parameter 67 | # j - Fourier term(j times of base frequency) 68 | # Seed states - initial state variables. In this case, there are 17. Maybe first and second of them are l0 and b0, 3rd to 7th are s1,0 to s5,0, 8th to 12th are s*1,0 to s*5,0, 13th to 17th are d-4 to d0. d is disturbance, and it can be thought of as white noise. d-4 to d0 are all 0s because errors before the first time point can be assumed as 0. 69 | # When I used checkresiduals function, I could know that the degrees of freedom of the model are 27. Maybe 2 are related with level and trend, 5 are related with AR(5) and the rest 20 are related with seasonality. 20 = 2(smoothing parameters) x 2(cos, sin) x 5(number of terms). 70 | 71 | # When I used ARIMA model for these data, ARIMA(1, 0, 2)(0, 1, 1)[12] with drift model was chosen. This model used seasonal differencing to model seasonal component. 72 | # When I used checkresiduals function, I could know that the degrees of freedom of the model are 5. Maybe 1 for AR(1), 2 for MA(2), 1 for seasonal MA(1) and final 1 for drift coefficient. 
73 | 74 | # When just compared degrees of freedom related with seasonal component, tbats model saved 18 dofs. Having more dofs means more flexibility. TBATS model got more dofs, therefore the model can deal with more number of different seasonalities at a time. 75 | 76 | ``` 77 | 78 | 79 | 2. Consider the weekly data on US finished motor gasoline products supplied (thousands of barrels per day) (series gasoline): 80 | 81 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question2} 82 | 83 | # a. Fit a TBATS model to these data. 84 | gasoline_tbats <- tbats(gasoline) 85 | 86 | # b. Check the residuals and produce forecasts. 87 | checkresiduals(gasoline_tbats) 88 | # The residuals aren't like white noise. 89 | 90 | fc_gasoline_tbats <- forecast(gasoline_tbats) 91 | autoplot(fc_gasoline_tbats) 92 | # It looked like TBATS model isn't fitted well. 93 | 94 | # c. Could you model these data using any of the other methods we have considered in this book? 95 | # I think that as I did in Question 4 of Chapter 9, dynamic regression model is the best for the data. I think so because regression can deal with the piecewise trends in the data, and ARIMA model can be fitted for residuals well. 96 | 97 | ``` 98 | 99 | 100 | 3. Experiment with using nnetar() on your retail data and other data we have considered in previous chapters. 101 | 102 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question3} 103 | 104 | retail_nnetar <- nnetar( 105 | retail.ts, lambda = BoxCox.lambda(retail.ts) 106 | ) 107 | fc_retail_nnetar <- forecast(retail_nnetar, h = 36) 108 | autoplot(fc_retail_nnetar) 109 | 110 | # test accuracy using future data. 111 | accuracy(fc_retail_nnetar, retail.new.test) 112 | # It is better than all methods I tried so far, including Holt-Winters'. 113 | 114 | # experiment with ibmclose data. 115 | ibmclose_nnetar <- nnetar(ibmclose) 116 | fc_ibmclose_nnetar <- forecast(ibmclose_nnetar) 117 | autoplot(fc_ibmclose_nnetar) 118 | # Even neural network method yielded naive-method like result. It looked like there wan't any rule in lagged values. 119 | 120 | # experiment with usmelec data. 121 | usmelec_nnetar <- nnetar( 122 | usmelec, lambda = BoxCox.lambda(usmelec) 123 | ) 124 | 125 | fc_usmelec_nnetar <- forecast( 126 | usmelec_nnetar, h = 12*4 127 | ) 128 | 129 | autoplot(fc_usmelec_nnetar) 130 | 131 | # get the latest figures 132 | usmelec.new <- read.csv("MER_T07_02A.csv", sep = ",") 133 | usmelec.new[, "Year"] <- as.numeric(substr(usmelec.new[, "YYYYMM"], 1, 4)) 134 | usmelec.new[, "Month"] <- as.numeric( 135 | substr(usmelec.new[, "YYYYMM"], 5, 6) 136 | ) 137 | usmelec.new <- subset( 138 | usmelec.new, 139 | Description == "Electricity Net Generation Total, All Sectors", 140 | select = c("Year", "Month", "Value") 141 | ) 142 | usmelec.new <- subset(usmelec.new, Month != 13) 143 | usmelec.new[, "Value"] <- as.numeric( 144 | as.character(usmelec.new[, "Value"]) 145 | )/1000 146 | usmelec.new.ts <- ts( 147 | as.numeric(usmelec.new[, "Value"]), 148 | start = c(1973, 1), 149 | frequency = 12 150 | ) 151 | 152 | # get accuracy 153 | accuracy(fc_usmelec_nnetar, usmelec.new.ts) 154 | # Without ME and MPE, all the other errors show that neural network model is better than ARIMA model to forecast the usmelec data. 155 | 156 | autoplot(fc_usmelec_nnetar) + 157 | autolayer(window(usmelec.new.ts, start = c(2013, 7))) + 158 | scale_x_continuous(limits = c(2010, 2019)) + 159 | scale_y_continuous(limits = c(250, 450)) 160 | # It looked like neural network model was fitted well to the data. 
161 | 162 | ``` 163 | 164 | ## Deal with complex seasonality in data example 165 | 166 | ```{r echo=FALSE, message=FALSE, warning=FALSE, complex_seasonality} 167 | 168 | # plot calls data 169 | p1 <- autoplot(calls) + 170 | ylab("Call volume") + 171 | xlab("Weeks") + 172 | scale_x_continuous(breaks=seq(1,33,by=2)) 173 | 174 | p2 <- autoplot(window(calls, end=4)) + 175 | ylab("Call volume") + 176 | xlab("Weeks") + 177 | scale_x_continuous(minor_breaks = seq(1,4,by=0.2)) 178 | 179 | gridExtra::grid.arrange(p1,p2) 180 | 181 | # 1. STL with multiple seasonal periods(Use mstl function) 182 | # - used development version of forecast package to use mstl function(2018/2/22). Run below 3 rows of codes without #s to download it. 183 | # install.packages("devtools") 184 | # library(devtools) 185 | # devtools::install_github("robjhyndman/forecast") 186 | calls %>% 187 | mstl() %>% 188 | autoplot() 189 | 190 | # 2. Dynamic harmonic regression with multiple seasonal periods(Use auto.arima function) 191 | calls_autoarima <- auto.arima( 192 | # model seasonal component using Fourier terms, not using ARIMA model(set seasonal option as FALSE). 193 | calls, seasonal=FALSE, lambda=0, 194 | # For K in fourier function, use vector instead of a number because calls data are msts(multi-seasonal time series). There are 2 seasonal frequencies in the data that the vector needs to specify 2 numbers of Fourier terms. 195 | xreg=fourier(calls, K=c(10,10)) 196 | ) 197 | 198 | fc_calls_autoarima <- forecast( 199 | calls_autoarima, 200 | xreg=fourier(calls, K=c(10,10), h=2*169) 201 | ) 202 | 203 | autoplot(fc_calls_autoarima, include=5*169) + 204 | ylab("Call volume") + 205 | xlab("Weeks") 206 | 207 | # 3. TBATS model(Use tbats function) 208 | # - One drawback of TBATS models is that they can be very slow to estimate, especially with long time series. So I will consider a subset of the calls data to save time. 209 | calls_tbats <- calls %>% 210 | subset(start=length(calls)-2000) %>% 211 | tbats() 212 | 213 | fc_calls_tbats <- forecast(calls_tbats, h=2*169) 214 | 215 | autoplot(fc_calls_tbats, include=5*169) + 216 | ylab("Call volume") + xlab("Weeks") 217 | 218 | # 4. Dynamic harmonic regression with multiple seasonal periods and covariates(Use auto.arima function) 219 | # I'll use elecdemand data in this case because I can use several time series in the data as covariates. 220 | elecdemand %>% 221 | as.data.frame %>% 222 | ggplot(aes(x=Temperature, y=Demand)) + 223 | geom_point() + 224 | xlab("Temperature (degrees Celsius)") + 225 | ylab("Demand (GW)") 226 | 227 | cooling <- pmax(elecdemand[,"Temperature"], 18) 228 | 229 | elecdemand_dreg <- auto.arima( 230 | elecdemand[,"Demand"], 231 | # To forecast total electricity demand, use 2 time series related with temperature as covariates. 232 | xreg = cbind(fourier(elecdemand, c(10,10,0)), 233 | heating=elecdemand[,"Temperature"], 234 | cooling=cooling) 235 | ) 236 | 237 | # I'll do scenario forecasting. Therefore I'll use a repeat of the last two days of temperatures to generate future possible demand values. 
238 | temps.new <- subset( 239 | elecdemand[, c(1:3)], 240 | start=NROW(elecdemand)-2*48+1 241 | ) 242 | 243 | cooling.new <- pmax(temps.new, 18) 244 | 245 | # forecast temperature using Fourier terms 246 | fc_elecdemand_dreg <- forecast( 247 | elecdemand_dreg, 248 | xreg=cbind(fourier(temps.new, c(10,10,0)), 249 | heating=temps.new, 250 | cooling=pmax(cooling.new,18)) 251 | ) 252 | 253 | autoplot(fc_elecdemand_dreg, include=14*48) 254 | checkresiduals(fc_elecdemand_dreg) 255 | # Although the short-term forecasts look reasonable, this is a very crude model for a complicated process. 256 | 257 | ``` 258 | 259 | ## Use VAR model for forecasting 260 | 261 | ```{r echo=FALSE, message=FALSE, warning=FALSE, VAR_model} 262 | 263 | # make a VAR model for forecasting US consumption 264 | VARselect( 265 | uschange[, 1:2], lag.max=8, type="const" 266 | )[["selection"]] 267 | # There is a large discrepancy between the VAR(5) selected by the AIC and the VAR(1) selected by the BIC. As a result I'm going to fit a VAR(1) first, as selected by the BIC. And then I'll increase lag order one by one to get a model whose residuals are uncorrelated. 268 | 269 | # used consumption and income as variables. 270 | uschange_var1 <- VAR(uschange[,1:2], p=1, type="const") 271 | summary(uschange_var1) 272 | 273 | # computes the multivariate asymptotic Portmanteau test for serially correlated errors. 274 | serial.test( 275 | uschange_var1, lags.pt=10, type="PT.asymptotic" 276 | ) 277 | # p value is more than 0.05. This model can be used. 278 | 279 | uschange_var2 <- VAR(uschange[,1:2], p=2, type="const") 280 | summary(uschange_var2) 281 | 282 | serial.test( 283 | uschange_var2, lags.pt=10, type="PT.asymptotic" 284 | ) 285 | # p value is less than 0.05. Try 1 higher order. 286 | 287 | uschange_var3 <- VAR(uschange[,1:2], p=3, type="const") 288 | summary(uschange_var3) 289 | 290 | serial.test( 291 | uschange_var3, lags.pt=10, type="PT.asymptotic" 292 | ) 293 | # p value is more than 0.05. This model can be used. 294 | 295 | # try making model using production variable too. 296 | VARselect( 297 | uschange[, 1:3], lag.max=8, type="const" 298 | )[["selection"]] 299 | # try VAR(1) model first. 300 | 301 | uschange_var1.v3 <- VAR( 302 | uschange[,1:3], p=1, type="const" 303 | ) 304 | summary(uschange_var1.v3) 305 | 306 | serial.test( 307 | uschange_var1.v3, lags.pt=10, type="PT.asymptotic" 308 | ) 309 | # p value is less than 0.05. Try 1 higher order model. 310 | 311 | uschange_var2.v3 <- VAR( 312 | uschange[,1:3], p=2, type="const" 313 | ) 314 | summary(uschange_var2.v3) 315 | 316 | serial.test( 317 | uschange_var2.v3, lags.pt=10, type="PT.asymptotic" 318 | ) 319 | # p value is less than 0.05. Try 1 higher order model. 320 | 321 | uschange_var3.v3 <- VAR( 322 | uschange[,1:3], p=3, type="const" 323 | ) 324 | summary(uschange_var3.v3) 325 | 326 | serial.test( 327 | uschange_var3.v3, lags.pt=10, type="PT.asymptotic" 328 | ) 329 | # p value is more than 0.05. This model can be used. 330 | 331 | # forecast using usable models. 332 | forecast(uschange_var1) %>% autoplot() 333 | forecast(uschange_var3) %>% autoplot() 334 | forecast(uschange_var3.v3) %>% autoplot() 335 | # It looked like VAR(3) model is a little better than VAR(1). 336 | 337 | ``` 338 | 339 | ## Get prediction interval of Neural Network model using simulations or PI option. 340 | 341 | ```{r echo=FALSE, message=FALSE, warning=FALSE, nnetar_PI} 342 | 343 | # simulation of 9 possible future sample paths for the sunspot data. 
344 | sunspotarea_nnetar <- nnetar(sunspotarea, lambda=0) 345 | 346 | sim <- ts( 347 | matrix(0, nrow=30L, ncol=9L), 348 | start=end(sunspotarea)[1L]+1L 349 | ) 350 | 351 | for(i in seq(9)){ 352 | sim[,i] <- simulate(sunspotarea_nnetar, nsim=30L) 353 | } 354 | 355 | autoplot(sunspotarea) + autolayer(sim) 356 | 357 | # if PI=TRUE option is used in forecast.nnetar function, prediction interval calculated by simulations can be shown. But it takes more time to do the option. 358 | forecast(sunspotarea_nnetar, PI=TRUE, h=30) %>% 359 | autoplot() 360 | 361 | # try using ibmclose data. 362 | forecast(nnetar(ibmclose), PI=TRUE, h=30) %>% 363 | autoplot() 364 | # even neural network model couldn't yield meaningful forecasts. 365 | 366 | ``` 367 | 368 | ## Bootstrapping and Bagging 369 | 370 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Bootstrapping_and_Bagging} 371 | 372 | # Bootstrap the residuals of debitcards time series in order to simulate future values of the series using a model. 373 | # bld.mbb.bootstrap function - Box-Cox And Loess-Based Decomposition with Moving Block Bootstrap. 374 | # generate 10 bootstrapped versions. 375 | bootseries <- bld.mbb.bootstrap(debitcards, 10) %>% 376 | as.data.frame %>% 377 | ts(start=2000, frequency=12) 378 | 379 | autoplot(debitcards) + 380 | autolayer(bootseries, colour=TRUE) + 381 | autolayer(debitcards, colour=FALSE) + 382 | ylab("Bootstrapped series") + 383 | guides(colour="none") 384 | 385 | # Get prediction intervals from bootstrapped series 386 | # generate 1000 bootstrapped versions. 387 | nsim <- 1000L 388 | sim <- bld.mbb.bootstrap(debitcards, nsim) 389 | 390 | # For each of these series, I'm going to fit an ETS model and simulate one sample path from that model. The estimated parameters will be different. And the point forecasts will be different, too. 391 | h <- 36L 392 | debitcards.future <- matrix(0, nrow=nsim, ncol=h) 393 | 394 | # put each simulation's point forecasts in each row of debitcards.future matrix. 395 | # I can get debitcards.future much faster by using parallel computing. 396 | debitcards.future <- foreach( 397 | i = 1:1000, 398 | .packages = 'forecast' 399 | ) %dopar% 400 | simulate( 401 | ets(sim[[i]]), nsim = h 402 | ) %>% 403 | unlist() 404 | 405 | dim(debitcards.future) <- c(1000, 36) 406 | 407 | # I can get debitcards.future by running below loop. 408 | # But it takes lots of time to run it. 409 | #for(i in seq(nsim)){ 410 | # debitcards.future[i,] <- simulate( 411 | # ets(sim[[i]]), nsim=h 412 | # ) 413 | #} 414 | 415 | # I can make debitcards.future using map function of purrr package, too. 416 | # But it also takes lots of time to run the function. 417 | #debitcards.future <- purrr::map( 418 | # as.list(bld.mbb.bootstrap(debitcards, 1000L)), 419 | # function(x){forecast(ets(x), h=h)$mean} 420 | # ) %>% 421 | # # change the class from list to vector. 422 | # unlist() 423 | # change the class from vector to matrix. I couldn't get 1000 x 36 matrix by directly change the class to matrix from list. 424 | # Therefore I made the format as vector first and then change it as matrix designating the number of rows and columns. 425 | # dim(debitcards.future) <- c(1000, 36) 426 | 427 | # Finally, take the means and quantiles of these simulated sample paths to form point forecasts and prediction intervals. 428 | # tsp function extract start time, end time and the frequency of data. 429 | # h.start = first time point of forecast horizon. 
430 | h.start <- tsp(debitcards)[2] + 1/12 431 | 432 | # calculate each column's mean, lower limit and upper limit. 433 | fc_debitcards_sim <- structure( 434 | list( 435 | mean = ts( 436 | colMeans(debitcards.future), 437 | start=h.start, 438 | frequency=12 439 | ), 440 | lower = ts( 441 | apply(debitcards.future, 2, quantile, prob=0.025), 442 | start=h.start, 443 | frequency=12 444 | ), 445 | upper = ts( 446 | apply(debitcards.future, 2, quantile, prob=0.975), 447 | start=h.start, 448 | frequency=12 449 | ), 450 | level=95), 451 | class="forecast") 452 | 453 | # get point forecasts and PI using forecast function. 454 | fc_debitcards_ets <- forecast(ets(debitcards), h=h) 455 | 456 | # plot the results 457 | autoplot(debitcards) + 458 | ggtitle("Monthly retail debit card usage in Iceland") + 459 | xlab("Year") + ylab("million ISK") + 460 | autolayer(fc_debitcards_sim, series="Simulated") + 461 | autolayer(fc_debitcards_ets, series="ETS") 462 | # Can see that the PI of bootstraped simulations yielded larger interval than the one obtained from an ETS model applied directly to the original data. 463 | 464 | # Bagging(Bootstrap AGGregatING) 465 | # - produce forecasts from each of the additional bootstrapped time series, and average the resulting forecasts. 466 | # We can get better forecasts than if we simply forecast the original time series directly. 467 | # fc_debitcards_sim.sets can be got by running parallel computing, too. 468 | # It is a little faster than purrr's map function. 469 | fc_debitcards_sim.sets <- foreach( 470 | i = 1:10, 471 | .packages = 'forecast' 472 | ) %dopar% 473 | forecast( 474 | ets(bootseries[[i]]) 475 | )$mean %>% 476 | # map function returns a list. Therefore to make it as ts object, transform it to data.frame first. 477 | as.data.frame() %>% 478 | ts(frequency=12, start=h.start) 479 | 480 | # Getting fc_debitcards_sim.sets by using map function. 481 | #fc_debitcards_sim.sets <- purrr::map( 482 | # as.list(bootseries), 483 | # function(x){forecast(ets(x))$mean} 484 | # ) %>% 485 | # map function returns a list. Therefore to make it as ts object, transform it to data.frame first. 486 | # as.data.frame() %>% 487 | # ts(frequency=12, start=h.start) 488 | 489 | ## Comparison of variables. 490 | # fc_debitcards_sim : ETS modeling for each 1000 bootstrapped series, generate 1000 sets of point forecasts. And then aggregate the 1000 sets by getting mean, lower and upper limits of the sets. 491 | # fc_debitcards_ets : ETS modeling from original data only, generate one set of point forecasts. 492 | # fc_debitcards_ets.mean : ETS modeling for each 10 bootstrapped series, generate 10 sets of point forecasts. 493 | 494 | 495 | # get aggregated mean forecasts from fc_debitcards_sim.sets. 496 | fc_debitcards_sim2 <- structure( 497 | list( 498 | mean = ts( 499 | # The rows of fc_debitcards_sim.sets are time points of forecast horizon. The columns are bootstrapped versions. Therefore get means for each row. 500 | rowMeans(fc_debitcards_sim.sets), 501 | start=h.start, 502 | frequency=12 503 | ) 504 | ), 505 | class="forecast") 506 | 507 | # plot the results. 508 | autoplot(debitcards) + 509 | autolayer(bootseries, colour=TRUE) + 510 | autolayer(fc_debitcards_sim.sets, colour=TRUE) + 511 | autolayer(debitcards, colour=FALSE) + 512 | autolayer(fc_debitcards_sim2$mean, colour=FALSE) + 513 | ylab("Bootstrapped series") + 514 | guides(colour="none") 515 | 516 | # But the whole procedure can be handled with the baggedETS function. 
517 | fc_debitcards_ets <- debitcards %>% 518 | ets %>% 519 | forecast(h=36) 520 | fc_debitcards_bagged.ets <- debitcards %>% 521 | baggedETS %>% 522 | forecast(h=36) 523 | 524 | autoplot(debitcards) + 525 | autolayer(fc_debitcards_bagged.ets$mean, 526 | series="BaggedETS") + 527 | autolayer(fc_debitcards_ets$mean, series="ETS") + 528 | guides(colour=guide_legend(title="Forecasts")) 529 | # In this case, the forecast difference of the 2 functions is small. 530 | # By default, 100 bootstrapped series are used. And on average, baggedets gives better forecasts than just applying ets directly. But it is slower because a lot more computation is required. 531 | # baggedModel function can be used instead. fn option can be designated as ets or auto.arima. 532 | 533 | ``` 534 | 535 | -------------------------------------------------------------------------------- /Chapter12.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 12 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | 7 | # I can do parallel computation using %dopar% binary operator in foreach package with parallel backend in doParallel package. I can get result far faster than when I used loop. 8 | # If I use %do% instead of %dopar%, I can't use parallel computation even if I don't need to designate '.packages' option in foreach function. '.packages' option specifies the required R package to be loaded to use the function that I want to use repeatedly. 9 | # https://www.r-statistics.com/tag/r-parallel-computation/ 10 | # This R version can't use doSMP package. Therefore I chose to use foreach and doParallel packages. 11 | library(foreach) 12 | library(doParallel) 13 | workers <- makeCluster(4) # My computer has 4 cores 14 | registerDoParallel(workers) 15 | 16 | ``` 17 | 18 | ## Forecast combinations examples 19 | 20 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Forecast_combinations} 21 | 22 | # It is an example using auscafe data. They have information about monthly expenditure on eating out in Australia, from April 1982 to September 2017. I'm going to get forecasts from the following models: ETS, ARIMA, STL-ETS, NNAR, and TBATS. And we compare the results using the last 5 years (60 months) of observations. 23 | auscafe.train <- window(auscafe, end=c(2012,9)) 24 | h <- length(auscafe) - length(auscafe.train) 25 | 26 | # forecast using several models. 
27 | auscafe_ETS <- forecast(ets(auscafe.train), h=h) 28 | auscafe_ARIMA <- forecast( 29 | auto.arima(auscafe.train, lambda=0, biasadj=TRUE), h=h 30 | ) 31 | auscafe_STL <- stlf( 32 | auscafe.train, lambda=0, h=h, biasadj=TRUE 33 | ) 34 | auscafe_NNAR <- forecast(nnetar(auscafe.train), h=h) 35 | auscafe_TBATS <- forecast( 36 | tbats(auscafe.train, biasadj=TRUE), h=h 37 | ) 38 | auscafe_Combination <- ( 39 | auscafe_ETS$mean + 40 | auscafe_ARIMA$mean + 41 | auscafe_STL$mean + 42 | auscafe_NNAR$mean + 43 | auscafe_TBATS$mean 44 | )/5 45 | 46 | # plot the result 47 | autoplot(auscafe) + 48 | autolayer(auscafe_ETS$mean, series="ETS") + 49 | autolayer(auscafe_ARIMA$mean, series="ARIMA") + 50 | autolayer(auscafe_STL$mean, series="STL") + 51 | autolayer(auscafe_NNAR$mean, series="NNAR") + 52 | autolayer(auscafe_TBATS$mean, series="TBATS") + 53 | autolayer(auscafe_Combination, series="Combination") + 54 | xlab("Year") + ylab("$ billion") + 55 | ggtitle("Australian monthly expenditure on eating out") 56 | 57 | # get accuracy for each model 58 | c( 59 | ETS=accuracy( 60 | auscafe_ETS, auscafe 61 | )["Test set","RMSE"], 62 | ARIMA=accuracy( 63 | auscafe_ARIMA, auscafe 64 | )["Test set","RMSE"], 65 | `STL-ETS`=accuracy( 66 | auscafe_STL, auscafe 67 | )["Test set","RMSE"], 68 | NNAR=accuracy( 69 | auscafe_NNAR, auscafe 70 | )["Test set","RMSE"], 71 | TBATS=accuracy( 72 | auscafe_TBATS, auscafe 73 | )["Test set","RMSE"], 74 | Combination=accuracy( 75 | auscafe_Combination, auscafe 76 | )["Test set","RMSE"] 77 | ) 78 | 79 | # Measured by test-set RMSE, the combination of models yielded the lowest error. TBATS did particularly well with this series, but the combination approach was even better. 80 | # The combination approach generally improves forecast accuracy. 81 | 82 | ``` 83 | 84 | 85 | ## Dealing with weekly data example 86 | 87 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Weekly_data} 88 | 89 | # The simplest approach is to use an STL decomposition for the seasonal component along with a non-seasonal method applied to the seasonally adjusted data. 90 | gasoline %>% stlf() %>% autoplot() 91 | 92 | # An alternative approach is to use a dynamic harmonic regression model. An example using the gasoline data again. 93 | # Use parallel computing to choose the number of Fourier pairs that yields the smallest AICc value. 94 | gasoline_aiccs <- foreach( 95 | i = 1:26, 96 | .packages = 'fpp2' 97 | ) %dopar% 98 | auto.arima( 99 | gasoline, 100 | xreg=fourier(gasoline, K=i), 101 | seasonal=FALSE 102 | )$aicc %>% 103 | unlist() 104 | 105 | gasoline_K.best <- which( 106 | gasoline_aiccs == min(gasoline_aiccs) 107 | ) 108 | gasoline_dreg.best <- auto.arima(gasoline, xreg=fourier(gasoline, K=gasoline_K.best), seasonal=FALSE) # refit the model with the chosen number of Fourier pairs so that it can be used for forecasting below 109 | # Using a for-loop to get the number of Fourier pairs instead. 110 | #gasoline_dreg.best <- list(aicc=Inf) 111 | #for(K in seq(26)){ 112 | # gasoline_dreg <- auto.arima( 113 | # #substitute seasonal component with Fourier terms. 114 | # gasoline, 115 | # xreg=fourier(gasoline, K=K), 116 | # seasonal=FALSE 117 | # ) 118 | # 119 | # if(gasoline_dreg$aicc < gasoline_dreg.best$aicc) 120 | # { 121 | # gasoline_dreg.best <- gasoline_dreg 122 | # gasoline_K.best <- K 123 | # } 124 | #} 125 | 126 | # Forecast the next 2 years of data. If I assume that 1 year is about 52 weeks, the forecast horizon (h) is 104. 127 | fc_gasoline_dreg.best <- forecast( 128 | gasoline_dreg.best, 129 | xreg=fourier(gasoline, K=gasoline_K.best, h=104) 130 | ) 131 | 132 | autoplot(fc_gasoline_dreg.best) 133 | 134 | # A third approach is the TBATS model. This was the subject of Question 11.2.
135 | gasoline_tbats <- tbats(gasoline) 136 | 137 | checkresiduals(gasoline_tbats) 138 | # The residuals aren't like white noise. 139 | 140 | fc_gasoline_tbats <- forecast(gasoline_tbats) 141 | autoplot(fc_gasoline_tbats) 142 | # The forecasts from the above 3 methods are similar to each other. 143 | # In Question 11.2, I thought that the dynamic harmonic regression model would be best for these data because of the nonlinearity in the trend. 144 | 145 | # If there is a nonlinear trend or irregular seasonality in the data, a dynamic regression model with dummy variable(s) may be the only choice. This applies to daily or sub-daily data, too. 146 | 147 | ``` 148 | 149 | 150 | ## Dealing with time series of small counts 151 | 152 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Ts_of_small_counts} 153 | 154 | # With small counts, the decimal part of a forecast like 3.64 customers can matter, even though for a large forecast like 100.368 customers it rarely matters. 155 | 156 | # Croston's method. 157 | # Although this method does not properly deal with the count nature of the data, it is used so often that it is worth knowing about. 158 | # qi is the i-th non-zero quantity, and ai is the time between qi-1 and qi. q is often called the "demand" and a the "inter-arrival time". 159 | # Let j be the time of the last observed positive observation. Then the h-step-ahead forecast of the demand at time T+h is given by the ratio q(j+1|j) / a(j+1|j), where the one-step forecasts of q and a are obtained by simple exponential smoothing with parameter alpha. 160 | # The croston() function's default alpha value is 0.1. 161 | productC %>% croston() %>% autoplot() 162 | 163 | ``` 164 | 165 | ## Examples of how to ensure forecasts stay within limits 166 | 167 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Ensure_forecasts_stay_within_limits} 168 | 169 | # To impose a positivity constraint, simply work on the log scale by setting the Box-Cox parameter lambda to 0. 170 | eggs %>% 171 | ets(model="AAN", damped=FALSE, lambda=0) %>% 172 | forecast(h=50, biasadj=TRUE) %>% 173 | autoplot() 174 | 175 | # To make forecasts constrained to an interval, transform the data using a scaled logit transform which maps (lower limit, upper limit) to the whole real line: 176 | # y = log((x-a)/(b-x)), where a and b are the lower and upper limits, x is on the original scale, and y is the transformed data. 177 | # To reverse the transformation, I need to use 178 | # x = (b-a)e^y/(1+e^y)+a 179 | # eggs data example 180 | # set bounds 181 | a <- 50 182 | b <- 400 183 | 184 | # Transform data and fit model 185 | eggs_ets.aan.constrained <- log((eggs-a)/(b-eggs)) %>% 186 | ets(model="AAN", damped=FALSE) 187 | fc_eggs_ets.aan.constrained <- forecast( 188 | eggs_ets.aan.constrained, h=50 189 | ) 190 | 191 | # Back-transform forecasts 192 | fc_eggs_ets.aan.constrained$mean <- 193 | (b-a)*exp(fc_eggs_ets.aan.constrained$mean)/ 194 | (1+exp(fc_eggs_ets.aan.constrained$mean)) + a 195 | fc_eggs_ets.aan.constrained$lower <- 196 | (b-a)*exp(fc_eggs_ets.aan.constrained$lower)/ 197 | (1+exp(fc_eggs_ets.aan.constrained$lower)) + a 198 | fc_eggs_ets.aan.constrained$upper <- 199 | (b-a)*exp(fc_eggs_ets.aan.constrained$upper)/ 200 | (1+exp(fc_eggs_ets.aan.constrained$upper)) + a 201 | fc_eggs_ets.aan.constrained$x <- eggs 202 | 203 | # Plot result on original scale 204 | autoplot(fc_eggs_ets.aan.constrained) 205 | # As a result of this artificial (and unrealistic) constraint, the forecast distributions have become extremely skewed.
206 | # No bias-adjustment has been used here, so the forecasts are the medians of the future distributions. 207 | # The prediction intervals show 80% and 95% percentile error ranges when the forecasts are constrained. 208 | 209 | ``` 210 | 211 | 212 | ## Prediction intervals for aggregates 213 | 214 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Forecast_aggregate_of_time_periods} 215 | 216 | # For example: You may have daily data, fit a model using the data, and want to forecast the total for the next week(need to aggregate 7 days of forecasts). 217 | # If the point forecasts are means, then adding them up will give a good estimate of the total. 218 | 219 | # A general solution is to use simulations. 220 | # An example using ETS models applied to Australian monthly gas production data. 221 | # First fit a model to the data 222 | gas_ets <- ets(gas/1000) 223 | 224 | # Forecast six months ahead 225 | fc_gas_ets <- forecast(gas_ets, h=6) 226 | 227 | # Simulate 10000 future sample paths 228 | nsim <- 10000 229 | h <- 6 230 | 231 | # Use parallel computing to get sample paths and the sums of their forecasts. 232 | sim <- foreach( 233 | i = 1:nsim, 234 | .packages = 'forecast' 235 | ) %dopar% 236 | sum( 237 | simulate(gas_ets, future = TRUE, nsim = h) 238 | ) %>% 239 | unlist() 240 | 241 | # Use for-loop to get sample paths and the sums of their forecasts. 242 | #sim <- numeric(nsim) 243 | #for(i in seq_len(nsim)){ 244 | # # for each sample path, add 6 months' forecasts. 245 | # sim[i] <- sum(simulate(gas_ets, future=TRUE, nsim=h)) 246 | #} 247 | 248 | # get final aggregated forecast. 249 | gas_ets.sim.meanagg <- mean(sim) 250 | 251 | sum(fc_gas_ets$mean[1:6]) 252 | gas_ets.sim.meanagg 253 | # The results from above 2 methods are similar to each other. 254 | 255 | # get prediction intervals. 256 | #80% interval: 257 | quantile(sim, prob=c(0.1, 0.9)) 258 | #95% interval: 259 | quantile(sim, prob=c(0.025, 0.975)) 260 | 261 | ``` 262 | 263 | ## Backcasting 264 | 265 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Backcasting} 266 | 267 | # Backcast = forecast in reverse time 268 | # Function to reverse time 269 | reverse_ts <- function(y) 270 | { 271 | # reverse the data setting the start time same. 272 | ts(rev(y), start=tsp(y)[1L], frequency=frequency(y)) 273 | } 274 | 275 | # Function to reverse a forecast(object) 276 | reverse_forecast <- function(object){ 277 | # already forecasted with reversed time series. 278 | # This function reverses the forecast parts to the past of the time series. And also reverses the reversed time series to the original. 279 | h <- length(object$mean) 280 | f <- frequency(object$mean) 281 | object$x <- reverse_ts(object$x) 282 | object$mean <- ts(rev(object$mean), 283 | end=tsp(object$x)[1L]-1/f, 284 | frequency=f) 285 | object$lower <- object$lower[h:1L,] 286 | object$upper <- object$upper[h:1L,] 287 | return(object) 288 | } 289 | 290 | # Backcast example to quarterly retail trade in the Euro area data. 291 | euretail %>% 292 | reverse_ts() %>% 293 | auto.arima() %>% 294 | forecast() %>% 295 | reverse_forecast() -> bc_euretail 296 | 297 | autoplot(bc_euretail) + 298 | ggtitle(paste("Backcasts from", bc_euretail$method)) 299 | 300 | ``` 301 | 302 | 303 | ## How to decide the number of Fourier pairs for harmonic regression fastly. 
304 | 305 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Foreach_with_doParallel} 306 | 307 | aiccs <- foreach(i=1:26, .packages = "fpp2") %dopar% auto.arima( 308 | gasoline, 309 | xreg=fourier(gasoline, K=i), 310 | seasonal=FALSE 311 | )$aicc 312 | # A 'worker initialization failed' error can appear. I don't know when or why the error occurs, but after I closed and reopened RStudio, all the code in this chunk ran without any problem. 313 | # If I store the fitted models in aiccs instead, I can access each model's components with code like 'aiccs[[i]]$aicc'. 314 | 315 | # measure how much time is needed to get the AICc for the 26 different numbers of Fourier pairs. 316 | system.time( 317 | aiccs <- foreach( 318 | i=1:26, .packages = "fpp2" 319 | ) %dopar% 320 | auto.arima( 321 | gasoline, 322 | xreg=fourier(gasoline, K=i), 323 | seasonal=FALSE 324 | )$aicc 325 | ) 326 | # user: 0.03, system: 0.02, elapsed: 182.05 327 | # It took only about 3 minutes to run the code. When I used a loop, it took more than 20 minutes. 328 | # When I used 8 cores, the results were user: 0.08, system: 0.02, elapsed: 138.05 - faster still. 329 | # Explanations of the user, system and elapsed times: 330 | # https://stackoverflow.com/questions/5688949/what-are-user-and-system-times-measuring-in-r-system-timeexp-output 331 | 332 | # The foreach function returns a list by default, so I need the unlist function to get the minimum AICc value and its index (which equals the number of Fourier pairs). 333 | aicc.min <- min(unlist(aiccs)) 334 | K_aicc.min <- which(unlist(aiccs) == aicc.min) 335 | 336 | # fit the best model using the number of Fourier pairs that gave the minimum AICc. 337 | gasoline_dreg.best <- auto.arima( 338 | gasoline, 339 | xreg=fourier(gasoline, K=K_aicc.min), 340 | seasonal=FALSE 341 | ) 342 | 343 | # forecast using the best model. 344 | fc_gasoline_dreg.best <- forecast( 345 | gasoline_dreg.best, 346 | xreg=fourier(gasoline, K=K_aicc.min, h=104) 347 | ) 348 | 349 | autoplot(fc_gasoline_dreg.best) 350 | 351 | ``` 352 | 353 | ## Forecasting on training and test sets 354 | 355 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Forecasting_on_train_and_test_sets} 356 | 357 | # Typically, we compute one-step forecasts on the training data (the "fitted values") and multi-step forecasts on the test data. 358 | 359 | # Multi-step forecasting on training data 360 | # Use the fitted function's h argument. It allows for h-step "fitted values" on the training set. 361 | # An example using the auscafe data. 362 | auscafe.train <- subset( 363 | auscafe, end=length(auscafe)-61 364 | ) 365 | auscafe.test <- subset( 366 | auscafe, start=length(auscafe)-60 367 | ) 368 | 369 | auscafe_arima.2.1.1.0.1.2.12 <- Arima( 370 | auscafe.train, 371 | order=c(2,1,1), seasonal=c(0,1,2), 372 | lambda=0 373 | ) 374 | 375 | # typical one-step forecasts on the training set ('fitted values') and multi-step forecasts on the test set. 376 | fc_auscafe_arima.2.1.1.0.1.2.12 <- forecast( 377 | auscafe_arima.2.1.1.0.1.2.12, h = 60 378 | ) 379 | 380 | auscafe.train %>% 381 | forecast(h=60) %>% 382 | autoplot() + 383 | autolayer(auscafe.test) + 384 | autolayer(auscafe_arima.2.1.1.0.1.2.12$fitted) 385 | 386 | # 12-step forecasts on training data. 387 | autoplot(auscafe.train, series="Training data") + 388 | autolayer( 389 | fitted(auscafe_arima.2.1.1.0.1.2.12, h=12), 390 | series="12-step fitted values" 391 | ) 392 | 393 | # One-step forecasting on test data 394 | # In the above example, the forecast errors will be for 1 step, 2 steps, ..., 60 steps ahead.
395 | # The forecast variance usually increases with the forecast horizon. 396 | # So if I simply average the absolute or squared errors from the test set, I'm combining results with different variances. 397 | 398 | # To solve this issue, obtain 1-step errors on the test data. 399 | # This can be easily done by using model argument in model-making functions. 400 | # Just use training data to estimate parameters, and then apply the estimated model to test data. 401 | # The fitted values are one-step forecasting on test set even if they are looked like training set's fitted values. 402 | auscafe.test_arima.2.1.1.0.1.2.12 <- Arima( 403 | auscafe.test, model=auscafe_arima.2.1.1.0.1.2.12 404 | ) 405 | accuracy(auscafe.test_arima.2.1.1.0.1.2.12) 406 | 407 | ``` 408 | 409 | -------------------------------------------------------------------------------- /Chapter2.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 2 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | ``` 7 | 8 | 1. Use the help menu to explore what the series gold, woolyrnq and gas represent. These are available in the forecast package. 9 | 10 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question1} 11 | 12 | # See the structures of datas 13 | str(gold) 14 | str(woolyrnq) 15 | str(gas) 16 | 17 | # a. Use autoplot to plot each of these in separate plots. 18 | autoplot(gold) 19 | autoplot(woolyrnq) 20 | autoplot(gas) 21 | writeLines("") 22 | 23 | # b. What is the frequency of each commodity series? Hint: apply the frequency() function. 24 | print("Frequency") 25 | print("gold") 26 | frequency(gold) 27 | print("woolyrnq") 28 | frequency(woolyrnq) 29 | print("gas") 30 | frequency(gas) 31 | writeLines("") 32 | 33 | # c. Use which.max() to spot the outlier in the gold series. Which observation was it? 34 | print("When gold got maximum value?") 35 | which.max(gold) 36 | print("What was the gold's maximum value?") 37 | gold[which.max(gold)] 38 | ``` 39 | 40 | 41 | 2. Download the file tute1.csv from OTexts.org/fpp2/extrafiles/tute1.csv, open it in Excel (or some other spreadsheet application), and review its contents. You should find four columns of information. Columns B through D each contain a quarterly series, labelled Sales, AdBudget and GDP. Sales contains the quarterly sales for a small company over the period 1981-2005. AdBudget is the advertising budget and GDP is the gross domestic product. All series have been adjusted for inflation. 42 | 43 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question2} 44 | 45 | # a. You can read the data into R with the following script: 46 | 47 | tute1 <- read.csv("tute1.csv", header=TRUE) 48 | View(tute1) 49 | 50 | # b. Convert the data to time series 51 | 52 | mytimeseries <- ts(tute1[,-1], start=1981, frequency=4) 53 | 54 | # (The [,-1] removes the first column which contains the quarters as we don't need them now.) 55 | 56 | # c. Construct time series plots of each of the three series 57 | 58 | autoplot(mytimeseries, facets=TRUE) 59 | 60 | # Check what happens when you don't include facets=TRUE. 61 | 62 | autoplot(mytimeseries) 63 | 64 | ``` 65 | 66 | 67 | 3. Download some monthly Australian retail data from OTexts.org/fpp2/extrafiles/retail.xlsx. These represent retail sales in various categories for different Australian states, and are stored in a MS-Excel file. 68 | 69 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question3} 70 | 71 | # a. 
You can read the data into R with the following script: 72 | 73 | retaildata <- xlsx::read.xlsx("retail.xlsx", sheetIndex = 1, startRow = 2) 74 | View(retaildata) 75 | # retaildata <- readxl::read_excel("retail.xlsx", skip=1) 76 | 77 | # You may need to first install the readxl package. The second argument (skip=1) is required because the Excel sheet has two header rows. 78 | 79 | # b. Select one of the time series as follows (but replace the column name with your own chosen column): 80 | 81 | myts <- ts(retaildata[,"A3349873A"], frequency=12, start=c(1982,4)) 82 | 83 | # c. Explore your chosen retail time series using the following functions: autoplot, ggseasonplot, ggsubseriesplot, gglagplot, ggAcf 84 | #Can you spot any seasonality, cyclicity and trend? What do you learn about the series? 85 | 86 | autoplot(myts) 87 | ggseasonplot(myts) 88 | ggsubseriesplot(myts) 89 | gglagplot(myts, lags = 12) 90 | ggAcf(myts) 91 | 92 | # I can see seasonality and trend of the data 93 | 94 | ``` 95 | 96 | 4. Create time plots of the following times series: bicoal, chicken, dole, usdeaths, lynx, goog, writing, fancy, a10, h02. 97 | 98 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question4} 99 | 100 | # - Use help() to find out about the data in each series. 101 | help(bicoal) 102 | help(chicken) 103 | help(dole) 104 | help(usdeaths) 105 | help(lynx) 106 | help(goog) 107 | help(writing) 108 | help(fancy) 109 | help(a10) 110 | help(h02) 111 | 112 | # - For the goog plot, modify the axis labels and title. 113 | autoplot(goog) + 114 | ggtitle("Daily closing stock prices of Google Inc.") + 115 | xlab("Time") + 116 | ylab("Price(Unit: US$)") 117 | ``` 118 | 119 | 5. Use the ggseasonplot and ggsubseriesplot functions to explore the seasonal patterns in the following time series: writing, fancy, a10, h02. 120 | 121 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question5} 122 | 123 | # - What can you say about the seasonal patterns? 124 | # - Can you identify any unusual years? 125 | 126 | ggseasonplot(writing) 127 | ggsubseriesplot(writing) 128 | # The sales amount of paper falls down in August annually 129 | 130 | ggseasonplot(fancy) 131 | ggsubseriesplot(fancy) 132 | # In December, 1992 the monthly sales for a souvenir shop increased dramatically compared to the same month of the last year 133 | 134 | ggseasonplot(a10) 135 | ggsubseriesplot(a10) 136 | # The amount of antidiabetes monthly scripts falls down in February annually 137 | 138 | ggseasonplot(h02) 139 | ggsubseriesplot(h02) 140 | # The amount of corticosteroid monthly scripts also falls down in February annually 141 | 142 | ``` 143 | 144 | 145 | 6. Use the the following graphics functions: autoplot, ggseasonplot, ggsubseriesplot, gglagplot, ggAcf and explore features from the following time series: hsales, usdeaths, bricksq, sunspotarea, gasoline. 146 | 147 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question6} 148 | 149 | # - Can you spot any seasonality, cyclicity and trend? 150 | # - What do you learn about the series? 151 | 152 | autoplot(hsales) 153 | ggseasonplot(hsales) 154 | ggsubseriesplot(hsales) 155 | gglagplot(hsales) 156 | ggAcf(hsales, lag.max = 400) 157 | # can spot seasonality and cyclicity. 
The cycle period is about 4 years(100 months) 158 | 159 | autoplot(usdeaths) 160 | ggseasonplot(usdeaths) 161 | ggsubseriesplot(usdeaths) 162 | gglagplot(usdeaths) 163 | ggAcf(usdeaths, lag.max = 60) 164 | # can spot seasonality 165 | 166 | autoplot(bricksq) 167 | ggseasonplot(bricksq) 168 | ggsubseriesplot(bricksq) 169 | gglagplot(bricksq) 170 | ggAcf(bricksq, lag.max = 200) 171 | # can spot little seasonality and strong trend 172 | 173 | autoplot(sunspotarea) 174 | # ggseasonplot(sunspotarea) 175 | # not seasonal, can't draw it 176 | # ggsubseriesplot(sunspotarea) 177 | # not seasonal, useless to draw it 178 | gglagplot(sunspotarea) 179 | ggAcf(sunspotarea, lag.max = 50) 180 | # can spot strong cyclicity 181 | 182 | autoplot(gasoline) 183 | ggseasonplot(gasoline) 184 | # ggsubseriesplot(gasoline) 185 | # The number of weeks is 52 and it looked like it is too much for subseriesplot 186 | gglagplot(gasoline) 187 | ggAcf(gasoline, lag.max = 1000) 188 | # can spot seasonality and trend 189 | 190 | ``` 191 | 192 | 193 | 7. The arrivals data set comprises quarterly international arrivals (in thousands) to Australia from Japan, New Zealand, UK and the US. 194 | 195 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question7} 196 | 197 | # See the structure of arrivals 198 | str(arrivals) 199 | 200 | # - Use autoplot, ggseasonplot and ggsubseriesplot to compare the differences between the arrivals from these four countries. 201 | # - Can you identify any unusual observations? 202 | autoplot(arrivals) 203 | # The biggest number of arrivals came from New Zealand in 1980s. And the title owner country changed to Japan in 1990s and came back to UK in 2000s. 204 | # The arrival data of UK shows the biggest quarterly fluctuation. 205 | 206 | ggseasonplot(arrivals[, "Japan"]) 207 | ggseasonplot(arrivals[, "NZ"]) 208 | ggseasonplot(arrivals[, "UK"]) 209 | ggseasonplot(arrivals[, "US"]) 210 | 211 | ggsubseriesplot(arrivals[, "Japan"]) 212 | ggsubseriesplot(arrivals[, "NZ"]) 213 | ggsubseriesplot(arrivals[, "UK"]) 214 | ggsubseriesplot(arrivals[, "US"]) 215 | # The arrivals from Japan decrease a lot in 2nd quarter compared to the other quarteres. 216 | # The arrivals from New Zealand are highest in 3rd quarter and lowest in 1st quarter. 217 | # The arrivals from UK and US are low in 2nd and 3rd quarters and high in 1st and 4th quarters. 218 | 219 | ``` 220 | 221 | 222 | 9. The pigs data shows the monthly total number of pigs slaughtered in Victoria, Australia, from Jan 1980 to Aug 1995. Use mypigs <- window(pigs, start=1990) to select the data starting from 1990. Use autoplot and ggAcf for mypigs series and compare these to white noise plots from Figures 2.15 and 2.16. 223 | 224 | - Figure 2.15 225 | ![Fig 2.15 White Noise Time Series](https://www.otexts.org/fpp2/fpp_files/figure-html/wnoise-1.png) 226 | 227 | - Figure 2.16 228 | ![Fig 2.16 ACF of White Noise Time Series](https://www.otexts.org/fpp2/fpp_files/figure-html/wnoiseacf-1.png) 229 | 230 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question9} 231 | 232 | mypigs <- window(pigs, start=1990) 233 | str(mypigs) 234 | 235 | autoplot(mypigs) 236 | ggAcf(mypigs) 237 | # can find that 3 autocorrelation values were outside of bounds. Therefore mypigs isn't probably white noise. 238 | 239 | ``` 240 | 241 | 10. dj contains 292 consecutive trading days of the Dow Jones Index. Use ddj <- diff(dj) to compute the daily changes in the index. Plot ddj and its ACF. Do the changes in the Dow Jones Index look like white noise? 
242 | 243 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question10} 244 | 245 | ddj <- diff(dj) 246 | str(ddj) 247 | 248 | autoplot(ddj) 249 | ggAcf(ddj) 250 | # can find that substantially less than 5% of autocorrelation values were outside of bounds. Therefore ddj can be white noise. 251 | 252 | ``` 253 | 254 | -------------------------------------------------------------------------------- /Chapter3.rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JehyeonHeo/Forecasting_with_R_practices/0f884ec022e2d94738d74c9e59b7f29c3c613fab/Chapter3.rmd -------------------------------------------------------------------------------- /Chapter5.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 5 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | 7 | ``` 8 | 9 | 1. Daily electricity demand for Victoria, Australia, during 2014 is contained in elecdaily. The data for the first 20 days can be obtained as follows. 10 | 11 | daily20 <- head(elecdaily,20) 12 | 13 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question1} 14 | 15 | # I needed to make elecdaily data because there weren't elecdaily data when I solved this question at the first time. 16 | # I could get only elecdemand data, which have half-hourly electricity demand data for Victoria, Australia. 17 | # Even if I could get elecdaily data from the development version in github(https://github.com/robjhyndman/forecast), I left the part of codes for making elecdaily data. 18 | 19 | # Aggregate demand column by sum and workday and temperature columns by mean. Set nfrequency as 365 to aggregate data for each day. 20 | elecdaily <- ts.union( 21 | aggregate(elecdemand[, "Demand"], nfrequency = 365, FUN = sum), 22 | aggregate(elecdemand[, !colnames(elecdemand) %in% c("Demand")], nfrequency = 365, FUN = mean) 23 | ) 24 | # Need to change the names of columns after aggregating. 25 | colnames(elecdaily) <- colnames(elecdemand) 26 | 27 | # It will be easier to aggregate if I know the index of the columns that I want to remove. 28 | #elecdemand[, -1] 29 | #elecdemand[, -c(2,3)] 30 | 31 | daily20 <- head(elecdaily, 20) 32 | daily20 33 | 34 | # a. Plot the data and find the regression model for Demand with temperature as an explanatory variable. Why is there a positive relationship? 35 | autoplot(daily20) 36 | 37 | # Use tslm function to find the regression model 38 | tslm_Dem_Temp <- tslm(Demand ~ Temperature, data = daily20) 39 | tslm_Dem_Temp 40 | 41 | # There is a positive relationship between the two variables. It looked like it happened because of air conditioner and fan. It's likely that as temperature increased, people wanted to run them and it increased the demand of electricity 42 | 43 | # A scatter plot of Demand against Temperature is shown below with the estimated regression line. This graph shows the positive relationship a lot more clearly 44 | daily20 %>% 45 | as.data.frame() %>% 46 | ggplot(aes(x=Temperature, y=Demand)) + 47 | ylab("Electricity Demand") + 48 | xlab("Temperature") + 49 | geom_point() + 50 | geom_smooth(method="lm", se=FALSE) 51 | 52 | # b. Produce a residual plot. Is the model adequate? Are there any outliers or influential observations? 53 | checkresiduals(tslm_Dem_Temp$residuals) 54 | # I think that this model is adequate because residuals aren't correlated with each other. But there was an outlier. 55 | 56 | # c. 
Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15 and compare it with the forecast if the maximum temperature was 35. Do you believe these forecasts? 57 | fc_Dem_Temp <- forecast(tslm_Dem_Temp, 58 | newdata=data.frame(Temperature=c(15,35))) 59 | fc_Dem_Temp 60 | # I think that the model forecasted rightly because the forecasted temperature values were near to the range of temperatures in the data 61 | 62 | # d. Give prediction intervals for your forecasts. The following R code will get you started: 63 | # 80% intervals 64 | fc_Dem_Temp$upper[, 1] 65 | fc_Dem_Temp$lower[, 1] 66 | # 95% intervals 67 | fc_Dem_Temp$upper[, 2] 68 | fc_Dem_Temp$lower[, 2] 69 | 70 | # e. Plot Demand vs Temperature for all of the available data in elecdaily. What does this say about your model? 71 | elecdaily %>% 72 | as.data.frame() %>% 73 | ggplot(aes(x=Temperature, y=Demand)) + 74 | ylab("Electricity Demand") + 75 | xlab("Temperature") + 76 | geom_point() + 77 | geom_smooth(method="lm", se=FALSE) 78 | # The result plot indicates that the model was made by few data points. It could've explained the data of the first 20 days well, but it wasn't right model for total data points 79 | 80 | ``` 81 | 82 | 83 | 2. Data set mens400 contains the winning times (in seconds) for the men's 400 meters final in each Olympic Games from 1896 to 2016. 84 | 85 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question2} 86 | 87 | # a. Plot the winning time against the year. Describe the main features of the plot. 88 | autoplot(mens400) 89 | # Feature1. Winning times in Olympic men's 400m track final had the trend of decreasing as time went on. 90 | # Feature2. There are missing values. 91 | 92 | # b. Fit a regression line to the data. Obviously the winning times have been decreasing, but at what average rate per year? 93 | # Extract time part from mens400 time series to do linear modeling. 94 | time_mens400 <- time(mens400) 95 | tslm_mens400 <- tslm(mens400 ~ time_mens400, 96 | data = mens400) 97 | 98 | # Show data with regression line 99 | autoplot(mens400) + 100 | geom_abline(slope = tslm_mens400$coefficients[2], 101 | intercept = tslm_mens400$coefficients[1], 102 | colour = "red") 103 | 104 | # Get the winning time decreasing rate 105 | tslm_mens400$coefficients[2] 106 | # The winning times have been decreasing at average rate of 0.06457 second per year. 107 | 108 | # c. Plot the residuals against the year. What does this indicate about the suitability of the fitted line? 109 | cbind(Time = time_mens400, 110 | Residuals = tslm_mens400$residuals) %>% 111 | as.data.frame() %>% 112 | ggplot(aes(x = Time, y = Residuals)) + 113 | geom_point() + 114 | ylab("Residuals of Regression Line(Unit:s)") 115 | # The residual plot shows that the regression model generally fitted the data well. I can check it using checkresiduals function, too. 116 | checkresiduals(tslm_mens400) 117 | 118 | # d. Predict the winning time for the men's 400 meters final in the 2020 Olympics. Give a prediction interval for your forecasts. What assumptions have you made in these calculations? 119 | # I made linear model with na.action argument as na.exclude to exclude missing values. 120 | # And then I used the linear model in forecast function to get prediction interval. 121 | # Forecast function can't calculate prediction interval when there is any missing values in the data that I excluded them fitting linear model. 
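# (Added note, hedged: na.action = na.exclude keeps the fitted values and
# residuals padded with NAs so they stay aligned with the original series,
# whereas na.omit would silently drop those positions; either way the
# coefficients are estimated only from the non-missing observations.)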
122 | lm_mens400 <- lm( 123 | mens400 ~ time_mens400, 124 | data = mens400, 125 | na.action = na.exclude 126 | ) 127 | 128 | fc_mens400 <- forecast( 129 | lm_mens400, 130 | newdata = data.frame(time_mens400 = 2020) 131 | ) 132 | 133 | autoplot(mens400) + 134 | autolayer(fc_mens400, PI = TRUE) 135 | 136 | # Get 80% and 95% prediction intervals 137 | fc_mens400$upper 138 | fc_mens400$lower 139 | 140 | # 80% interval is from 40.45 to 43.63 141 | # 95% interval is from 39.55 to 44.53 142 | # But we need to consider that they were calculated from the assumption that the model's residuals were normally distributed. But we saw from the result of checkresiduals function that it isn't true. 143 | 144 | ``` 145 | 146 | 147 | 3. Type easter(ausbeer) and interpret what you see. 148 | 149 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question3} 150 | 151 | help("ausbeer") 152 | head(ausbeer) 153 | str(ausbeer) 154 | # Quarterly Australian beer production data. There are 218 data points. 155 | 156 | time(ausbeer)[c(1, length(ausbeer))] 157 | # start is 1st quarter of 1956 and the last is the 2nd quarter of 2010. 158 | 159 | easter(ausbeer) 160 | # easter function returns a vector of 0's or 1's or fractional parts in the observed time period. If full Easter holidays are in a time period, it returns 1, and returns 0 if there isn't any. If the holidays are extended from a period to the other, easter function returns fractional portions to each of them. 161 | 162 | ``` 163 | 164 | 165 | 5. The data set fancy concerns the monthly sales figures of a shop which opened in January 1987 and sells gifts, souvenirs, and novelties. The shop is situated on the wharf at a beach resort town in Queensland, Australia. The sales volume varies with the seasonal population of tourists. There is a large influx of visitors to the town at Christmas and for the local surfing festival, held every March since 1988. Over time, the shop has expanded its premises, range of products, and staff. 166 | 167 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question5} 168 | 169 | # a. Produce a time plot of the data and describe the patterns in the graph. Identify any unusual or unexpected fluctuations in the time series. 170 | autoplot(fancy) 171 | head(fancy, 50) 172 | # Sales generally increased from January to December. Sales increased dramatically in Decembers. Sales in Decembers increased as time went on, but in 1991, sales decreased. In most years, there was also unexpected increase every March, but the increases were a lot smaller than the increases in Decembers. 173 | 174 | # b. Explain why it is necessary to take logarithms of these data before fitting a model. 175 | # The size of the seasonal variations should be almost same across the whole series to be fitted well to a model. Fancy data shows that seasonal variations increased exponentially. Therefore it is necessary to take logarithms of the data. 176 | 177 | # c. Use R to fit a regression model to the logarithms of these sales data with a linear trend, seasonal dummies and a "surfing festival" dummy variable. 178 | 179 | # make "surfing_festival" dummy variable using time index of fancy. The value is 1 if the year is equal to or above 1988 and the month is March. 
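# (A hedged alternative sketch, not used below: the same dummy could be built
# without a loop, relying on cycle() returning the month index and trunc(time())
# returning the year of each observation:
# surfing_festival <- as.numeric(cycle(fancy) == 3 & trunc(time(fancy)) >= 1988)
# The loop version that follows is kept as the working solution.)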
180 | Time <- time(fancy) 181 | surfing_festival <- c() 182 | for(i in 1:length(Time)){ 183 | month <- round(12*(Time[i] - floor(Time[i]))) + 1 184 | year <- floor(Time[i]) 185 | if(year >= 1988 & month == 3){ 186 | surfing_festival[i] <- 1 187 | } else { 188 | surfing_festival[i] <- 0 189 | } 190 | } 191 | # If I had made surfing_festival as a list, I should've needed to use unlist function to make it as atomic vector, not nested list. tslm function can get vector or factor type data, but it cannot get nested list. 192 | 193 | tslm_log_fancy <- tslm( 194 | BoxCox(fancy, 0) ~ trend + season + surfing_festival 195 | ) 196 | 197 | # d. Plot the residuals against time and against the fitted values. Do these plots reveal any problems with the model? 198 | autoplot(tslm_log_fancy$residuals) 199 | # The residuals have pattern against time. It means that there is correlation between residuals and time. 200 | 201 | cbind(Residuals = tslm_log_fancy$residuals, 202 | Fitted_values = tslm_log_fancy$fitted.values) %>% 203 | as.data.frame() %>% 204 | ggplot(aes(x = Fitted_values, 205 | y = Residuals)) + 206 | geom_point() 207 | # The size of the residuals changed as we move along the x-axis(fitted values). It means that even after log transformation, there are still heteroscedacity in the errors or that the variance of the residuals aren't still constant. 208 | 209 | # e. Do boxplots of the residuals for each month. Does this reveal any problems with the model? 210 | cbind.data.frame( 211 | Month = factor( 212 | month.abb[round(12*(Time - floor(Time)) + 1)], 213 | labels = month.abb, 214 | ordered = TRUE 215 | ), 216 | Residuals = tslm_log_fancy$residuals 217 | ) %>% 218 | ggplot(aes(x = Month, 219 | y = Residuals)) + 220 | geom_boxplot() 221 | # If vectors are combined by cbind function, the class of the result is matrix, which should hold one type of data. If I want to make the columns to have different data types, I need to use cbind.data.frame function instead. Instead, if I still want to use cbind function, I need to use as.numeric function in mapping of ggplot. 222 | # If the mapping of boxplot is (factor x factor), it would be difficult to see any box because boxplot function can't aggregate factor type data. The result would be looked like a scatter plot. 223 | 224 | # To see the change of the residuals for each month, I used ggsubseriesplot function. 225 | ggsubseriesplot(tslm_log_fancy$residuals) 226 | 227 | # The distribution of the residuals was unsymetrical for some months. And for some months, the median of the residuals wasn't 0(residuals' mean should be 0 for all months because getting the minimum SSE means getting mean). Residuals with such properties can't have normal distribution, which will make it difficult to get prediction interval. 228 | 229 | # f. What do the values of the coefficients tell you about each variable? 230 | tslm_log_fancy$coefficients 231 | # The model has positive trend. It means that as time goes on, the sales amount generally increases. 232 | # And all seasonal variables are positive. It means that the sales amount was minimum on January and the sales of the other months were greater than January for most of years. 233 | # Finally, surfing_festival variable's coefficient is 0.501 and it isn't small compared to the others. It means that there were increased sales in Marchs when surfing festival happened. 234 | 235 | # g. What does the Breusch-Godfrey test tell you about your model? 
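# (Added note: the Breusch-Godfrey test is a joint test for autocorrelation in
# the residuals of a regression model up to a chosen lag order; a small p-value
# means the residuals are serially correlated, so the usual standard errors and
# prediction intervals of the regression should not be fully trusted.)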
236 | checkresiduals(tslm_log_fancy) 237 | # The p value of the test is less than 0.05. It means that the residuals can be distinguished from white noise. The residuals can be correlated with each other. 238 | 239 | # h. Regardless of your answers to the above questions, use your regression model to predict the monthly sales for 1994, 1995, and 1996. Produce prediction intervals for each of your forecasts. 240 | # make surfing festival data for the months of 1994 through 1996. 241 | future_fancy <- rep(0, 36) 242 | for(i in 1:36){ 243 | if(i %% 12 == 3){ 244 | future_fancy[i] <- 1 245 | } 246 | } 247 | # make future data as time series. 248 | future_fancy <- ts(data = future_fancy, 249 | start = 1994, 250 | end = c(1996, 12), 251 | frequency = 12) 252 | 253 | # forecast 254 | fc_tslm_log_fancy <- forecast( 255 | tslm_log_fancy, 256 | newdata = data.frame(Time = time(future_fancy), 257 | surfing_festival = future_fancy) 258 | ) 259 | 260 | # plot the forecast 261 | autoplot(fc_tslm_log_fancy) 262 | 263 | # show prediction interval 264 | fc_tslm_log_fancy$upper 265 | fc_tslm_log_fancy$lower 266 | # The intervals on Decembers were especially large. 267 | 268 | # i. Transform your predictions and intervals to obtain predictions and intervals for the raw data. 269 | 270 | # make fc_tslm_fancy object, which are inverse log transformed version of fc_tslm_log_fancy. 271 | fc_tslm_fancy <- fc_tslm_log_fancy 272 | 273 | # members which should be inverse log transformed. 274 | members_inv.log <- c('x', 'mean', 'lower', 'upper', 'residuals', 'fitted') 275 | 276 | # apply inverse log transform to the members. 277 | fc_tslm_fancy[members_inv.log] <- lapply( 278 | fc_tslm_log_fancy[members_inv.log], 279 | InvBoxCox, 280 | lambda = 0 281 | ) 282 | 283 | # apply inverse log transform to 'BoxCox(fancy, 0)' member in model's model. 284 | fc_tslm_fancy[['model']][['model']][1] <- lapply( 285 | fc_tslm_log_fancy[['model']][['model']][1], 286 | InvBoxCox, 287 | lambda = 0 288 | ) 289 | 290 | autoplot(fc_tslm_fancy) 291 | # Even if I transformed the data, fitted values and forecasts, the name of predicted values is still 'BoxCox(fancy, 0)'. 292 | # I can't change it because it came from the formula in tslm function. Changing it means making new model, not just changing the variable's name. 293 | # I think that it is better to set lambda = 0 in forecast function from the very first to forecast using log transformation. 294 | 295 | fc_tslm_fancy$upper 296 | fc_tslm_fancy$lower 297 | # The range of prediction intervals became a lot bigger after inverse log transformation. 298 | 299 | # j. How could you improve these predictions by modifying the model? 300 | # The predictions when I don't use log transformation. 301 | tslm_fancy <- tslm( 302 | fancy ~ trend + season + surfing_festival 303 | ) 304 | 305 | fc_tslm_fancy2 <- forecast( 306 | tslm_fancy, 307 | newdata = data.frame(Time = time(future_fancy), 308 | surfing_festival = future_fancy) 309 | ) 310 | 311 | autoplot(fc_tslm_fancy2) 312 | # The result shows that the forecasts couldn't reflect the exponential growth trend. 313 | 314 | # I could've improved the predictions by using log transformation. By using the transformation, the predictions could reflect the sales' exponential growth trend better. 315 | 316 | ``` 317 | 318 | 319 | 6. The gasoline series consists of weekly data for supplies of US finished motor gasoline product, from 2 February 1991 to 20 January 2017. The units are in "thousand barrels per day". Consider only the data to the end of 2004. 
320 | 321 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question7} 322 | 323 | # a. Fit a harmonic regression with trend to the data. Experiment with changing the number Fourier terms. Plot the observed gasoline and fitted values and comment on what you see. 324 | str(gasoline) 325 | head(gasoline) 326 | # They are weekly data and it would be useful to make model with harmonic regression. 327 | 328 | # extract gasoline data up to the end of 2004 and plot it 329 | gasoline_until_2004 <- window(gasoline, end = 2005) 330 | autoplot(gasoline_until_2004, xlab = "Year") 331 | 332 | # make tslm model 333 | for(num in c(1, 2, 3, 5, 10, 20)){ 334 | #make variable names for each model using pair number. 335 | var_name <- paste("tslm_ft", 336 | as.character(num), 337 | "_gasoline_until_2004", 338 | sep = "") 339 | 340 | #assign ts linear model to each variable name. 341 | assign(var_name, 342 | tslm(gasoline_until_2004 ~ trend + fourier( 343 | gasoline_until_2004, 344 | K = num 345 | )) 346 | ) 347 | 348 | #plot the data and fitted values 349 | print( 350 | autoplot(gasoline_until_2004) + 351 | autolayer(get(var_name)$fitted.values, 352 | series = as.character(num)) + 353 | ggtitle(var_name) + 354 | ylab("gasoline") + 355 | guides(colour = guide_legend(title = "Number of Fourier Transform pairs")) 356 | ) 357 | } 358 | 359 | autoplot(gasoline_until_2004) + 360 | autolayer(tslm_ft1_gasoline_until_2004$fitted.values, series = "1") + 361 | autolayer(tslm_ft5_gasoline_until_2004$fitted.values, series = "2") + 362 | autolayer(tslm_ft10_gasoline_until_2004$fitted.values, series = "3") + 363 | autolayer(tslm_ft10_gasoline_until_2004$fitted.values, series = "5") + 364 | autolayer(tslm_ft20_gasoline_until_2004$fitted.values, series = "10") + 365 | autolayer(tslm_ft20_gasoline_until_2004$fitted.values, series = "20") + 366 | guides(colour = guide_legend(title = "Fourier Transform pairs")) + 367 | scale_color_discrete(breaks = c(1, 2, 3, 5, 10, 20)) 368 | # as more number of Fourier pairs used, the fitted line looks more like the original data. But the fitted lines didn't follow the trend well. 369 | 370 | # b. Select the appropriate number of Fourier terms to include by minimizing the AICc or CV value. 371 | for(i in c(1, 2, 3, 5, 10, 20)){ 372 | tslm_ft_gasoline_until_2004.name <- paste( 373 | "tslm_ft", as.character(i), "_gasoline_until_2004", 374 | sep = "" 375 | ) 376 | writeLines( 377 | paste( 378 | "\n", tslm_ft_gasoline_until_2004.name, "\n" 379 | ) 380 | ) 381 | print(CV(get(tslm_ft_gasoline_until_2004.name))) 382 | } 383 | 384 | # In the above 6 K values, 10 minimized the AICc and CV value. 385 | 386 | # Get exact number of Fourier pairs to minimize AICc or CV 387 | min_AICc <- Inf 388 | min_K_by_AICc <- 0 389 | min_CV <- Inf 390 | min_K_by_CV <- 0 391 | AICc_K <- 0 392 | CV_K <- 0 393 | 394 | # maximum number of pairs is 26 because the frequency of gasoline data is about 52.18 395 | for(num in 1:26){ 396 | AICc_K <- CV( 397 | tslm( 398 | gasoline_until_2004 ~ trend + fourier(gasoline_until_2004, K = num) 399 | ) 400 | )[["AICc"]] 401 | print(AICc_K) 402 | CV_K <- CV( 403 | tslm( 404 | gasoline_until_2004 ~ trend + fourier(gasoline_until_2004, K = num) 405 | ) 406 | )[["CV"]] 407 | print(CV_K) 408 | 409 | # If the minimum AICc and CV values are found, the loop don't need to run anymore. Therefore print the result number of pairs and break the loop. 410 | # If num = 1, don't run below codes and move to num = 2. 
With just the result of num = 1, I cannot know whether the AICc and CV values are minimum. 411 | if(num != 1){ 412 | if(AICc_K >= min_AICc & CV_K >= min_CV){ 413 | writeLines( 414 | paste("The number of Fourier Transform pairs to minimize AICc", 415 | "\n", 416 | as.character(min_K_by_AICc) 417 | ) 418 | ) 419 | writeLines( 420 | paste("The number of Fourier Transform pairs to minimize CV", 421 | "\n", 422 | as.character(min_K_by_CV) 423 | ) 424 | ) 425 | break 426 | } 427 | } 428 | 429 | # find the minimum AICc and CV and the number of pairs at the state. 430 | if(AICc_K < min_AICc){ 431 | min_AICc <- AICc_K 432 | min_K_by_AICc <- num 433 | } 434 | if(CV_K < min_CV){ 435 | min_CV <- CV_K 436 | min_K_by_CV <- num 437 | } 438 | } 439 | # To get minimum AICc or CV, I need 7 pairs. 440 | 441 | # c. Check the residuals of the final model using the checkresiduals() function. Even though the residuals fail the correlation tests, the results are probably not severe enough to make much difference to the forecasts and prediction intervals. (Note that the correlations are relatively small, even though they are significant.) 442 | tslm_ft7_gasoline_until_2004 <- tslm( 443 | gasoline_until_2004 ~ trend + fourier( 444 | gasoline_until_2004, 445 | K = 7 446 | ) 447 | ) 448 | 449 | checkresiduals(tslm_ft7_gasoline_until_2004) 450 | 451 | # d. To forecast using harmonic regression, you will need to generate the future values of the Fourier terms. This can be done as follows. 452 | 453 | fc_gasoline_2005 <- forecast( 454 | tslm_ft7_gasoline_until_2004, 455 | newdata=data.frame(fourier( 456 | gasoline_until_2004, K = 7, h = 52) 457 | ) 458 | ) 459 | 460 | # where tslm_ft7_gasoline_until_2004 is the fitted model using tslm, K is the number of Fourier terms used in creating fit, and h is the forecast horizon required. Got the next year's forecasts. 461 | 462 | # e. Plot the forecasts along with the actual data for 2005. What do you find? 463 | autoplot(fc_gasoline_2005) + 464 | autolayer(window( 465 | gasoline, 466 | start = 2004, 467 | end = 2006 468 | ) 469 | ) + 470 | scale_x_continuous(limits = c(2004, 2006)) 471 | 472 | # Almost all of actual data were in the 80% prediction interval. But the model couldn't predict the sudden fall in the fall. The drop was a lot bigger than expected. 473 | 474 | ``` 475 | 476 | 477 | 7. Data set huron gives the water level of Lake Huron in feet from 1875-1972. 478 | 479 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question7} 480 | 481 | # a. Plot the data and comment on its features. 482 | autoplot(huron) 483 | str(huron) 484 | head(huron) 485 | # There are lots of fluctuations that show some cyclic behaviour. And up to about the year 1930 there were downward trend, but after that the trend disappeared. 486 | 487 | # b. Fit a linear regression and compare this to a piecewise linear trend model with a knot at 1915. 
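# (Added note on the piecewise model built below: the extra regressor
# pmax(0, t - 1915) equals 0 up to the knot and grows linearly afterwards, so
# the fitted trend is allowed to change slope at 1915 while staying continuous.)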
488 | h <- 8 489 | # simple linear regression 490 | tslm_huron <- tslm(huron ~ trend) 491 | fc_tslm_huron <- forecast(tslm_huron, h=h) 492 | 493 | # linear regression with log transformation 494 | tslm_log_huron <- tslm(huron ~ trend, 495 | lambda = 0) 496 | fc_tslm_log_huron <- forecast(tslm_log_huron, h=h) 497 | 498 | # piecewise linear regression 499 | t <- time(huron) 500 | t.break <- 1915 501 | t_piece <- ts(pmax(0,t-t.break), start=1875) 502 | 503 | tslm_pw_huron <- tslm(huron ~ t + t_piece) 504 | t_new <- t[length(t)]+seq(h) 505 | t_piece_new <- t_piece[length(t_piece)]+seq(h) 506 | 507 | newdata <- cbind(t=t_new, 508 | t_piece=t_piece_new) %>% 509 | as.data.frame() 510 | 511 | fc_tslm_pw_huron <- forecast( 512 | tslm_pw_huron, 513 | newdata = newdata 514 | ) 515 | 516 | # cubic spline regression 517 | tslm_spline_huron <- splinef(huron, lambda = 0) 518 | fc_tslm_spline_huron <- forecast( 519 | tslm_spline_huron, 520 | newdata = newdata 521 | ) 522 | 523 | # plot the results 524 | autoplot(huron) + 525 | autolayer(fitted(tslm_huron), series = "Linear") + 526 | autolayer(fitted(tslm_log_huron), series="Logarithm") + 527 | autolayer(fitted(tslm_pw_huron), series = "Piecewise") + 528 | autolayer(fitted(tslm_spline_huron), series = "Cubic Spline") + 529 | autolayer(fc_tslm_pw_huron, series="Piecewise") + 530 | autolayer(fc_tslm_huron$mean, series = "Linear") + 531 | autolayer(fc_tslm_log_huron$mean, series="Logarithm") + 532 | autolayer(fc_tslm_spline_huron$mean, series="Cubic Spline") + 533 | xlab("Year") + ylab("Water level") + 534 | ggtitle("Lake Huron water level change") + 535 | guides(colour=guide_legend(title=" ")) 536 | # It looked like spline model didn't catch the trend well. Linear model and log transformed linear model couldn't reflect the trend change around the year 1915. 537 | 538 | # c. Generate forecasts from these two models for the period upto 1980 and comment on these. 539 | autoplot(huron) + 540 | autolayer(fitted(tslm_huron), series = "Linear") + 541 | autolayer(fc_tslm_huron, series = "Linear") + 542 | xlab("Year") + ylab("Water level") + 543 | ggtitle("Lake Huron water level change", 544 | subtitle = "using Linear Regression model") + 545 | guides(colour=guide_legend(title=" ")) + 546 | theme(plot.title = element_text(hjust = 0.5), 547 | plot.subtitle = element_text(hjust = 0.5)) 548 | 549 | autoplot(huron) + 550 | autolayer(fitted(tslm_pw_huron), series = "Piecewise") + 551 | autolayer(fc_tslm_pw_huron, series="Piecewise") + 552 | xlab("Year") + ylab("Water level") + 553 | ggtitle("Lake Huron water level change", 554 | subtitle = "using piecewise linear model") + 555 | guides(colour=guide_legend(title=" ")) + 556 | theme(plot.title = element_text(hjust = 0.5), 557 | plot.subtitle = element_text(hjust = 0.5)) 558 | 559 | # Linear regression model shows that the point forecasts and the upper and the lower bounds of prediction intervals decrease as time goes on. It didn't reflect the trend change around the year 1915. 560 | # Piecewise linear regression model shows that the point forecasts and the prediction intervals are almost same over time. It reflected the trend change. 561 | 562 | ``` 563 | 564 | 565 | ### Question 4, 8 are related with math, not related with coding that I didn't include them in here. 
-------------------------------------------------------------------------------- /Chapter6.rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JehyeonHeo/Forecasting_with_R_practices/0f884ec022e2d94738d74c9e59b7f29c3c613fab/Chapter6.rmd -------------------------------------------------------------------------------- /Chapter7.rmd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JehyeonHeo/Forecasting_with_R_practices/0f884ec022e2d94738d74c9e59b7f29c3c613fab/Chapter7.rmd -------------------------------------------------------------------------------- /Chapter8.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 8 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | library(xlsx) 7 | library(rdatamarket) 8 | library(tseries) 9 | 10 | ``` 11 | 12 | 2. A classic example of a non-stationary series is the daily closing IBM stock price series (data set ibmclose). Use R to plot the daily closing prices for IBM stock and the ACF and PACF. Explain how each plot shows that the series is non-stationary and should be differenced. 13 | 14 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question2} 15 | 16 | ggtsdisplay(ibmclose) 17 | # ACF plot shows that the autocorrelation values are bigger than critical value and decrease slowly. Also, r1 is large(near to 1) and positive. It means that the IBM stock data are non-stationary(that is, predictable using lagged values). 18 | # PACF plot shows that there is a strong correlation between IBM stock data and their 1 lagged values. It means that IBM stock data can be predicted by 1 lagged values and they aren't stationary. 19 | 20 | # To get stationary data, IBM stock data need differencing. Differencing can help stabilize the mean of a time series by removing changes in the level of a time series. Therefore it will eliminate or reduce trend and seasonality. And the effect can make non-staionary data stationary. 21 | 22 | ``` 23 | 24 | 25 | 3. For the following series, find an appropriate Box-Cox transformation and order of differencing in order to obtain stationary data. 26 | 27 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question3} 28 | 29 | # a. usnetelec 30 | autoplot(usnetelec) 31 | # It is almost linearly increasing data. It looked like that the data only need first differencing. 32 | Box.test(diff(usnetelec), type = "Ljung-Box") 33 | # first differenced usnetelec data can be thought of as a white noise series. 34 | kpss.test(diff(usnetelec)) 35 | # kpss test result also shows that first differencing made the data stationary. 36 | 37 | # b. usgdp 38 | autoplot(usgdp) 39 | # It is almost linearly increasing data. It looked like that the data only need first differencing. 40 | Box.test(diff(usgdp), type = "Ljung-Box") 41 | # first differenced usnetelec data cannot be thought of as a white noise series. 42 | autoplot(diff(usgdp)) 43 | # There is still a trend left in the differenced data. It looked like one more differencing would be enough, but use ndiffs function to check the number of differencing needed. 44 | ndiffs(usgdp) 45 | # One more differencing would be enough. 46 | autoplot(diff(diff(usgdp))) 47 | # Plot shows that the twice differenced data is like white noise series. 48 | Box.test(diff(diff(usgdp)), type = "Ljung-Box") 49 | # But it couldn't pass Ljung-Box test. 
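# (Added note: stationarity and white noise are not the same thing. A stationary
# series may still be autocorrelated, so it is consistent for the KPSS test to
# accept stationarity while the Ljung-Box test rejects white noise, as happens
# repeatedly in this exercise.)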
50 | ggAcf(diff(diff(usgdp))) 51 | # There are still some autocorrelations left. 52 | kpss.test(diff(diff(usnetelec))) 53 | # But kpss test result shows that differencing twice was enough to make the data stationary. Therefore in usgdp data case, even if twice differencing didn't make the data like white noise series, it made the data stationary. 54 | 55 | # c. mcopper 56 | autoplot(mcopper) 57 | # mcopper data have increasing trend. And they have bigger variation for bigger prices. Therefore I'll use Box-Cox transformation before differencing. 58 | lambda_mcopper <- BoxCox.lambda(mcopper) 59 | autoplot(diff(BoxCox(mcopper, lambda_mcopper))) 60 | Box.test(diff(BoxCox(mcopper, lambda_mcopper)), 61 | type = "Ljung-Box") 62 | # Plot result looked like BoxCox transformation and first differencing made the data like white noise series. But Ljung-Box test shows that it didn't. 63 | ggAcf(diff(BoxCox(mcopper, lambda_mcopper))) 64 | # There are still some autocorrelations left. 65 | kpss.test(diff(BoxCox(mcopper, lambda_mcopper))) 66 | # But kpss test result shows that differencing with Box-Cox transformation was enough to make the data stationary. 67 | # Even if differencing with Box-Cox transformation didn't make the data like white noise series, it made the data stationary. 68 | 69 | # d. enplanements 70 | autoplot(enplanements) 71 | # enplanements data have seasonality and increasing trend even if the number of enplanements fell in 2001. Therefore, I think that the data need seasonal differencing, too. 72 | # The variations are bigger for bigger numbers. Therefore I'll use Box-Cox transformation before differencing. 73 | lambda_enplanements <- BoxCox.lambda(enplanements) 74 | ndiffs(enplanements) 75 | nsdiffs(enplanements) 76 | # the data need 1 first differencing and 1 seasonal differencing. 77 | autoplot( 78 | diff( 79 | diff( 80 | BoxCox(enplanements, lambda_enplanements), 81 | lag = 12 82 | ) 83 | ) 84 | ) 85 | Box.test( 86 | diff( 87 | diff( 88 | BoxCox(enplanements, lambda_enplanements), 89 | lag = 12 90 | ) 91 | ), 92 | type = "Ljung-Box" 93 | ) 94 | # Plot result looked like BoxCox transformation and multiple differencings made the data like white noise series. But Ljung-Box test shows that it didn't. 95 | ggAcf( 96 | diff( 97 | diff( 98 | BoxCox(enplanements, lambda_enplanements), 99 | lag = 12 100 | ) 101 | ) 102 | ) 103 | # There are still some autocorrelations left. 104 | kpss.test( 105 | diff( 106 | diff( 107 | BoxCox(enplanements, lambda_enplanements), 108 | lag = 12 109 | ) 110 | ) 111 | ) 112 | # But kpss test result shows that differencings with Box-Cox transformation was enough to make the data stationary. In enplanements data case, even if differencings with Box-Cox transformation didn't make the data like white noise series, it made the data stationary. 113 | 114 | # e. visitors 115 | autoplot(visitors) 116 | # visitors data are similar to enplanements data. They have seasonality and increasing trend. It looked like they also need Box-Cox transformation, first and seasonal differencing. 117 | lambda_visitors <- BoxCox.lambda(visitors) 118 | ndiffs(visitors) 119 | nsdiffs(visitors) 120 | # visitors data need 1 first and 1 seasonal differencing. 
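# (Added note: in backshift notation the transformation applied below is
# (1 - B)(1 - B^12)wt, where wt is the Box-Cox transformed visitors series,
# i.e. one seasonal difference at lag 12 followed by one first difference.)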
121 | autoplot( 122 | diff( 123 | diff( 124 | BoxCox(visitors, lambda_visitors), 125 | lag = 12 126 | ) 127 | ) 128 | ) 129 | Box.test( 130 | diff( 131 | diff( 132 | BoxCox(visitors, lambda_visitors), 133 | lag = 12 134 | ) 135 | ), 136 | type = "Ljung-Box" 137 | ) 138 | # Plot result looked like BoxCox transformation and multiple differencings made the data like white noise series. But Ljung-Box test shows that it didn't. 139 | ggAcf( 140 | diff( 141 | diff( 142 | BoxCox(visitors, lambda_visitors), 143 | lag = 12 144 | ) 145 | ) 146 | ) 147 | # There are still some autocorrelations left. 148 | kpss.test( 149 | diff( 150 | diff( 151 | BoxCox(visitors, lambda_visitors), 152 | lag = 12 153 | ) 154 | ) 155 | ) 156 | # But kpss test result shows that differencings with Box-Cox transformation was enough to make the data stationary. In visitors data case, even if differencings with Box-Cox transformation didn't make the data like white noise series, it made the data stationary. 157 | 158 | ``` 159 | 160 | 161 | 4. For the enplanements data, write down the differences you chose above using backshift operator notation. 162 | 163 | ### the data needed 1 first difference, 1 seasonal difference after Box-Cox transformation. The model of the data can be written as ARIMA(0, 1, 0)(0, 1, 0)12 with Box-Cox transformation(lambda = -0.227). 164 | ### The model expression using backshift operator notation B: 165 | ### first equation : wt = (yt^(-0.227) - 1)/(-0.227) 166 | ### second equation : (1 - B)(1 - B^12)wt = et, where et is a white noise series. 167 | 168 | 169 | 5. For your retail data (from Exercise 3 in Section 2.10), find the appropriate order of differencing (after transformation if necessary) to obtain stationary data. 170 | 171 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question5} 172 | 173 | retail <- read.xlsx("retail.xlsx", 174 | startRow = 2, 175 | sheetIndex = 1) 176 | 177 | retail.ts <- ts(retail[,"A3349873A"], 178 | frequency=12, 179 | start=c(1982,4)) 180 | 181 | autoplot(retail.ts) 182 | # the data have increasing trend and strong seasonality. And there are bigger variations for bigger numbers. Therefore I think that I need to use first differencing and seasonal differencing. And it would be better to do Box-Cox transformation. 183 | ndiffs(retail.ts) 184 | nsdiffs(retail.ts) 185 | # I'm going to do 1 first differencing and 1 seasonal differencing. 186 | kpss.test( 187 | diff( 188 | diff( 189 | BoxCox(retail.ts, BoxCox.lambda(retail.ts)), 190 | lag = 12 191 | ) 192 | ) 193 | ) 194 | # To make retail.ts data stationary, I did Box-Cox transformation, 1 first differencing and 1 seasonal differencing. 195 | 196 | ``` 197 | 198 | 6. Use R to simulate and plot some data from simple ARIMA models. 199 | 200 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question6} 201 | 202 | # a. Use the following R code to generate data from an AR(1) model with phi1 = 0.6 and sigma^2 = 1. The process starts with y1 = 0. 203 | 204 | y <- ts(numeric(100)) 205 | e <- rnorm(100) 206 | for(i in 2:100){ 207 | y[i] <- 0.6*y[i-1] + e[i] 208 | } 209 | 210 | # b. Produce a time plot for the series. How does the plot change as you change phi1? 211 | ar1generator <- function(phi1){ 212 | # generate 100 data points from an AR(1) model with input phi1. 213 | y <- ts(numeric(100)) 214 | # error 'e's have variation sigma^2 as 1. 215 | e <- rnorm(100) 216 | for(i in 2:100){ 217 | y[i] <- phi1*y[i-1] + e[i] 218 | } 219 | return(y) 220 | } 221 | 222 | # produce plots changing phi1 value. 
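# (Hedged suggestion, not part of the original solution: calling set.seed()
# before each simulation, e.g. set.seed(1), would make these comparison plots
# reproducible; the exact seed value is arbitrary.)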
223 | autoplot(ar1generator(0.3), series = "0.3") + 224 | geom_line(size = 1, colour = "red") + 225 | autolayer(y, series = "0.6", size = 1) + 226 | autolayer(ar1generator(0.9), size = 1, series = "0.9") + 227 | ylab("AR(1) models") + 228 | guides(colour = guide_legend(title = "Phi1")) 229 | # As phi increases, the variation of y increased. 230 | 231 | # c. Write your own code to generate data from an MA(1) model with theta1 = 0.6 and sigma^2 = 1. 232 | ma1generator <- function(theta1){ 233 | # generate 100 data points from an MA(1) model with input theta1. 234 | y <- ts(numeric(100)) 235 | # error 'e's have variation sigma^2 as 1. 236 | e <- rnorm(100) 237 | for(i in 2:100){ 238 | y[i] <- theta1*e[i-1] + e[i] 239 | } 240 | return(y) 241 | } 242 | 243 | # d. Produce a time plot for the series. How does the plot change as you change theta1? 244 | # produce plots changing theta1 value. 245 | autoplot(ma1generator(0.3), series = "0.3") + 246 | geom_line(size = 1, colour = "red") + 247 | autolayer(y, series = "0.6", size = 1) + 248 | autolayer(ar1generator(0.9), size = 1, series = "0.9") + 249 | ylab("MA(1) models") + 250 | guides(colour = guide_legend(title = "Theta1")) 251 | # As theta increases, the variation of y increased. 252 | 253 | # e. Generate data from an ARMA(1,1) model with phi1 = 0.6, theta1 = 0.6 and sigma^2 = 1. 254 | y_arima.1.0.1 <- ts(numeric(50)) 255 | e <- rnorm(50) 256 | for(i in 2:50){ 257 | y_arima.1.0.1[i] <- 0.6*y_arima.1.0.1[i-1] + 0.6*e[i-1] + e[i] 258 | } 259 | 260 | # f. Generate data from an AR(2) model with phi1 = -0.8, phi2 = 0.3 and sigma^2 = 1. (Note that these parameters will give a non-stationary series.) 261 | y_arima.2.0.0 <- ts(numeric(50)) 262 | e <- rnorm(50) 263 | for(i in 3:50){ 264 | y_arima.2.0.0[i] <- -0.8*y_arima.2.0.0[i-1] + 0.3*y_arima.2.0.0[i-2] + e[i] 265 | } 266 | 267 | # g. Graph the latter two series and compare them. 268 | autoplot(y_arima.1.0.1, series = "ARMA(1, 1)") + 269 | autolayer(y_arima.2.0.0, series = "AR(2)") + 270 | ylab("y") + 271 | guides(colour = guide_legend(title = "Models")) 272 | 273 | autoplot(y_arima.1.0.1) 274 | # data from an AR(2) model increased with oscillation. They are non-staionary data. But data from an ARMA(1, 1) model were stationary. 275 | 276 | ``` 277 | 278 | 279 | 7. Consider the number of women murdered each year (per 100,000 standard population) in the United States. (Data set wmurders). 280 | 281 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question7} 282 | 283 | # a. By studying appropriate graphs of the series in R, find an appropriate ARIMA(p,d,q) model for these data. 284 | autoplot(wmurders) 285 | # It looked like the data don't need seasonal differencing or Box-Cox transformation. 286 | autoplot(diff(wmurders)) 287 | # It looked like 1 more differencing would be needed to make the data stationary. Differenced data slowly go to minus infinity. 288 | ndiffs(wmurders) 289 | # ndiffs function shows that the data need 2 differencing. 290 | autoplot(diff(wmurders, differences = 2)) 291 | kpss.test(diff(wmurders, differences = 2)) 292 | # twice differencing made the data stationary. 293 | diff(wmurders, differences = 2) %>% ggtsdisplay() 294 | # PACF is decaying. And there are significant spikes at lag 1, and 2 in the ACF, but none beyond lag 2. If the data can be modelled by ARIMA(0, 2, q) or ARIMA(p, 2, 0), I'm going to model the data by ARIMA(0, 2, 2). 295 | 296 | # b. Should you include a constant in the model? Explain. 297 | # ARIMA model of the data includes twice differencing. 
If there is a constant in the model, twice integrated contant will yield quadratic trend, which is dangerous for forecasting. Therefore I won't include a constant in the model. 298 | 299 | # c. Write this model in terms of the backshift operator. 300 | # (1 - B)^2*yt = (1 + theta1*B + theta2*B^2)*et 301 | 302 | # d. Fit the model using R and examine the residuals. Is the model satisfactory? 303 | wmurders_arima.0.2.2 <- Arima(wmurders, 304 | order = c(0, 2, 2)) 305 | 306 | checkresiduals(wmurders_arima.0.2.2) 307 | # The residuals of the model can be thought of as white noise series. A little sorry that they aren't normally distributed. But it is satisfactory to get them. 308 | 309 | # e. Forecast three times ahead. Check your forecasts by hand to make sure that you know how they have been calculated. 310 | fc_wmurders_arima.0.2.2 <- forecast( 311 | wmurders_arima.0.2.2, h = 3 312 | ) 313 | 314 | # forecasts by Arima function 315 | fc_wmurders_arima.0.2.2$mean 316 | 317 | # get forecasts by manual calculation 318 | fc_wmurders_arima.0.2.2$model 319 | # formula 320 | # (1 - B)^2*yt = (1 - 1.0181*B + 0.1470*B^2)*et 321 | # yt = 2yt-1 - yt-2 + et - 1.0181*et-1 + 0.1470*et-2 322 | years <- length(wmurders) 323 | e <- fc_wmurders_arima.0.2.2$residuals 324 | 325 | fc1 <- 2*wmurders[years] - wmurders[years - 1] - 1.0181*e[years] + 0.1470*e[years - 1] 326 | 327 | fc2 <- 2*fc1 - wmurders[years] + 0.1470*e[years] 328 | 329 | fc3 <- 2*fc2 - fc1 330 | 331 | # forecasts by manual calculation 332 | c(fc1, fc2, fc3) 333 | # the forecasts are almost similar to the ones got by Arima function. 334 | 335 | # f. Create a plot of the series with forecasts and prediction intervals for the next three periods shown. 336 | autoplot(fc_wmurders_arima.0.2.2) 337 | 338 | # g. Does auto.arima give the same model you have chosen? If not, which model do you think is better? 339 | fc_wmurders_autoarima <- forecast( 340 | auto.arima(wmurders), h = 3 341 | ) 342 | 343 | # Without RMSE, all errors show that ARIMA(0, 2, 2) is better than ARIMA(1, 2, 1). 344 | accuracy(fc_wmurders_arima.0.2.2) 345 | accuracy(fc_wmurders_autoarima) 346 | 347 | # try using auto.arima function with stepwise and approximation options false. 348 | fc_wmurders_autoarima2 <- forecast( 349 | auto.arima(wmurders, stepwise = FALSE, approximation = FALSE), 350 | h = 3 351 | ) 352 | # It is ARIMA(0, 2, 3) model. 353 | 354 | accuracy(fc_wmurders_autoarima2) 355 | # In this case, some errors were better while others were worse. I'll check residuals and ACF, PACF plots. 356 | ggtsdisplay(diff(wmurders, differences = 2)) 357 | # It looked like that the data are similar to ARIMA(0, 2, 2) rather than ARIMA(0, 2, 3). 358 | 359 | checkresiduals(fc_wmurders_arima.0.2.2) 360 | checkresiduals(fc_wmurders_autoarima2) 361 | # almost similar residuals. 362 | 363 | # Therefore I'll choose ARIMA(0, 2, 2). 364 | 365 | ``` 366 | 367 | 368 | 8. Consider the total international visitors to Australia (in millions) for the period 1980-2015. (Data set austa.) 369 | 370 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question8} 371 | 372 | # a. Use auto.arima to find an appropriate ARIMA model. What model was selected. Check that the residuals look like white noise. Plot forecasts for the next 10 periods. 373 | autoplot(austa) 374 | 375 | fc_austa_autoarima <- forecast( 376 | auto.arima(austa), h = 10 377 | ) 378 | 379 | fc_austa_autoarima$model 380 | # ARIMA(0, 1, 1) with drift model was chosen. 
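# (Added note: ARIMA(0, 1, 1) with drift can be written as
# (1 - B)yt = c + (1 + theta1*B)et, i.e. a random walk with drift whose errors
# follow an MA(1) process; the drift term c is what produces the upward trend
# in the forecasts.)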
381 | 382 | checkresiduals(fc_austa_autoarima) 383 | # The residuals are like white noise. 384 | 385 | autoplot(fc_austa_autoarima) 386 | 387 | # b. Plot forecasts from an ARIMA(0,1,1) model with no drift and compare these to part (a). Remove the MA term and plot again. 388 | fc_austa_arima.0.1.1 <- forecast( 389 | Arima(austa, order = c(0, 1, 1)), h = 10 390 | ) 391 | 392 | autoplot(fc_austa_arima.0.1.1) 393 | 394 | fc_austa_arima.0.1.0 <- forecast( 395 | Arima(austa, order = c(0, 1, 0)), h = 10 396 | ) 397 | 398 | autoplot(fc_austa_arima.0.1.0) 399 | # the forecasts of both models are like the result of naive forecast. Increasing trend isn't reflected in the forecasts. 400 | 401 | fc_austa_arima.0.1.1$upper - fc_austa_arima.0.1.0$upper 402 | fc_austa_arima.0.1.0$lower - fc_austa_arima.0.1.1$lower 403 | # But prediction interval of ARIMA(0, 1, 1) model was generally larger than the one of ARIMA(0, 1, 0) model. I think that it is because of one more error term in ARIMA(0, 1, 1) model. 404 | 405 | # c. Plot forecasts from an ARIMA(2,1,3) model with drift. Remove the constant and see what happens. 406 | fc_austa_arima.2.1.3.drift <- forecast( 407 | Arima(austa, order = c(2, 1, 3), include.drift = TRUE), 408 | h = 10 409 | ) 410 | 411 | autoplot(fc_austa_arima.2.1.3.drift) 412 | # The forecasts are increasing, but the speed of the increase is decreasing. 413 | 414 | drift_austa <- fc_austa_arima.2.1.3.drift$model$coef[6] 415 | fc_austa_arima.2.1.3.nodrift <- fc_austa_arima.2.1.3.drift$mean - drift_austa*seq_len(10) 416 | 417 | autoplot(fc_austa_arima.2.1.3.drift) + 418 | autolayer(fc_austa_arima.2.1.3.nodrift) 419 | # Without drift constant, the forecasts are unlikely. 420 | 421 | # d. Plot forecasts from an ARIMA(0,0,1) model with a constant. Remove the MA term and plot again. 422 | fc_austa_arima.0.0.1.const <- forecast( 423 | Arima( 424 | austa, order = c(0, 0, 1), include.constant = TRUE 425 | ), 426 | h = 10 427 | ) 428 | 429 | autoplot(fc_austa_arima.0.0.1.const) 430 | # the forecasts are fastly decreased to the mean of the data history. 431 | 432 | fc_austa_arima.0.0.0.const <- forecast( 433 | Arima(austa, order = c(0, 0, 0), include.constant = TRUE), 434 | h = 10 435 | ) 436 | 437 | autoplot(fc_austa_arima.0.0.0.const) 438 | # All of the forecasts are the mean of the data history. It is like the result of mean method. 439 | 440 | # e. Plot forecasts from an ARIMA(0,2,1) model with no constant. 441 | fc_austa_arima.0.2.1 <- forecast( 442 | Arima(austa, order = c(0, 2, 1)), 443 | h = 10 444 | ) 445 | 446 | autoplot(fc_austa_arima.0.2.1) 447 | # the forecasts show increasing trend. PI is being larger for the farther future forecast. 448 | 449 | ``` 450 | 451 | 452 | 9. For the usgdp series: 453 | 454 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question9} 455 | 456 | # a. if necessary, find a suitable Box-Cox transformation for the data; 457 | autoplot(usgdp) 458 | autoplot(BoxCox(usgdp, BoxCox.lambda(usgdp))) 459 | # When I transformed the original data, I could get more linearly increasing line. Therefore I'm going to do Box-Cox transformation. 460 | lambda_usgdp <- BoxCox.lambda(usgdp) 461 | 462 | # b.fit a suitable ARIMA model to the transformed data using auto.arima(); 463 | usgdp_autoarima <- auto.arima(usgdp, 464 | lambda = lambda_usgdp) 465 | 466 | autoplot(usgdp, series = "Data") + 467 | autolayer(usgdp_autoarima$fitted, series = "Fitted") 468 | # It looked like the model fits well to the data. 
469 | 470 | usgdp_autoarima 471 | #ARIMA(2, 1, 0) with drift model after Box-Cox transformation. 472 | 473 | # c. try some other plausible models by experimenting with the orders chosen; 474 | ndiffs(BoxCox(usgdp, lambda_usgdp)) 475 | # the data need 1 first differencing to be stationary. 476 | ggtsdisplay(diff(BoxCox(usgdp, lambda_usgdp))) 477 | # ACF plot shows sinusoidal decrease while PACF plot shows significant spikes at lag 1 and 12. I think that I can ignore the spike at lag 12 because the data are aggregated quarterly, not monthly. Therefore, I'll experiment with ARIMA(1, 1, 0) model. 478 | usgdp_arima.1.1.0 <- Arima( 479 | usgdp, lambda = lambda_usgdp, order = c(1, 1, 0) 480 | ) 481 | 482 | usgdp_arima.1.1.0 483 | 484 | autoplot(usgdp, series = "Data") + 485 | autolayer(usgdp_arima.1.1.0$fitted, series = "Fitted") 486 | 487 | # I'll also try ARIMA(1, 1, 0) with drift model. 488 | usgdp_arima.1.1.0.drift <- Arima( 489 | usgdp, lambda = lambda_usgdp, order = c(1, 1, 0), 490 | include.drift = TRUE 491 | ) 492 | 493 | usgdp_arima.1.1.0.drift 494 | 495 | autoplot(usgdp, series = "Data") + 496 | autolayer(usgdp_arima.1.1.0.drift$fitted, series = "Fitted") 497 | # It looked like that these models also fit well to the data. 498 | 499 | # d. choose what you think is the best model and check the residual diagnostics; 500 | accuracy(usgdp_autoarima) 501 | accuracy(usgdp_arima.1.1.0) 502 | accuracy(usgdp_arima.1.1.0.drift) 503 | # Some errors show that ARIMA(2, 1, 0) with drift is the best model while others show that ARIMA(1, 1, 0) with drift is the best. Check the residuals of both cases. 504 | checkresiduals(usgdp_autoarima) 505 | checkresiduals(usgdp_arima.1.1.0.drift) 506 | # In either case, the residuals are like white noise series and are not normally distributed. 507 | # I'll choose the best model as ARIMA(2, 1, 0) with drift model. With the model, RMSE and MASE values were lower. And there wasn't significant spike at lag 2 in ACF plot of ARIMA(2, 1, 0) with drift model, even if it exists in ARIMA(1, 1, 0) with drift model. 508 | 509 | # e. produce forecasts of your fitted model. Do the forecasts look reasonable? 510 | fc_usgdp_autoarima <- forecast( 511 | usgdp_autoarima 512 | ) 513 | 514 | autoplot(fc_usgdp_autoarima) 515 | # It looked like the forecasts are reasonable. 516 | 517 | # f. compare the results with what you would obtain using ets() (with no transformation). 518 | fc_usgdp_ets <- forecast( 519 | ets(usgdp) 520 | ) 521 | 522 | autoplot(fc_usgdp_ets) 523 | # It looked like these forecasts are more likely than the ones with ARIMA model. When trend is obvious, is ETS better than ARIMA model? I wonder about it. 524 | 525 | ``` 526 | 527 | 528 | 10. Consider austourists, the quarterly number of international tourists to Australia for the period 1999-2010. (Data set austourists.) 529 | 530 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question10} 531 | 532 | # a. Describe the time plot. 533 | autoplot(austourists) 534 | # the data have strong seasonality and increasing trend. Also the size of variations increased as the number increased. 535 | 536 | # b. What can you learn from the ACF graph? 537 | ggAcf(austourists) 538 | # autocorrelations are slowly decreasing. And the values at the lags of multiple of 4 were big compared to the others. 539 | 540 | # c. What can you learn from the PACF graph? 
541 | ggPacf(austourists)
542 | # There are 5 significant spikes, and then no significant spikes thereafter (apart from one at lag 8, which is probably related to the quarterly seasonality).
543 | 
544 | # d. Produce plots of the seasonally differenced data (1 - B^4)Yt. What model do these graphs suggest?
545 | ggtsdisplay(diff(austourists, lag = 4))
546 | # The seasonally differenced data look like they need at least one more difference to become stationary.
547 | # At the seasonal lags (multiples of 4), both the ACF and the PACF show a single significant spike, at lag 4. The seasonal part of the model could therefore be (1, 1, 0)[4] or (0, 1, 1)[4]; I'll choose (1, 1, 0)[4].
548 | # Disregarding the seasonal lags, the autocorrelations appear to decrease sinusoidally, while the partial autocorrelations have spikes at lags 1 and 5 and no significant spikes at lags 2 and 3.
549 | # I don't think the spike at lag 5 means there is important unused information at lag 5. It is more likely a by-product of the big negative spike at lag 4 created by the seasonal differencing.
550 | # I can check this by looking at the PACF plot after taking one more difference.
551 | ggtsdisplay(diff(diff(austourists, lag = 4)))
552 | # Therefore I suggest an ARIMA(1, 1, 0)(1, 1, 0)[4] model.
553 | 
554 | # e. Does auto.arima give the same model that you chose? If not, which model do you think is better?
555 | fc_austourists_autoarima <- forecast(
556 |   auto.arima(austourists)
557 | )
558 | 
559 | fc_austourists_autoarima$model
560 | # auto.arima gave an ARIMA(1, 0, 0)(1, 1, 0)[4] with drift model.
561 | 
562 | fc_austourists_arima.1.1.0.1.1.0.4 <- forecast(
563 |   Arima(austourists, 
564 |         order = c(1, 1, 0),
565 |         seasonal = c(1, 1, 0))
566 | )
567 | 
568 | autoplot(fc_austourists_autoarima)
569 | autoplot(fc_austourists_arima.1.1.0.1.1.0.4)
570 | # ARIMA(1, 1, 0)(1, 1, 0)[4] shows a faster-increasing trend in the forecasts.
571 | 
572 | accuracy(fc_austourists_autoarima)
573 | accuracy(fc_austourists_arima.1.1.0.1.1.0.4)
574 | # The ARIMA(1, 0, 0)(1, 1, 0)[4] with drift model fitted better, so I think it is the better of the two models.
575 | 
576 | checkresiduals(fc_austourists_autoarima)
577 | # The residuals are like a white noise series.
578 | 
579 | # f. Write the model in terms of the backshift operator, then without using the backshift operator.
580 | fc_austourists_autoarima$model
581 | # (1 - phi1*B)(1 - phis1*B^4)(1 - B^4)*yt = c + et
582 | # where c = drift*(1 - phi1)*(1 - phis1)*m^D = 1.7793
583 | 
584 | # Expanding: (1 - phi1*B - phis1*B^4 + phi1*phis1*B^5)(1 - B^4)*yt = c + et
585 | # (1 - phi1*B - (1 + phis1)*B^4 + phi1*(1 + phis1)*B^5 + phis1*B^8 - phi1*phis1*B^9)*yt = c + et
586 | # yt = c + phi1*yt-1 + (1 + phis1)*yt-4 - phi1*(1 + phis1)*yt-5 - phis1*yt-8 + phi1*phis1*yt-9 + et
587 | # With phi1 = 0.4705 and phis1 = -0.5305: yt = 1.7793 + 0.4705*yt-1 + 0.4695*yt-4 - 0.2209*yt-5 + 0.5305*yt-8 - 0.2496*yt-9 + et
588 | 
589 | ```
590 | 
591 | 
592 | 11. Consider the total net generation of electricity (in billion kilowatt hours) by the U.S. electric industry (monthly for the period January 1973 - June 2013). (Data set usmelec.) In general there are two peaks per year: in mid-summer and mid-winter.
593 | 
594 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question11}
595 | 
596 | # a. Examine the 12-month moving average of this series to see what kind of trend is involved.
597 | usmelec_ma2x12 <- ma(usmelec, order = 12, centre = TRUE) 598 | 599 | autoplot(usmelec, series = "Data") + 600 | autolayer(usmelec_ma2x12, series = "2X12-MA") + 601 | ylab(expression(paste("Electricity(x", 10^{9}, "KWh)"))) + 602 | ggtitle("Monthly total net generation of electricity") + 603 | scale_color_discrete(breaks = c("Data", "2X12-MA")) 604 | # Total net generation amount increased first but stoped increasing from about 2008. 605 | 606 | # b. Do the data need transforming? If so, find a suitable transformation. 607 | # The data show bigger variation for bigger amount. Therefore I think that Box-Cox transformation would be suitable for the data. 608 | lambda_usmelec <- BoxCox.lambda(usmelec) 609 | 610 | # c. Are the data stationary? If not, find an appropriate differencing which yields stationary data. 611 | # The data are non-stationary. 612 | ndiffs(usmelec) 613 | nsdiffs(usmelec) 614 | # I need to do 1 seasonal differencing to make the data stationary. If seasonal differencing isn't enough to make them stationary, I need to do first differencing, too. 615 | 616 | # d. Identify a couple of ARIMA models that might be useful in describing the time series. Which of your models is the best according to their AIC values? 617 | ggtsdisplay(diff( 618 | BoxCox(usmelec, lambda_usmelec), 619 | lag = 12 620 | )) 621 | # Definitely, I need to use first differencing, too. 622 | ggtsdisplay( 623 | diff( 624 | diff( 625 | BoxCox(usmelec, lambda_usmelec), 626 | lag = 12 627 | ) 628 | ) 629 | ) 630 | # I think that ARIMA(0, 1, 2)(0, 1, 1)[12] with Box-Cox transformation model might describe the data well. I'll try ARIMA(0, 1, 3)(0, 1, 1)[12] with Box-Cox transformation model, too. 631 | usmelec_arima.0.1.2.0.1.1.12 <- Arima( 632 | usmelec, 633 | lambda = lambda_usmelec, 634 | order = c(0, 1, 2), 635 | seasonal = c(0, 1, 1) 636 | ) 637 | 638 | usmelec_arima.0.1.3.0.1.1.12 <- Arima( 639 | usmelec, 640 | lambda = lambda_usmelec, 641 | order = c(0, 1, 3), 642 | seasonal = c(0, 1, 1) 643 | ) 644 | 645 | usmelec_arima.0.1.2.0.1.1.12$aic 646 | usmelec_arima.0.1.3.0.1.1.12$aic 647 | # ARIMA(0, 1, 2)(0, 1, 1)[12] with Box-Cox transformation model was the best. 648 | 649 | # e. Estimate the parameters of your best model and do diagnostic testing on the residuals. Do the residuals resemble white noise? If not, try to find another ARIMA model which fits better. 650 | usmelec_arima.0.1.2.0.1.1.12 651 | #theta1 = -0.4317, theta2 = -0.2552, phis1 = -0.8536 652 | 653 | checkresiduals(usmelec_arima.0.1.2.0.1.1.12) 654 | # Ljung-Box test result shows that the residuals can be thought of as white noise. And they are normally distributed. 655 | 656 | # I want to know what model was selected if I used auto.arima function. I'll try it. 657 | usmelec_autoarima <- auto.arima( 658 | usmelec, 659 | lambda = lambda_usmelec 660 | ) 661 | 662 | usmelec_autoarima 663 | # The result is ARIMA(2, 1, 4)(0, 0, 2)[12] with drift after Box-Cox transformation model. AIC is -4722. But I can't compare the AIC value with what I got above, because the number of differencing was different(Differencing changes the way the likelihood is computed). 664 | 665 | checkresiduals(usmelec_autoarima) 666 | # And the residuals aren't like white noise. Therefore I'll choose ARIMA(0, 1, 2)(0, 1, 1)[12] with Box-Cox transformation model. 667 | 668 | # f. Forecast the next 15 years of electricity generation by the U.S. electric industry. Get the latest figures from https://goo.gl/WZIItv to check the accuracy of your forecasts. 
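# A small note before forecasting: because lambda_usmelec was supplied when fitting the
# model, forecast() below back-transforms the point forecasts and prediction intervals
# to the original scale automatically, so no manual InvBoxCox() step is needed.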
669 | fc_usmelec_arima.0.1.2.0.1.1.12 <- forecast( 670 | usmelec_arima.0.1.2.0.1.1.12, 671 | h = 12*15 672 | ) 673 | 674 | # Get the latest figures. 675 | usmelec.new <- read.csv("MER_T07_02A.csv", sep = ",") 676 | 677 | # need to do data munging before using the data. 678 | # make new columns Year, Month using YYYYMM column. 679 | usmelec.new[, "Year"] <- as.numeric( 680 | substr(usmelec.new[, "YYYYMM"], 1, 4) 681 | ) 682 | usmelec.new[, "Month"] <- as.numeric( 683 | substr(usmelec.new[, "YYYYMM"], 5, 6) 684 | ) 685 | # make usmelec.new only have Year, Month and Value columns with net generation total data. 686 | usmelec.new <- subset( 687 | usmelec.new, 688 | Description == "Electricity Net Generation Total, All Sectors", 689 | select = c("Year", "Month", "Value") 690 | ) 691 | # remove data if month is 13. They are old yearly data. 692 | usmelec.new <- subset(usmelec.new, Month != 13) 693 | # change the Value column data type to number. And divide the numbers by 1000 because the unit of the values in usmelec.new are Million KWh, not Billion KWh. 694 | usmelec.new[, "Value"] <- as.numeric( 695 | as.character(usmelec.new[, "Value"]) 696 | )/1000 697 | # as.numeric(usmelec.new[, "Value"]) yields wrong data. Need to recognize the letters as character first, and then change the type as number. 698 | head(usmelec.new) 699 | tail(usmelec.new) 700 | # first observation was taken in January, 1973. Final observation was taken in October, 2017. 701 | 702 | # make ts time series using usmelec.new Value column data. 703 | usmelec.new.ts <- ts( 704 | usmelec.new[, "Value"], 705 | start = c(1973, 1), 706 | frequency = 12 707 | ) 708 | tail(usmelec.new.ts) 709 | # final observation was taken in October, 2017 as expected. 710 | 711 | # get accuracy for 4 years of forecast horizon. 712 | usmelec.new.ts_next4years <- subset( 713 | usmelec.new.ts, 714 | start = length(usmelec) + 1, 715 | end = length(usmelec) + 12*4 716 | ) 717 | accuracy( 718 | fc_usmelec_arima.0.1.2.0.1.1.12, 719 | usmelec.new.ts_next4years 720 | ) 721 | 722 | # plot the results 723 | autoplot(fc_usmelec_arima.0.1.2.0.1.1.12, series = "Forecasts") + 724 | autolayer(usmelec.new.ts, series = "Real data") + 725 | scale_x_continuous(limits = c(2010, 2030)) + 726 | ggtitle("Forecast from ARIMA(0,1,2)(0,1,1)[12] with real data") 727 | # Real data are really similar to the forecasts. Even when they were different, real data didn't get out of the prediction interval. 728 | 729 | # g. How many years of forecasts do you think are sufficiently accurate to be usable? 730 | # In usmelec data case, even 4 years of forecasts were sufficiently accurate to be usable. I think that it happened because the pattern in the data almost didn't change. 731 | 732 | ``` 733 | 734 | 735 | 12. For the mcopper data: 736 | 737 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question12} 738 | 739 | # a. if necessary, find a suitable Box-Cox transformation for the data; 740 | autoplot(mcopper) 741 | # they are monthly data but there isn't seasonality in them. 742 | autoplot(BoxCox(mcopper, BoxCox.lambda(mcopper))) 743 | # It looked like Box-Cox transformation makes the variations in the data evenly over time. Therefore I'm going to use the transformation. 744 | lambda_mcopper <- BoxCox.lambda(mcopper) 745 | 746 | # b. 
fit a suitable ARIMA model to the transformed data using auto.arima(); 747 | mcopper_autoarima <- auto.arima( 748 | mcopper, 749 | lambda = lambda_mcopper 750 | ) 751 | 752 | mcopper_autoarima 753 | # auto.arima yielded ARIMA(0, 1, 1) with Box-Cox transformation model. AICc was -86.08. 754 | 755 | # c. try some other plausible models by experimenting with the orders chosen; 756 | ndiffs(mcopper) 757 | nsdiffs(mcopper) 758 | # the data need 1 first differencing. 759 | ggtsdisplay(diff(mcopper)) 760 | # It looked like autocorrelation values are sinusoidally decreasing. I'll choose ARIMA model's order as (1, 1, 0) and (5, 1, 0). 761 | 762 | mcopper_arima.1.1.0 <- Arima( 763 | mcopper, order = c(1, 1, 0), lambda = lambda_mcopper 764 | ) 765 | 766 | mcopper_arima.1.1.0 767 | # AICc was -75.64. 768 | 769 | mcopper_arima.5.1.0 <- Arima( 770 | mcopper, order = c(5, 1, 0), lambda = lambda_mcopper 771 | ) 772 | 773 | mcopper_arima.5.1.0 774 | # AICc was -78.48. 775 | 776 | # I'll try auto.arima function without approximation and stepwise options. 777 | mcopper_autoarima2 <- auto.arima( 778 | mcopper, lambda = lambda_mcopper, 779 | approximation = FALSE, stepwise = FALSE 780 | ) 781 | 782 | mcopper_autoarima2 783 | # the result model is the same as when I didn't use the options. 784 | 785 | # d. choose what you think is the best model and check the residual diagnostics; 786 | # When I compared AICc values, I got the smallest when I used auto.arima function. (I could've used AICc in comparing because the differencing was the same for all models I chose.) Best model is ARIMA(0, 1, 1) with Box-Cox transformation. 787 | checkresiduals(mcopper_autoarima) 788 | # The residuals are like white noise. I'll select the model. 789 | 790 | # e. produce forecasts of your fitted model. Do the forecasts look reasonable? 791 | fc_mcopper_autoarima <- forecast( 792 | mcopper_autoarima 793 | ) 794 | 795 | autoplot(fc_mcopper_autoarima) 796 | # The forecasts aren't reasonable. 797 | 798 | # I'll try other models I made. 799 | fc_mcopper_arima.1.1.0 <- forecast( 800 | mcopper_arima.1.1.0 801 | ) 802 | 803 | autoplot(fc_mcopper_arima.1.1.0) 804 | # got almost same result 805 | 806 | fc_mcopper_arima.5.1.0 <- forecast( 807 | mcopper_arima.5.1.0 808 | ) 809 | 810 | autoplot(fc_mcopper_arima.5.1.0) 811 | # got almost same result, too 812 | 813 | # f. compare the results with what you would obtain using ets() (with no transformation). 814 | fc_mcopper_ets <- forecast( 815 | ets(mcopper) 816 | ) 817 | 818 | autoplot(fc_mcopper_ets) 819 | # These forecasts are more reasonable than what I got above. 820 | 821 | ``` 822 | 823 | 824 | 13. Choose one of the following seasonal time series: hsales, auscafe, qauselec, qcement, qgas. 825 | 826 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question13} 827 | 828 | # a. Do the data need transforming? If so, find a suitable transformation. 829 | # I'll analyze and forecast qauselec data. 830 | autoplot(qauselec) 831 | # The data need Box-Cox transformation to make the variations evenly over time. 832 | lambda_qauselec <- BoxCox.lambda(qauselec) 833 | 834 | # b. Are the data stationary? If not, find an appropriate differencing which yields stationary data. 835 | # The data have strong seasonality and increasing trend. It means that the data aren't stationary. 836 | nsdiffs(qauselec) 837 | ndiffs(qauselec) 838 | # The data need 1 seasonal differencing. 839 | kpss.test(diff(qauselec, lag = 4)) 840 | # The data don't need first differencing. But I'll try with first differencing, too. 
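# As a supporting sketch (assuming the same package that provides kpss.test() above is
# loaded), the unit root test can also be run on the Box-Cox transformed series after
# seasonal differencing, and after an additional first difference, to see how much the
# extra difference helps.
kpss.test(diff(BoxCox(qauselec, lambda_qauselec), lag = 4))
kpss.test(diff(diff(BoxCox(qauselec, lambda_qauselec), lag = 4)))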
841 | 842 | # c. Identify a couple of ARIMA models that might be useful in describing the time series. Which of your models is the best according to their AIC values? 843 | ggtsdisplay(diff( 844 | BoxCox(qauselec, lambda_qauselec), lag = 4 845 | )) 846 | ggtsdisplay(diff(diff( 847 | BoxCox(qauselec, lambda_qauselec), lag = 4 848 | ))) 849 | 850 | # suggest models: 851 | # ARIMA(0, 0, 1)(0, 1, 1)[4] 852 | qauselec_arima0.0.1.0.1.1.4 <- Arima( 853 | qauselec, lambda = lambda_qauselec, 854 | order = c(0, 0, 1), seasonal = c(0, 1, 1) 855 | ) 856 | 857 | qauselec_arima0.0.1.0.1.1.4 858 | # AIC = -97.01 859 | 860 | # ARIMA(0, 1, 1)(0, 1, 1)[4] 861 | qauselec_arima0.1.1.0.1.1.4 <- Arima( 862 | qauselec, lambda = lambda_qauselec, 863 | order = c(0, 1, 1), seasonal = c(0, 1, 1) 864 | ) 865 | 866 | qauselec_arima0.1.1.0.1.1.4 867 | #AIC = -292.59 868 | 869 | # ARIMA(0, 1, 1)(0, 1, 2)[4] 870 | qauselec_arima0.1.1.0.1.2.4 <- Arima( 871 | qauselec, lambda = lambda_qauselec, 872 | order = c(0, 1, 1), seasonal = c(0, 1, 2) 873 | ) 874 | 875 | qauselec_arima0.1.1.0.1.2.4 876 | #AIC = -292.73 877 | 878 | # try using auto.arima function 879 | qauselec_autoarima <- auto.arima( 880 | qauselec, lambda = lambda_qauselec 881 | ) 882 | 883 | qauselec_autoarima 884 | # AIC = -300.84 885 | 886 | # According to AIC values, ARIMA(1, 1, 1)(1, 1, 2)[4] with Box-Cox transformation is the best model. But above 2 models and the model made by auto.arima function used different number of differencings. 887 | 888 | # d. Estimate the parameters of your best model and do diagnostic testing on the residuals. Do the residuals resemble white noise? If not, try to find another ARIMA model which fits better. 889 | # phi1 = 0.2523, theta1 = -0.6905, phis1 = 0.8878, thetas1 = -1.6954, thetas2 = -0.7641 890 | checkresiduals(qauselec_autoarima) 891 | # The residuals aren't like white noise. 892 | 893 | # try using other models. 894 | checkresiduals(qauselec_arima0.0.1.0.1.1.4) 895 | checkresiduals(qauselec_arima0.1.1.0.1.1.4) 896 | checkresiduals(qauselec_arima0.1.1.0.1.2.4) 897 | # The residuals don't resemble white noise regardless of the model. Therefore I'm going to use the best model. 898 | 899 | # e. Forecast the next 24 months of data using your preferred model. 900 | fc_qauselec_autoarima <- forecast( 901 | qauselec_autoarima, h = 8 902 | ) 903 | 904 | autoplot(fc_qauselec_autoarima) 905 | # The forecasts are reasonable. 906 | 907 | # f. Compare the forecasts obtained using ets(). 908 | fc_qauselec_ets <- forecast( 909 | ets(qauselec), h = 8 910 | ) 911 | 912 | autoplot(fc_qauselec_ets) 913 | # These forecasts also are reasonable. 914 | 915 | ``` 916 | 917 | 918 | 14. For the same time series you used in the previous exercise, try using a non-seasonal model applied to the seasonally adjusted data obtained from STL. The stlf() function will make the calculations easy (with method="arima"). Compare the forecasts with those obtained in the previous exercise. Which do you think is the best approach? 
919 | 920 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question14} 921 | 922 | fc_qauselec_stlf <- stlf( 923 | qauselec, lambda = BoxCox.lambda(qauselec), 924 | s.window = 5, robust = TRUE, method = "arima", 925 | h = 8 926 | ) 927 | 928 | autoplot(fc_qauselec_stlf) + 929 | scale_x_continuous(limits = c(2005, 2012)) + 930 | scale_y_continuous(limits = c(50, 70)) 931 | 932 | autoplot(fc_qauselec_ets) + 933 | scale_x_continuous(limits = c(2005, 2012)) + 934 | scale_y_continuous(limits = c(50, 70)) 935 | 936 | autoplot(fc_qauselec_autoarima) + 937 | scale_x_continuous(limits = c(2005, 2012)) + 938 | scale_y_continuous(limits = c(50, 70)) 939 | # I don't know which forecasts are best. Forecasts from STL + ARIMA(0, 1, 1) with drift model yielded highest forecasts. It looked like it followed latest trend a lot. ETS(M, A, M) model yielded broadest PI. ARIMA(1, 1, 1)(1, 1, 2)[4] model yielded least varianced forecasts. 940 | # For qauselec data, the choice of model didn't affect forecasts much because they already have strong seasonality and almost non-changing trend. 941 | 942 | ``` 943 | 944 | 15. For your retail time series (Exercise 5 above): 945 | 946 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question15} 947 | 948 | # a. develop an appropriate seasonal ARIMA model; 949 | fc_retail_autoarima <- forecast( 950 | auto.arima(retail.ts), 951 | h = 36 952 | ) 953 | 954 | autoplot(fc_retail_autoarima) 955 | # ARIMA(1, 0, 2)(0, 1, 1)[12] with drift model was chosen. 956 | 957 | # b. compare the forecasts with those you obtained in earlier chapters; 958 | # In chapter 3, I used seasonal naive method to forecast. 959 | fc_retail_snaive <- snaive(retail.ts, h = 36) 960 | 961 | autoplot(fc_retail_snaive) 962 | 963 | # In chapter 7, I thought that Holt-Winters' multiplicative method was best among ets models. 964 | fc_retail_ets <- forecast( 965 | ets(retail.ts, lambda = BoxCox.lambda(retail.ts)), 966 | h = 36 967 | ) 968 | 969 | autoplot(fc_retail_ets) 970 | 971 | # c. Obtain up-to-date retail data from the ABS website (Cat 8501.0, Table 11), and compare your forecasts with the actual numbers. How good were the forecasts from the various models? 972 | # Get the latest figures. 973 | retail.new <- read.xlsx("8501011.xlsx", 974 | sheetName = "Data1", 975 | startRow = 10) 976 | 977 | retail.new.ts <- ts(retail.new[, "A3349873A"], 978 | start = c(1982, 4), 979 | frequency = 12) 980 | 981 | retail.new.test <- subset( 982 | retail.new.ts, 983 | start = length(retail.ts) + 1 984 | ) 985 | 986 | # draw plots for the data in forecast horizon. 987 | autoplot(fc_retail_autoarima$mean, series = "ARIMA") + 988 | geom_line(size = 1, color = "red") + 989 | autolayer(fc_retail_ets$mean, series = "Holt-Winters'", size = 1) + 990 | autolayer(fc_retail_snaive$mean, series = "Seasonal Naive", size = 1) + 991 | autolayer(retail.new.test, series = "Real data", size = 1) + 992 | ggtitle("Turnover of other retailings not elsewhere classified", 993 | subtitle = "- From New South Wales in Australia") + 994 | ylab("Trade amount($ Millions)") + 995 | scale_color_discrete(breaks = c("Real data", "ARIMA", "Holt-Winters'", "Seasonal Naive")) 996 | # plots shows that the forecasts from Holt-Winters' model were the most accurate. 997 | 998 | # get accuracy for each models' forecasts. 999 | accuracy(fc_retail_autoarima, retail.new.test) 1000 | accuracy(fc_retail_ets, retail.new.test) 1001 | accuracy(fc_retail_snaive, retail.new.test) 1002 | # Without ACF1, all errors show that the Holt-Winters' model was best. 
The next one was ARIMA model. Seasonal naive model showed the poorest forecasting accuracy. 1003 | 1004 | ``` 1005 | 1006 | 1007 | 16. 1008 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question16} 1009 | 1010 | # a. Produce a time plot of the sheep population of England and Wales from 1867-1939 (data set sheep). 1011 | autoplot(sheep) 1012 | # There is decreasing trend, but no definite seasonality. 1013 | 1014 | # b. Assume you decide to fit the following model: 1015 | # yt = yt-1 + phi1(yt-1 - yt-2) + phi2(yt-2 - yt-3) + phi3(yt-3 - yt-4) + et 1016 | # where et is a white noise series. What sort of ARIMA model is this (i.e., what are p, d, and q)? 1017 | 1018 | # (yt - yt-1) - phi1(yt-1 - yt-2) - phi2(yt-2 - yt-3) - phi3(yt-3 - yt-4) = et 1019 | # (1 - B)yt - phi1*B(1- B)yt - phi2*B^2(1- B)yt - phi3*B^3(1- B)yt = et 1020 | # (1 - phi1*B - phi2*B^2 - phi3*B^3)(1 - B)yt = et 1021 | # It is ARIMA(3, 1, 0) model. 1022 | 1023 | # c. By examining the ACF and PACF of the differenced data, explain why this model is appropriate. 1024 | ggtsdisplay(diff(sheep)) 1025 | # ACF plot shows sinusoidally decreasing autocorrelation values while PACF plot shows significant spikes at lag 1 to 3, but no beyond lag 3. Therefore ARIMA(3, 1, 0) is appropriate. 1026 | 1027 | # d. The last five values of the series are given below: 1028 | # Year 1935 1936 1937 1938 1939 1029 | # Millions of sheep 1648 1665 1627 1791 1797 1030 | # The estimated parameters are phi1 = 0.42, phi2 = -0.20, and phi3 = -0.30. Without using the forecast function, calculate forecasts for the next three years (1940-1942). 1031 | 1032 | sheep.1940 = 1797 + 0.42*(1797 - 1791) -0.20*(1791 - 1627) - 0.30*(1627 - 1665) 1033 | 1034 | sheep.1941 = sheep.1940 + 0.42*(sheep.1940 - 1797) -0.20*(1797 - 1791) - 0.30*(1791 - 1627) 1035 | 1036 | sheep.1942 = sheep.1941 + 0.42*(sheep.1941 - sheep.1940) -0.20*(sheep.1940 - 1797) - 0.30*(1797 - 1791) 1037 | 1038 | c(sheep.1940, sheep.1941, sheep.1942) 1039 | 1040 | # e. Now fit the model in R and obtain the forecasts using forecast. How are they different from yours? Why? 1041 | fc_sheep_arima.3.1.0 <- forecast( 1042 | Arima(sheep, order = c(3, 1, 0)), 1043 | h = 3 1044 | ) 1045 | 1046 | fc_sheep_arima.3.1.0$mean 1047 | # calculated forecasts were a little bigger than the forecasts from Arima function. And the differences between the forecasts at the same time became bigger as time goes on. I think that it happened because of the differences of the coefficients. 1048 | # Small differences in the coefficients made the difference between the first forecasts. And then the forecast values were used to calculate the next time point's forecasts. When the next time point's forecasts of Arima function were calculated, the difference became bigger. It looked like such situation repeated. 1049 | ar1 <- fc_sheep_arima.3.1.0$model$coef[1] 1050 | ar2 <- fc_sheep_arima.3.1.0$model$coef[2] 1051 | ar3 <- fc_sheep_arima.3.1.0$model$coef[3] 1052 | 1053 | sheep.1940.new = 1797 + ar1*(1797 - 1791) + ar2*(1791 - 1627) + ar3*(1627 - 1665) 1054 | 1055 | sheep.1941.new = sheep.1940.new + ar1*(sheep.1940.new - 1797) + ar2*(1797 - 1791) + ar3*(1791 - 1627) 1056 | 1057 | sheep.1942.new = sheep.1941.new + ar1*(sheep.1941.new - sheep.1940.new) + ar2*(sheep.1940.new - 1797) + ar3*(1797 - 1791) 1058 | 1059 | c(sheep.1940.new, sheep.1941.new, sheep.1942.new) 1060 | # above calculation confirms what I said about the differences. 1061 | 1062 | ``` 1063 | 1064 | 1065 | 17. 
1066 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question17}
1067 | 
1068 | # a. Plot the annual bituminous coal production in the United States from 1920 to 1968 (data set bicoal).
1069 | autoplot(bicoal)
1070 | # There doesn't appear to be any particular trend, and since these are annual data there is no seasonality.
1071 | 
1072 | # b. You decide to fit the following model to the series:
1073 | # yt = c + phi1*yt-1 + phi2*yt-2 + phi3*yt-3 + phi4*yt-4 + et
1074 | # where yt is the coal production in year t and et is a white noise series. What sort of ARIMA model is this (i.e., what are p, d, and q)?
1075 | 
1076 | # (1 - phi1*B - phi2*B^2 - phi3*B^3 - phi4*B^4)*yt = c + et
1077 | # if mu is the mean of yt,
1078 | # c = mu*(1 - phi1 - phi2 - phi3 - phi4)
1079 | # This model is an ARIMA(4, 0, 0), i.e. an AR(4) model.
1080 | 
1081 | # c. Explain why this model was chosen using the ACF and PACF.
1082 | ggAcf(bicoal, lag.max = 36)
1083 | ggPacf(bicoal, lag.max = 36)
1084 | # The ACF plot shows sinusoidally decreasing autocorrelations, while the PACF plot shows significant spikes at lags 1 and 4 but none beyond lag 4. Therefore an AR(4) model is an appropriate choice.
1085 | 
1086 | # d. The last five values of the series are given below.
1087 | # Year              1964 1965 1966 1967 1968
1088 | # Millions of tons   467  512  534  552  545
1089 | # The estimated parameters are c = 162.00, phi1 = 0.83, phi2 = -0.34, phi3 = 0.55 and phi4 = -0.38. Without using the forecast function, calculate forecasts for the next three years (1969-1971).
1090 | c = 162.00
1091 | phi1 = 0.83
1092 | phi2 = -0.34
1093 | phi3 = 0.55
1094 | phi4 = -0.38
1095 | 
1096 | bicoal.1969 <- c + phi1*545 + phi2*552 + phi3*534 + phi4*512
1097 | 
1098 | bicoal.1970 <- c + phi1*bicoal.1969 + phi2*545 + phi3*552 + phi4*534
1099 | 
1100 | bicoal.1971 <- c + phi1*bicoal.1970 + phi2*bicoal.1969 + phi3*545 + phi4*552
1101 | 
1102 | c(bicoal.1969, bicoal.1970, bicoal.1971)
1103 | 
1104 | # e. Now fit the model in R and obtain the forecasts from the same model. How are they different from yours? Why?
1105 | fc_bicoal_ar4 <- forecast(ar(bicoal, aic = FALSE, order.max = 4), h = 3)
1106 | 
1107 | fc_bicoal_ar4$mean
1108 | # The forecasts from the ar function were a little bigger than the hand-calculated forecasts. As in the previous exercise, this happens because the estimated coefficients differ slightly from the rounded values given in the question.
1109 | phi1 <- fc_bicoal_ar4$model$ar[1]
1110 | phi2 <- fc_bicoal_ar4$model$ar[2]
1111 | phi3 <- fc_bicoal_ar4$model$ar[3]
1112 | phi4 <- fc_bicoal_ar4$model$ar[4]
1113 | c <- fc_bicoal_ar4$model$x.mean*(1 - phi1 - phi2 - phi3 - phi4)
1114 | 
1115 | bicoal.1969.new <- c + phi1*545 + phi2*552 + phi3*534 + phi4*512
1116 | 
1117 | bicoal.1970.new <- c + phi1*bicoal.1969.new + phi2*545 + phi3*552 + phi4*534
1118 | 
1119 | bicoal.1971.new <- c + phi1*bicoal.1970.new + phi2*bicoal.1969.new + phi3*545 + phi4*552
1120 | 
1121 | c(bicoal.1969.new, bicoal.1970.new, bicoal.1971.new)
1122 | # Repeating the calculation with the estimated coefficients confirms that the small coefficient differences explain the differences in the forecasts.
1123 | 
1124 | ```
1125 | 
1126 | 
1127 | 18.
1128 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question18}
1129 | 
1130 | # a. Install the rdatamarket package in R using
1131 | # install.packages("rdatamarket")
1132 | 
1133 | # b. Select a time series from http://datamarket.com/data/list/?q=pricing:free. Then copy its short URL and import the data using
1134 | # x <- ts(rdatamarket::dmseries("shorturl")[,1], start=??, frequency=??)
1135 | # (Replace ?? with the appropriate values.)
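# Note (an assumption about the setup chunk): dmseries() is called below without the
# rdatamarket:: prefix, so the rdatamarket package needs to be attached, e.g. via
# library(rdatamarket), if it is not already loaded at the top of this file.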
1136 | oil.NA <- ts(dmseries("http://data.is/1qk5Uvf")[, "Total.North.America"], start=1965, frequency=1) 1137 | 1138 | str(oil.NA) 1139 | head(oil.NA) 1140 | 1141 | # c. Plot graphs of the data, and try to identify an appropriate ARIMA model. 1142 | autoplot(oil.NA) 1143 | ndiffs(oil.NA) 1144 | ggtsdisplay(diff(oil.NA)) 1145 | # I think that ARIMA(1, 1, 0) model will fit well to the data. 1146 | 1147 | # d. Do residual diagnostic checking of your ARIMA model. Are the residuals white noise? 1148 | oil.NA_arima.1.1.0 <- Arima( 1149 | oil.NA, order = c(1, 1, 0) 1150 | ) 1151 | 1152 | checkresiduals(oil.NA_arima.1.1.0) 1153 | # The residuals are like white noise. 1154 | 1155 | # e. Use your chosen ARIMA model to forecast the next four years. 1156 | fc_oil.NA_arima.1.1.0 <- forecast( 1157 | oil.NA_arima.1.1.0, h = 4 1158 | ) 1159 | 1160 | autoplot(fc_oil.NA_arima.1.1.0) 1161 | # The forecast values of oil production are increasing. But the increase speed is damping. 1162 | 1163 | # f. Now try to identify an appropriate ETS model. 1164 | oil.NA_ets <- ets(oil.NA) 1165 | 1166 | oil.NA_ets 1167 | # chosen model is ETS(A, A, N). 1168 | 1169 | # g. Do residual diagnostic checking of your ETS model. Are the residuals white noise? 1170 | checkresiduals(oil.NA_ets) 1171 | # The residuals are like white noise. 1172 | 1173 | # h. Use your chosen ETS model to forecast the next four years. 1174 | fc_oil.NA_ets <- forecast( 1175 | oil.NA_ets, h = 4 1176 | ) 1177 | 1178 | autoplot(fc_oil.NA_ets) 1179 | # The forecast values of oil production are increasing. And the increase speed doesn't almost change. 1180 | 1181 | # i. Which of the two models do you prefer? 1182 | # I prefer ARIMA model. Because I think that it is unlikely that the oil production amounts increase a lot for more than 10 years in a row. It is likely that the increase of the production amounts will be decreased. 1183 | # But the real production amount was about 931 Million Ton in 2015. The forecasts from ETS model were more similar to the real production amounts for about 3 years of forecast horizon. 1184 | # http://tools.bp.com/energy-charting-tool.aspx?_ga=2.211706901.1002175336.1518264329-1800202796.1518264329#/st/oil/dt/production/unit/KBD/region/NOA/SCA/EU/MIE/AFR/AP/view/area/ 1185 | 1186 | ``` 1187 | 1188 | 1189 | ### Question 1 isn't related with coding that I didn't include in this file. -------------------------------------------------------------------------------- /Chapter9.rmd: -------------------------------------------------------------------------------- 1 | # Chapter 9 2 | 3 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_packages} 4 | 5 | library(fpp2) 6 | library(xlsx) 7 | 8 | ``` 9 | 10 | 1. Consider monthly sales and advertising data for an automotive parts company (data set advert). 11 | 12 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question1} 13 | 14 | # a. Plot the data using autoplot. Why is it useful to set facets=TRUE? 15 | str(advert) 16 | head(advert) 17 | 18 | autoplot(advert, facets = TRUE) 19 | # Can see the advertising expenditure data and sales volume data in different panels. facets = TRUE option can plot the subsets of data in each panel. 20 | 21 | # b. Fit a standard regression model yt = a + b*xt + nt where yt denotes sales and xt denotes advertising using the tslm() function. 22 | advert_tslm <- tslm(sales ~ advert, data = advert) 23 | 24 | # c. Show that the residuals have significant autocorrelation. 
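# As an explicit sketch of the same check, the residual ACF can be plotted and a
# Ljung-Box test run directly (the lag choice of 8 here is just illustrative);
# checkresiduals() below reports similar diagnostics in one call, using a
# Breusch-Godfrey test because the model is a regression.
ggAcf(residuals(advert_tslm))
Box.test(residuals(advert_tslm), lag = 8, type = "Ljung-Box")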
25 | checkresiduals(advert_tslm) 26 | # The residuals have significant autocorrelations at lag 1 and 2. 27 | 28 | # d. What difference does it make you use the function instead: 29 | # Arima(advert[,"sales"], xreg=advert[,"advert"], order=c(0,0,0)) 30 | advert_dreg.0.0.0 <- Arima( 31 | advert[, "sales"], xreg = advert[, "advert"], 32 | order = c(0, 0, 0) 33 | ) 34 | 35 | checkresiduals(advert_dreg.0.0.0) 36 | advert_tslm$residuals - advert_dreg.0.0.0$residuals 37 | # The residuals from dynamic regression model are almost same as the residuals from tslm function. 38 | # But when I use Arima function, I can do ARIMA modeling for residuals by designating order. 39 | 40 | # e. Refit the model using auto.arima(). How much difference does the error model make to the estimated parameters? What ARIMA model for the errors is selected? 41 | advert_dreg.auto <- auto.arima( 42 | advert[, "sales"], xreg = advert[, "advert"] 43 | ) 44 | 45 | advert_dreg.0.0.0 46 | # error model coefficients: 47 | # intercept : 78.7343, slope_advert : 0.5343 48 | 49 | advert_dreg.auto 50 | # error model : ARIMA(0, 1, 0) 51 | # xreg : 0.5063 52 | 53 | # f. Check the residuals of the fitted model. 54 | checkresiduals(advert_dreg.auto) 55 | # The residuals are like white noise. 56 | 57 | autoplot(advert[, "sales"], series = "Data") + 58 | geom_line(color = "red", size = 1) + 59 | autolayer(advert_dreg.auto$fitted, size = 1, series = "Dynamic Regression fitted values") + 60 | autolayer(advert_tslm$fitted.values, size = 1, series = "Linear Regression fitted values") + 61 | ylab("Sales volume") 62 | 63 | accuracy(advert_dreg.0.0.0) 64 | accuracy(advert_dreg.auto) 65 | # The plot above and most of errors show that dynamic regression with ARIMA(0, 1, 0) error model was better than the linear regression model. 66 | 67 | # g. Assuming the advertising budget for the next six months is exactly 10 units per month, produce and plot sales forecasts with prediction intervals for the next six months. 68 | fc_advert_dreg.auto <- forecast( 69 | advert_dreg.auto, h = 6, 70 | xreg = rep(10, 6) 71 | ) 72 | 73 | autoplot(fc_advert_dreg.auto) 74 | # The forecasts are like the result of naive method. 75 | 76 | ``` 77 | 78 | 79 | 2. This exercise uses data set huron giving the level of Lake Huron from 1875-1972. 80 | 81 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question2} 82 | 83 | # a. Fit a piecewise linear trend model to the Lake Huron data with a knot at 1920 and an ARMA error structure. 84 | autoplot(huron) 85 | 86 | t <- time(huron) 87 | t.knot <- 1920 88 | # make t.pw variable which is a vector in which contains piecewise times. Values in t.pw are 0 for all years before knot year(1920). But the values increase as year increases after the knot year. 89 | t.pw <- ts(pmax(0, t - t.knot), start = t[1]) 90 | huron_xreg <- cbind(t = t, t.pw = t.pw) 91 | huron_dreg.auto <- auto.arima( 92 | huron, xreg = huron_xreg 93 | ) 94 | 95 | huron_dreg.auto 96 | # Regression with AR(2) errors model. 97 | 98 | autoplot(huron) + 99 | autolayer(huron_dreg.auto$fitted) 100 | 101 | # b. Forecast the level for the next 30 years. 102 | h <- 30 103 | t.new <- t[length(t)] + seq(h) 104 | t.pw.new <- t.pw[length(t.pw)] + seq(h) 105 | newdata <- cbind(t = t.new, t.pw = t.pw.new) 106 | 107 | fc_huron_dreg.auto <- forecast( 108 | huron_dreg.auto, xreg = newdata, h = 30 109 | ) 110 | 111 | autoplot(fc_huron_dreg.auto) 112 | # The level of lake Huron was forecasted to increase slowly after 1973. 
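# As a small supplementary sketch, the estimated trend slopes can be read off the
# regression coefficients of the fitted model: the coefficient of t is the slope of the
# level before the 1920 knot, and the slope after 1920 is the sum of the coefficients
# of t and t.pw.
coef(huron_dreg.auto)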
113 | 114 | checkresiduals(fc_huron_dreg.auto) 115 | # The residuals are like white noise. 116 | 117 | ``` 118 | 119 | 120 | 3. This exercise concerns motel: the total monthly takings from accommodation and the total room nights occupied at hotels, motels, and guest houses in Victoria, Australia, between January 1980 and June 1995. Total monthly takings are in thousands of Australian dollars; total room nights occupied are in thousands. 121 | 122 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question3} 123 | 124 | # a. Use the data to calculate the average cost of a night's accommodation in Victoria each month. 125 | autoplot(motel, facets = TRUE) 126 | 127 | avg.cost_night.room <- motel[, "Takings"] / motel[, "Roomnights"] 128 | autoplot(avg.cost_night.room) 129 | # Average cost of a night's accomodation in Victoria increased fastly until 1990, and then just oscillated without increasing until 1995. 130 | 131 | # b. Estimate the monthly CPI. 132 | # I'll use ARIMA model to estimate monthly CPI. 133 | # I'll use log transformation before fitting the model using lambda = 0 option. 134 | CPI_autoarima <- auto.arima( 135 | motel[, "CPI"], lambda = 0 136 | ) 137 | 138 | autoplot(motel[, "CPI"]) + 139 | autolayer(CPI_autoarima$fitted) 140 | # Fitted values of ARIMA model show the estimates of monthly CPI. 141 | 142 | # c. Produce time series plots of both variables and explain why logarithms of both variables need to be taken before fitting any models. 143 | autoplot(avg.cost_night.room) 144 | autoplot(CPI_autoarima$fitted) 145 | # logarithms can make the variations almost same for all the time. It can also make slowly increasing data linearly. Therefore it would be better for the above 2 variables to take logarithms before fitting any model. 146 | 147 | # d. Fit an appropriate regression model with ARIMA errors. Explain your reasoning in arriving at the final model. 148 | # fit avg.cost_night.room using linear regression model after log transformation. I will use fitted values of CPI_autoarima as a regressor. 149 | avg.cost_night.room_tslm <- tslm( 150 | avg.cost_night.room ~ CPI_autoarima$fitted, 151 | lambda = 0 152 | ) 153 | 154 | checkresiduals(avg.cost_night.room_tslm) 155 | # The residuals aren't like white noise. ARIMA model can be useful to explain the autocorrelations in the residuals. Therefore I'm going to fit with dynamic regression model. 156 | # as I said in part c, it would be better to use logarithm transformation before fitting the model. 157 | 158 | avg.cost_night.room_dreg.auto <- auto.arima( 159 | avg.cost_night.room, xreg = CPI_autoarima$fitted, 160 | lambda = 0, stepwise = FALSE, approximation = FALSE 161 | ) 162 | 163 | checkresiduals(avg.cost_night.room_dreg.auto) 164 | # The residuals are like white noise. 165 | 166 | # The unit of average cost is thousand of AU$. Therefore make a function to transform average cost value to have AU$ unit. I'm going to use the function in plot labeling. 
167 | formatter1000 <- function(x){ 168 | scales::dollar(x*1000) 169 | } 170 | 171 | autoplot(avg.cost_night.room, series = "Data") + 172 | autolayer(avg.cost_night.room_dreg.auto$fitted, series = "Dynamic regression model") + 173 | autolayer(avg.cost_night.room_tslm$fitted.values, series = "Linear regression model") + 174 | ylab("Avg.cost(AU$)") + 175 | ggtitle("Average cost of a night's accomodation", 176 | subtitle = "In Victoria, Australia") + 177 | theme(plot.subtitle = element_text(size=13)) + 178 | scale_y_continuous(labels = formatter1000) 179 | # dynamic regression model after log transformation fits better than the linear regression model. 180 | 181 | # e. Forecast the average price per room for the next twelve months using your fitted model. (Hint: You will need to produce forecasts of the CPI figures first.) 182 | fc_CPI_autoarima <- forecast( 183 | CPI_autoarima, h = 12 184 | ) 185 | 186 | fc_avg.cost_night.room_dreg.auto <- forecast( 187 | avg.cost_night.room_dreg.auto, 188 | xreg = fc_CPI_autoarima$mean, 189 | h = 12 190 | ) 191 | 192 | autoplot(fc_avg.cost_night.room_dreg.auto) 193 | # It is forecasted that the average cost of a night's accomodation in Victoria, Australia will be increased a little with oscillation. 194 | 195 | ``` 196 | 197 | 198 | 4. We fitted a harmonic regression model to part of the gasoline series in Exercise 6 in Section 5.10. We will now revisit this model, and extend it to include more data and ARMA errors. 199 | 200 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question4} 201 | 202 | # a. Using tslm, fit a harmonic regression with a piecewise linear time trend to the full gasoline series. Select the position of the knots in the trend and the appropriate number of Fourier terms to include by minimizing the AICc or CV value. 203 | str(gasoline) 204 | head(gasoline) 205 | 206 | autoplot(gasoline) 207 | # It looked like this data can be divided into 3 parts. First part is increase from the 1991(start) to June, 2007. Second part is decrease from July, 2007 to the end of 2012. Third part is increase from 2013 to 2017(end). 208 | # the data are weekly data. Therefore I'll set first knot as 2007.5 and second knot as 2013. 209 | t <- time(gasoline) 210 | t.knot1 <- 2007.5 211 | t.knot2 <- 2013 212 | t.pw1 <- ts(pmax(0, t - t.knot1), start = t[1], 213 | frequency = 365.25/7) 214 | t.pw2 <- ts(pmax(0, t - t.knot2), start = t[1], 215 | frequency = 365.25/7) 216 | 217 | # Set default AICc as infinite and default number of pairs as 0. 218 | AICc <- Inf 219 | K_min.Aicc <- 0 220 | 221 | # use for-loop to find the harmonic regression model which yields least AICc. Maximum number of repetition is 26 times because the maximum number of pairs should be less than a half of the number of weeks in a year, 52.18. 222 | for(num in c(1:26)){ 223 | gasoline_tslm <- tslm( 224 | gasoline ~ trend + t.pw1 + t.pw2 + fourier( 225 | gasoline, K = num 226 | ) 227 | ) 228 | AICc_value <- CV(gasoline_tslm)["AICc"] 229 | 230 | if(AICc > AICc_value){ 231 | AICc <- AICc_value 232 | }else{ 233 | K_min.Aicc <- num 234 | break 235 | } 236 | } 237 | 238 | K_min.Aicc 239 | # 11 Fourier pairs were chosen. 240 | gasoline_tslm 241 | # 26 variables including intercept were used. 242 | 243 | autoplot(gasoline) + 244 | geom_line(color = "gray") + 245 | autolayer(gasoline_tslm$fitted.values) 246 | # fitted values are somewhat similar to the data. 247 | 248 | # b. Now refit the model using auto.arima to allow for correlated errors, keeping the same predictor variables as you used with tslm. 
249 | 250 | # gasoline_autoarima <- auto.arima( 251 | # gasoline, xreg = cbind(t=t, t.pw1=t.pw1, t.pw2=t.pw2, Fourier = fourier(gasoline, K = 11))) 252 | # It takes lots of time to find the optimal model. Therefore I'm going to make ARIMA(4, 0, 2)(1, 0, 0)[52] error model designating the order in the function. 253 | gasoline_autoarima <- Arima( 254 | gasoline, xreg = cbind(t=t, t.pw1=t.pw1, t.pw2=t.pw2, Fourier = fourier(gasoline, K = 11)), 255 | order = c(4, 0, 2), seasonal = c(1, 0, 0) 256 | ) 257 | 258 | gasoline_autoarima 259 | # ARIMA(4, 0, 2)(1, 0, 0)[52] error model was chosen. 260 | 261 | # c. Check the residuals of the final model using the checkresiduals() function. Do they look sufficiently like white noise to continue? If not, try modifying your model, or removing the first few years of data. 262 | checkresiduals(gasoline_autoarima) 263 | # The residuals aren't like white noise. 264 | 265 | # I'll make dynamic regression model using data from 2000. 266 | gasoline.from2000 <- window(gasoline, start = 2000) 267 | t.from2000 <- window(t, start = 2000) 268 | t.pw1.from2000 <- window(t.pw1, start = 2000) 269 | t.pw2.from2000 <- window(t.pw2, start = 2000) 270 | 271 | # find the number of Fourier pairs for new data. 272 | AICc <- Inf 273 | K_min.Aicc <- 0 274 | 275 | for(num in c(1:26)){ 276 | gasoline.from2000_tslm <- tslm( 277 | gasoline.from2000 ~ trend + t.pw1.from2000 + t.pw2.from2000 + fourier( 278 | gasoline.from2000, K = num 279 | ) 280 | ) 281 | AICc_value <- CV(gasoline.from2000_tslm)["AICc"] 282 | 283 | if(AICc > AICc_value){ 284 | AICc <- AICc_value 285 | }else{ 286 | K_min.Aicc <- num 287 | break 288 | } 289 | } 290 | 291 | K_min.Aicc 292 | # still 11 Fourier pairs were chosen. 293 | gasoline.from2000_tslm 294 | # again, 26 variables including intercept were used. 295 | 296 | # bind new data regressors to a variable. 297 | xreg.from2000 <- cbind( 298 | t = t.from2000, 299 | t.pw1 = t.pw1.from2000, 300 | t.pw2 = t.pw2.from2000, 301 | Fourier = fourier( 302 | gasoline.from2000, K = 11 303 | ) 304 | ) 305 | 306 | # It also takes some minutes to run. 307 | gasoline.from2000_autoarima <- auto.arima( 308 | gasoline.from2000, 309 | xreg = xreg.from2000 310 | ) 311 | 312 | gasoline.from2000_autoarima 313 | # ARIMA(1, 0, 1)(1, 0, 1)[52] error model was chosen. 314 | 315 | checkresiduals(gasoline.from2000_autoarima) 316 | # The residuals aren't still like white noise. 317 | ggtsdisplay(gasoline.from2000_autoarima$residuals) 318 | # I think that if I use more autoregressive terms, I can make the residuals like white noise even if the likelihood will be worse. 319 | 320 | # The result when I tried ARIMA(6, 0, 1) error model. 321 | gasoline.from2000_arima.6.0.1 <- Arima( 322 | gasoline.from2000, 323 | xreg = xreg.from2000, 324 | order = c(6, 0, 1) 325 | ) 326 | 327 | checkresiduals(gasoline.from2000_arima.6.0.1) 328 | # now residuals are like white noise. 329 | # I finally got white-noise-like residuals by using fewer data and by modifying the model bearing worse likelihood. 330 | 331 | # d. Once you have a model with white noise residuals, produce forecasts for the next year. 
332 | h = 52 333 | t.new <- t.from2000[length(t.from2000)] + seq(h)/365.25 334 | t.pw1.new <- t.pw1.from2000[length(t.pw1.from2000)] + seq(h)/365.25 335 | t.pw2.new <- t.pw2.from2000[length(t.pw2.from2000)] + seq(h)/365.25 336 | 337 | xreg.new <- cbind( 338 | t = t.new, 339 | t.pw1 = t.pw1.new, 340 | t.pw2 = t.pw2.new, 341 | Fourier = fourier( 342 | gasoline.from2000, K = 11, h = h 343 | ) 344 | ) 345 | 346 | fc_gasoline.from2000_arima.6.0.1 <- forecast( 347 | gasoline.from2000_arima.6.0.1, 348 | xreg = xreg.new, 349 | h = h 350 | ) 351 | 352 | autoplot(fc_gasoline.from2000_arima.6.0.1) 353 | # It looked like the forecasts are reasonable. 354 | 355 | ``` 356 | 357 | 358 | 6. For the retail time series considered in earlier chapters: 359 | ```{r echo=FALSE, message=FALSE, warning=FALSE, Question6} 360 | 361 | # a. Develop an appropriate dynamic regression model with Fourier terms for the seasonality. Use the AIC to select the number of Fourier terms to include in the model. (You will probably need to use the same Box-Cox transformation you identified previously.) 362 | retail <- read.xlsx("retail.xlsx", 363 | sheetIndex = 1, 364 | startRow = 2) 365 | retail.ts <- ts(retail[,"A3349873A"], 366 | frequency=12, 367 | start=c(1982,4)) 368 | 369 | autoplot(retail.ts) 370 | 371 | # lambda of Box-Cox transformation 372 | lambda_retail <- BoxCox.lambda(retail.ts) 373 | 374 | # select the number of Fourier pairs. 375 | min.AIC <- Inf 376 | K_min.Aic <- 0 377 | 378 | for(num in c(1:6)){ 379 | retail.ts_tslm <- tslm( 380 | retail.ts ~ trend + fourier(retail.ts, K = num), 381 | lambda = lambda_retail 382 | ) 383 | 384 | AIC <- CV(retail.ts_tslm)["AIC"] 385 | 386 | if(AIC < min.AIC){ 387 | min.AIC <- AIC 388 | K_min.Aic <- num 389 | } 390 | } 391 | 392 | # make harmonic regression model using the selected number of Fourier pairs. 393 | retail.ts_tslm <- tslm( 394 | retail.ts ~ trend + fourier(retail.ts, K = K_min.Aic), 395 | lambda = lambda_retail 396 | ) 397 | 398 | autoplot(retail.ts) + 399 | autolayer(retail.ts_tslm$fitted.values) 400 | # seasonal patterns look similar. 401 | 402 | # Fit dynamic regression model. 403 | retail.ts_autoarima <- auto.arima( 404 | retail.ts, 405 | lambda = lambda_retail, 406 | xreg = cbind( 407 | Fourier = fourier(retail.ts, K = K_min.Aic), 408 | time = time(retail.ts) 409 | ) 410 | ) 411 | 412 | retail.ts_autoarima 413 | # ARIMA(0, 1, 0) error model was chosen. 414 | 415 | autoplot(retail.ts) + 416 | autolayer(retail.ts_autoarima$fitted) 417 | # The residuals of the last 2 peak points were huge. 418 | 419 | # b. Check the residuals of the fitted model. Does the residual series look like white noise? 420 | checkresiduals(retail.ts_autoarima) 421 | # The residuals aren't like white noise. 422 | 423 | retail.ts_autoarima2 <- auto.arima( 424 | retail.ts, 425 | lambda = lambda_retail, 426 | xreg = cbind( 427 | Fourier = fourier(retail.ts, K = K_min.Aic), 428 | time = time(retail.ts) 429 | ), 430 | approximation = FALSE, 431 | stepwise = FALSE 432 | ) 433 | 434 | retail.ts_autoarima2 435 | # ARIMA(0, 1, 2)(2, 0, 1)[12] error model was chosen. 436 | 437 | autoplot(retail.ts) + 438 | autolayer(retail.ts_autoarima2$fitted) 439 | # The residuals of the last 2 peak points became smaller. 440 | 441 | checkresiduals(retail.ts_autoarima2) 442 | # But the residuals aren't still like white noise. But I'll use this model in forecasting because these residuals are more like white noise than the above model's residuals. 443 | 444 | # c. 
Compare the forecasts with those you obtained earlier using alternative models. 445 | # Get the latest figures. 446 | retail.new <- read.xlsx("8501011.xlsx", 447 | sheetName = "Data1", 448 | startRow = 10) 449 | 450 | retail.new.ts <- ts(retail.new[, "A3349873A"], 451 | start = c(1982, 4), 452 | frequency = 12) 453 | 454 | retail.new.test <- subset( 455 | retail.new.ts, 456 | start = length(retail.ts) + 1 457 | ) 458 | 459 | # make a variable which takes future values of regressors. 460 | t <- time(retail.ts) 461 | xreg.new = cbind( 462 | Fourier = fourier(retail.ts, K = K_min.Aic, h = 36), 463 | time = t[length(t)] + seq(36)/12 464 | ) 465 | 466 | # forecast. 467 | fc_retail.ts_autoarima2 <- forecast( 468 | retail.ts_autoarima2, 469 | h = 36, 470 | xreg = xreg.new 471 | ) 472 | 473 | autoplot(fc_retail.ts_autoarima2) 474 | 475 | accuracy(fc_retail.ts_autoarima2, retail.new.test) 476 | # It was worse than Holt-Winters' model, but it was better than ARIMA or seasonal naive model. 477 | 478 | ``` 479 | 480 | 481 | ### Question 5 isn't related with coding that I didn't include it in here. -------------------------------------------------------------------------------- /readme.rmd: -------------------------------------------------------------------------------- 1 | # Forecasting using R language practices 2 | 3 | ## Introduction 4 | 5 | - I made this repository to solve the forecasting questions in 'Forecasting: Principles and Practice(2nd Ed.)'. The book was written by Rob J Hyndman and George Athanasopoulos. You can read online version of the book in [here](https://otexts.org/fpp2/). 6 | 7 | ### Caution 8 | 9 | - The codes what I wrote are just my solutions, not related with official solution. They are the result of my practice, not the authors' solutions. I did my best to solve the questions as accurately as possible, but there can be wrong answers. 10 | 11 | - Some questions ask you to download data and forecast using them. You need to download the data for the questions. 12 | 13 | - I didn't solve the questions which aren't related with coding. I mentioned them at the bottom of each file. 14 | 15 | - I didn't solve the questions of Chapter 1 and 4. 16 | 17 | ## Prerequisites 18 | 19 | - You need to understand about the concepts and algorithms behind the forecast models and evaluation methods. You can study them in [here](https://otexts.org/fpp2/). 20 | 21 | - All codes are written in rmd(r markdown) files. Therefore you need to install R and Rstudio to open the files 22 | 23 | - If you already installed the Rstudio, you need to check if fpp2 package is already installed. It is the most important package that I used in forecasting. If you need to install it, type below code in the console. 24 | 25 | install.packages("fpp2") 26 | 27 | You also need to install other packages like xlsx, rdatamarket, hts, seasonal, tseries, vars, etc. 28 | 29 | ## Built With 30 | 31 | 1) readme.rmd 32 | - Introduce about this repository. 33 | 34 | 2) Chapter2.rmd 35 | - 'Time series graphics' 36 | - learn how to make ts object and deal with it using several functions like autoplot(), frequency(), etc. 37 | - learn how to draw several types of plots like seasonal plot, seasonal subseries plot, scatter plots and lag plot, etc. 38 | - Understand what is autocorrelation and learn how to draw autocorrelation function(ACF) plot. 39 | - Understand what is white noise and learn how to discern white noise series using ACF plots. 
40 | 
41 | 3) Chapter3.rmd
42 | - 'The forecaster's toolbox'
43 | - learn how to do simple forecasting using 4 benchmark methods.
44 | - learn transformation methods like the Box-Cox transformation and how to apply them to data.
45 | - understand the concepts of fitted values and residuals, and learn about Portmanteau tests, which are used to check whether the residuals resemble a white noise series.
46 | - learn how to evaluate forecast accuracy.
47 | - learn how to get prediction intervals in different situations.
48 | 
49 | 4) Chapter5.rmd
50 | - 'Linear regression models'
51 | 
52 | 5) Chapter6.rmd
53 | - 'Time series decomposition'
54 | 
55 | 6) Chapter7.rmd
56 | - 'Exponential smoothing'
57 | 
58 | 7) Chapter8.rmd
59 | - 'ARIMA models'
60 | 
61 | 8) Chapter9.rmd
62 | - 'Dynamic regression models'
63 | 
64 | 9) Chapter10.rmd
65 | - 'Forecasting hierarchical or grouped time series'
66 | 
67 | 10) Chapter11.rmd
68 | - 'Advanced forecasting methods'
69 | 
70 | 11) Chapter12.rmd
71 | - 'Some practical forecasting issues'
72 | - Even though there aren't any exercises in this chapter, there are some examples that are worth practicing, so I deal with them here.
73 | 
74 | ## Acknowledgments and other useful resources
75 | 
76 | - Forecasting: Principles and Practice online [textbook](https://otexts.org/fpp2/)
77 | 
78 | - R forecast package: https://github.com/robjhyndman/forecast
79 | 
80 | - Professor Rob J Hyndman's [blog](https://robjhyndman.com/)
--------------------------------------------------------------------------------