├── exercises ├── OpenGeoHub-machine-learning-training-1.R ├── OpenGeoHub-machine-learning-training-1.Rnw ├── OpenGeoHub-machine-learning-training-1.pdf ├── OpenGeoHub-machine-learning-training-2.R ├── OpenGeoHub-machine-learning-training-2.Rnw └── OpenGeoHub-machine-learning-training-2.pdf └── presentation ├── OpenGeoHub-machine-learning-lecture-1-handout.pdf ├── OpenGeoHub-machine-learning-lecture-2-handout.pdf └── OpenGeoHub-machine-learning-lecture-2-teaser_overview-soil-mapping-study.pdf /exercises/OpenGeoHub-machine-learning-training-1.R: -------------------------------------------------------------------------------- 1 | ## ----general-options,echo=FALSE----------------------------------------- 2 | 3 | library(knitr) 4 | # output code, but no warnings 5 | opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE,cache=TRUE) 6 | # auto check dependencies (of cached chunks, its an approximation only) 7 | opts_chunk$set(autodep = TRUE) 8 | # dep_auto() # print dependencies 9 | 10 | 11 | 12 | ## ----load-packages,message=FALSE---------------------------------------- 13 | # install.packages(c("grpgreg", "glmnet", "kernlab", "caret", "randomForest", "mboost", 14 | # "gbm", "geoGAM", "raster")) 15 | library(grpreg) # for grouped lasso 16 | library(glmnet) # for general lasso 17 | library(kernlab) # for support vector machines 18 | library(caret) # for model tuning 19 | library(randomForest) # to fit random forest 20 | library(mboost) # for the boosting models with linear and spline terms 21 | library(gbm) # for the boosting model with trees 22 | library(geoGAM) # for the berne dataset 23 | library(raster) # for plotting as a raster 24 | library(parallel) # for parallel computing 25 | 26 | 27 | ## ----read-in-data------------------------------------------------------- 28 | data(berne) 29 | dim(berne) 30 | # Continuous response 31 | d.ph10 <- berne[berne$dataset == "calibration" & !is.na(berne$ph.0.10), ] 32 | d.ph10 <- d.ph10[complete.cases(d.ph10[13:ncol(d.ph10)]), ] 33 | # select validation data for subsequent validation 34 | d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ] 35 | d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[13:ncol(d.ph10)]), ] 36 | # Binary response 37 | d.wlog100 <- berne[berne$dataset=="calibration"&!is.na(berne$waterlog.100), ] 38 | d.wlog100 <- d.wlog100[complete.cases(d.wlog100[13:ncol(d.wlog100)]), ] 39 | # Ordered/multinomial tesponse 40 | d.drain <- berne[berne$dataset == "calibration" & !is.na(berne$dclass), ] 41 | d.drain <- d.drain[complete.cases(d.drain[13:ncol(d.drain)]), ] 42 | # covariates start at col 13 43 | l.covar <- names(d.ph10[, 13:ncol(d.ph10)]) 44 | 45 | 46 | ## ----apply-example------------------------------------------------------ 47 | # loop 48 | # first create a vector to save the results 49 | t.result <- c() 50 | for( ii in 1:10 ){ t.result <- c(t.result, ii^2) } 51 | # the same as apply 52 | t.result <- sapply(1:10, function(ii){ ii^2 }) 53 | # of course, this example is even shorter using: 54 | t.result <- (1:10)^2 55 | 56 | 57 | ## ----lasso-continuous-response,cache=TRUE------------------------------- 58 | 59 | # define groups: dummy coding of a factor is treated as group 60 | # find factors 61 | l.factors <- names(d.ph10[l.covar])[ 62 | t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ] 63 | l.numeric <- names(t.f[ !t.f ]) 64 | 65 | # create a vector that labels the groups with the same number 66 | g.groups <- c( 1:length(l.numeric), 67 | unlist( 68 | sapply(1:length(l.factors), function(n){ 69 | 
rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1) 70 | }) 71 | ) 72 | ) 73 | # grpreg needs model matrix as input 74 | XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1] 75 | 76 | # cross validation (CV) to find lambda 77 | ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10, 78 | group = g.groups, 79 | penalty = "grLasso", 80 | returnY = T) # access CV results 81 | 82 | 83 | ## ----lasso-predictions-------------------------------------------------- 84 | # choose optimal lambda: CV minimum error + 1 SE (see glmnet) 85 | l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ] 86 | idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1 87 | 88 | # create model matrix for validation set 89 | newXX <- model.matrix( ~., d.ph10.val[, c(l.factors, l.numeric), F])[,-1] 90 | t.pred.val <- predict(ph.cvfit, X = newXX, 91 | type = "response", 92 | lambda = ph.cvfit$lambda[idx.se]) 93 | # get CV predictions, e.g. to compute R2 94 | ph.lasso.cv.pred <- ph.cvfit$Y[,idx.se] 95 | 96 | 97 | ## ----lasso-get-model---------------------------------------------------- 98 | # get the non-zero coefficients: 99 | t.coef <- ph.cvfit$fit$beta[, idx.se ] 100 | t.coef[ t.coef > 0 ] 101 | 102 | 103 | ## ----lasso-plot-cv,echo=FALSE,fig.width=7,fig.height=4.5, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Cross validation error plotted against the tuning parameter lambda. The dashed line indicates lambda at minimal error, the dotted darkgrey line is the optimal lambda with minimal error + 1 SE."---- 104 | 105 | plot(ph.cvfit) 106 | abline( h = l.se, col = "grey", lty = "dotted") 107 | abline( v = log( ph.cvfit$lambda[ idx.se ]), col = "grey30", lty = "dotted") 108 | 109 | 110 | ## ----lasso-multinomial-response,cache = TRUE---------------------------- 111 | 112 | # create model matrix for drainage classes 113 | # use a subset of covariates only, because model optimization for 114 | # multinomial takes long otherwise 115 | 116 | set.seed(42) # makes sample() reproducible 117 | XX <- model.matrix(~.,d.drain[, l.covar[sample(1:length(l.covar), 30)]])[,-1] 118 | 119 | drain.cvfit <- cv.glmnet( XX, d.drain$dclass, nfold = 10, 120 | keep = T, # access CV results 121 | family = "multinomial", 122 | type.multinomial = "grouped") 123 | 124 | 125 | ## ----lasso-multinomial-response-coeffs,cache=TRUE----------------------- 126 | 127 | drain.fit <- glmnet( XX, d.drain$dclass, 128 | family = "multinomial", 129 | type.multinomial = "grouped", 130 | lambda = drain.cvfit$lambda.min) 131 | # The coeffs are here: 132 | # drain.fit$beta$well 133 | # drain.fit$beta$moderate 134 | # drain.fit$beta$poor 135 | 136 | 137 | ## ----svm,cache=TRUE----------------------------------------------------- 138 | 139 | # We have to set up the design matrix ourselfs 140 | # (without intercept, hence remove first column) 141 | XX <- model.matrix( ~., d.ph10[, c(l.covar), F])[,-1] 142 | 143 | # set seed for random numbers to split cross-valiation sets 144 | set.seed(31) 145 | # Setup for 10fold cross-validation 146 | ctrl <- trainControl(method="cv", 147 | number=10, 148 | savePredictions = "final") 149 | 150 | # 1. pass of training - find region of C and lambda 151 | svm.tune1 <- train(x = XX, 152 | y = d.ph10[, "ph.0.10"], 153 | method = "svmRadial", # radial kernel function 154 | tuneLength = 9, # check 9 values of the cost function 155 | preProc = c("center","scale"), # center and scale data 156 | trControl=ctrl) 157 | 158 | # 2. 
pass of training - find best value for C and lambda 159 | # setup a tuning grid with values around the result of the first pass 160 | sig <- svm.tune1$bestTune$sigma 161 | t.sigma <- sort( unique( round(abs( c(sig, sig + seq(0, sig*2, by = sig/1), 162 | sig - seq(0, sig*2, by = sig/1)) ), 6))) 163 | tune.grid <- expand.grid( 164 | sigma = t.sigma[t.sigma>0], # sigma must be positive 165 | C = sort( unique( abs( c(svm.tune1$bestTune$C, 166 | svm.tune1$bestTune$C - seq(0, 0.3, by = 0.1), 167 | svm.tune1$bestTune$C + seq(0, 0.3, by = 0.1) )) )) 168 | ) 169 | #Train and Tune the SVM 170 | svm.model <- train(x = XX, 171 | y = d.ph10[, "ph.0.10"], 172 | method = "svmRadial", 173 | preProc = c("center","scale"), 174 | tuneGrid = tune.grid, 175 | trControl = ctrl) 176 | 177 | # -> if this takes too long: take a short cut with 178 | # svm.model <- svm.tune1 179 | 180 | 181 | 182 | ## ----svm-validation-plots,fig.width=10,fig.height=5, fig.align='center', out.width='0.85\\textwidth',fig.cap = "Predictions from cross-validation (left) and the validation dataset (right) plotted against the observed values (dashed: 1:1-line, green: lowess scatterplott smoother)."---- 183 | # create validation plots with lowess scatterplot smoothers 184 | # for cross-validation 185 | par(mfrow = c(1,2)) 186 | plot(svm.model$pred$pred, svm.model$pred$obs, 187 | xlab = "cross-validation predictions", 188 | ylab = "observed", 189 | asp = 1) 190 | abline(0,1, lty = "dashed", col = "grey") 191 | lines(lowess(svm.model$pred$pred, svm.model$pred$obs), col = "darkgreen", lwd = 2) 192 | 193 | # for independent validation set 194 | # calculate predictions for the validation set 195 | newXX <- model.matrix( ~., d.ph10.val[, l.covar, F])[,-1] 196 | t.pred.val <- predict.train(svm.model, newdata = newXX) 197 | plot(t.pred.val, d.ph10.val[, "ph.0.10"], 198 | xlab = "predictions on validation set", 199 | ylab = "observed", 200 | asp = 1) 201 | abline(0,1, lty = "dashed", col = "grey") 202 | lines(lowess(t.pred.val, d.ph10.val[, "ph.0.10"]), col = "darkgreen", lwd = 2) 203 | 204 | 205 | ## ----random-forest,cache=TRUE------------------------------------------- 206 | 207 | # Fit a random forest with default parameters 208 | # (often results are already quite good) 209 | set.seed(1) 210 | rf.model.basic <- randomForest(x = d.ph10[, l.covar ], 211 | y = d.ph10[, "ph.0.10"]) 212 | 213 | # tune main tuning parameter "mtry" 214 | # (the number of covariates that are randomly selected to try at each split) 215 | 216 | # define function to use below 217 | f.tune.randomforest <- function(test.mtry, # mtry to test 218 | d.cal, # calibration data 219 | l.covariates ){ # list of covariates 220 | # set seed 221 | set.seed(1) 222 | # fit random forest with mtry = test.mtry 223 | rf.tune <- randomForest(x = d.cal[, l.covariates ], 224 | y = d.cal[, "ph.0.10"], 225 | mtry = test.mtry) 226 | # return the mean squared error (mse) of this model fit 227 | return( tail(rf.tune$mse, n=1) ) 228 | } 229 | 230 | # vector of mtry to test 231 | seq.mtry <- c(1:(length(l.covar) - 1)) 232 | # Only take every fifth for speed reasons 233 | seq.mtry <- seq.mtry[ seq.mtry %% 5 == 0 ] 234 | 235 | # Apply function to sequence. 
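# Note (added): mclapply() from the "parallel" package is a parallel version of
# lapply(). It relies on forking, which is not available on Windows, so keep
# mc.cores = 1 there; on Linux/macOS you can set it to the number of available
# cores, e.g. parallel::detectCores().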
236 | t.OOBe <- mclapply(seq.mtry, # give sequence 237 | FUN = f.tune.randomforest, # give function name 238 | mc.cores = 1, ## number of CPUs 239 | mc.set.seed = FALSE, # do not use new seed each time 240 | # now here giv the arguments to the function: 241 | d.cal = d.ph10, 242 | l.covar = l.covar ) 243 | 244 | # Hint: Who is not comfortable with "mclapply" 245 | # the same could be achieved with 246 | # for(test.mtry in 1:m.end){ 247 | # .. content of function + vector to collect result... } 248 | 249 | # create a dataframe of the results 250 | mtry.oob <- data.frame(mtry.n = seq.mtry, mtry.OOBe = unlist(t.OOBe)) 251 | 252 | # get the mtry with the minimum MSE 253 | s.mtry <- mtry.oob$mtry.n[ which.min(mtry.oob$mtry.OOBe) ] 254 | 255 | # compute random forest with optimal mtry 256 | set.seed(1) 257 | rf.model.tuned <- randomForest(x = d.ph10[, l.covar ], 258 | y = d.ph10[, "ph.0.10"], 259 | mtry = s.mtry) 260 | 261 | 262 | ## ----random-forest-plot-mtry,fig.width=6,fig.height=4.7, fig.align='center',fig.pos='!h',out.width='0.6\\textwidth',fig.cap = "Tuning parameter mtry plotted against the out-of-bag mean squared error (grey line: lowess smoothing line, dashed line: mtry at minimum MSE)."---- 263 | plot( mtry.oob$mtry.n, mtry.oob$mtry.OOBe, pch = 4, 264 | ylab = "out-of-bag MSE error", xlab = "mtry") 265 | abline(v = s.mtry, lty = "dashed", col = "darkgrey") 266 | lines( lowess( mtry.oob$mtry.n, mtry.oob$mtry.OOBe ), lwd = 1.5, col = "darkgrey") 267 | 268 | 269 | ## ----boosted-trees-tuning,cache=TRUE------------------------------------ 270 | 271 | # create a grid of the tuning parameters to be tested, 272 | # main tuning parameters are: 273 | gbm.grid <- expand.grid( 274 | # how many splits does each tree have 275 | interaction.depth = c(2,5,10,15,20), 276 | # how many trees do we add (number of iterations of boosting algorithm) 277 | n.trees = seq(2,250, by = 5), 278 | # put the shrinkage factor to 0.1 (=10% updates as used 279 | # in package mboost), the default (0.1%) is a bit too small, 280 | # makes model selection too slow. 281 | # minimum number of observations per node can be left as is 282 | shrinkage = 0.1, n.minobsinnode = 10) 283 | 284 | # make tuning reproducible (there are random samples for the cross validation) 285 | set.seed(291201945) 286 | 287 | # train the gbm model 288 | # Remove "ge_caco3" throws an error since Package gbm 2.1.5, 289 | # this bug is reported: https://github.com/gbm-developers/gbm/issues/40 290 | gbm.model <- train(x=d.ph10[, l.covar[-c(50)] ], 291 | y=d.ph10[, "ph.0.10"], 292 | method = "gbm", # choose "generalized boosted regression model" 293 | tuneGrid = gbm.grid, 294 | verbose = FALSE, 295 | trControl = trainControl( 296 | # use 10fold cross validation (CV) 297 | method = "cv", number = 10, 298 | # save fitted values (e.g. to calculate RMSE of the CV) 299 | savePredictions = "final")) 300 | 301 | # print optimal tuning parameter 302 | gbm.model$bestTune 303 | 304 | 305 | ## ----boosted-trees-map,fig.width=5,fig.height=5, fig.align='center', out.width='0.7\\textwidth',fig.cap = "Predictions computed with an optimized boosted trees model of topsoil pH (0--10 cm) for a very small part of the Berne study region (white areas are streets, developped areas or forests, CRAN does not accept larger datasets)."---- 306 | 307 | # compute predictions for the small part of the study area 308 | # (agricultural land, the empty pixels are streets, forests etc.) 
309 | data("berne.grid") 310 | 311 | berne.grid$pred <- predict.train(gbm.model, newdata = berne.grid ) 312 | 313 | # create a spatial object for a proper spatial plot 314 | coordinates(berne.grid) <- ~x+y 315 | # add the Swiss projection (see ?berne.grid) 316 | # see https://epsg.io for details on projections 317 | proj4string(berne.grid) <- CRS("+init=epsg:21781") 318 | # create a raster object from the spatial point dataframe 319 | gridded(berne.grid) <- TRUE 320 | plot(raster(berne.grid, layer = "pred")) 321 | 322 | 323 | 324 | ## ----boosted-trees-partial-dependencies,fig.pos="h",fig.width=7,fig.height=7, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Partial dependence plots of boosted trees model for the four most important covariates."---- 325 | 326 | # get variable importance 327 | t.imp <- varImp(gbm.model$finalModel) 328 | 329 | # check how many covariates were never selected 330 | sum( t.imp$Overall == 0 ) 331 | 332 | # order and select 4 most important covariates 333 | t.names <- dimnames(t.imp)[[1]][ order(t.imp$Overall, decreasing = T)[1:4] ] 334 | 335 | par(mfrow = c(2,2)) 336 | for( name in t.names ){ 337 | # select index of covariate 338 | ix <- which( gbm.model$finalModel$var.names == name ) 339 | plot(gbm.model$finalModel, i.var = ix) 340 | } 341 | 342 | # -> improve the plots by using the same y-axis (e.g. ylim=c(..,..)) 343 | # for all of them, and try to add labels (xlab = , ylab = ) 344 | # or a title (main = ) 345 | 346 | 347 | 348 | ## ----glmboost,cache=TRUE------------------------------------------------ 349 | # Fit model 350 | ph.glmboost <- glmboost(ph.0.10 ~., data = d.ph10[ c("ph.0.10", l.covar)], 351 | control = boost_control(mstop = 200), 352 | center = TRUE) 353 | 354 | # Find tuning parameter: mstop = number of boosting itertations 355 | set.seed(42) 356 | ph.glmboost.cv <- cvrisk(ph.glmboost, 357 | folds = mboost::cv(model.weights(ph.glmboost), 358 | type = "kfold")) 359 | 360 | # print optimal mstop 361 | mstop(ph.glmboost.cv) 362 | 363 | ## print model with fitted coefficents 364 | # ph.glmboost[ mstop(ph.glmboost.cv)] 365 | 366 | 367 | ## ----glmboost-plot,fig.width=7,fig.height=5, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Path of cross validation error along the boosting iterations.", echo = FALSE---- 368 | plot(ph.glmboost.cv) 369 | 370 | 371 | ## ----gamboost,cache=TRUE,message=FALSE---------------------------------- 372 | 373 | # quick set up formula 374 | 375 | # Response 376 | f.resp <- "ph.0.10 ~ " 377 | 378 | # Intercept, add to dataframe 379 | f.int <- "bols(int, intercept = F, df = 1)" 380 | d.ph10$int <- rep(1, nrow(d.ph10)) 381 | 382 | # Smooth spatial surface (needs > 4 degrees of freedom) 383 | f.spat <- "bspatial(x, y, df = 5, knots = 12)" 384 | 385 | # Linear baselearners for factors, maybe use df = 5 386 | f.fact <- paste( 387 | paste( "bols(", l.factors, ", intercept = F)" ), 388 | collapse = "+" 389 | ) 390 | 391 | # Splines baselearners for continuous covariates 392 | f.num <- paste( 393 | paste( "bbs(", l.numeric, ", center = T, df = 5)" ), 394 | collapse = "+" 395 | ) 396 | 397 | # create complete formula 398 | ph.form <- as.formula( paste( f.resp, 399 | paste( c(f.int, f.num, f.spat, f.fact), 400 | collapse = "+")) ) 401 | # fit the boosting model 402 | ph.gamboost <- gamboost(ph.form, data = d.ph10, 403 | control = boost_control(mstop = 200)) 404 | 405 | # Find tuning parameter 406 | ph.gamboost.cv <- cvrisk(ph.gamboost, 407 | folds = mboost::cv(model.weights(ph.gamboost), 408 | type = "kfold")) 409 
| 410 | 411 | ## ----gamboost-results--------------------------------------------------- 412 | # print optimal mstop 413 | mstop(ph.gamboost.cv) 414 | 415 | ## print model info 416 | ph.gamboost[ mstop(ph.glmboost.cv)] 417 | ## print number of chosen baselearners 418 | length( t.sel <- summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob ) 419 | 420 | # Most often selected were: 421 | summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob[1:5] 422 | 423 | 424 | ## ----gamboost-partial-plots,echo=FALSE,fig.width=7,fig.height=6, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Residual plots of the 4 covariates with highest selection frequency."---- 425 | par(mfrow=c(2,2) ) 426 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = names(t.sel[1:4]) ) 427 | 428 | 429 | ## ----gamboost-partial-plots-spatial,echo=FALSE,fig.width=7,fig.height=5, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Modelled smooth spatial surface based on the coordinates."---- 430 | par(mfrow=c(1,1) ) 431 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = grep("bspat", names(t.sel), value = T) ) 432 | 433 | 434 | ## ----session-info,results='asis'---------------------------------------- 435 | toLatex(sessionInfo(), locale = FALSE) 436 | 437 | 438 | ## ----export-r-code,echo=FALSE,result="hide"----------------------------- 439 | # purl("OpenGeoHub-machine-learning-training-1.Rnw") 440 | 441 | -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-1.Rnw: -------------------------------------------------------------------------------- 1 | % !TeX spellcheck = en_US 2 | %#------------------------------------------------------------------------------ 3 | %# Name: OpenGeoHub-machine-learning-training.Rnw 4 | %# (knitr document: R + Latex) 5 | %# 6 | %# Inhalt: Exercises for OpenGeoHub Summer School, 1. 
Series 7 | %# 8 | %# Autorin: Madlene Nussbaum, BFH-HAFL 9 | %# Datum: August 2018 10 | %# Licence: GNU General Public License 11 | %#------------------------------------------------------------------------------ 12 | 13 | \documentclass[11pt,a4paper,twoside]{article} 14 | 15 | \usepackage[utf8]{inputenc} 16 | \usepackage{blindtext} % for blind text 17 | \usepackage{hyperref} % links in table of contents 18 | \usepackage[english]{babel} 19 | \usepackage{amsthm} % for renvironment 20 | \usepackage{natbib} 21 | \usepackage[iso,german]{isodate} 22 | \usepackage{subcaption} 23 | 24 | \newtheorem{rexample}{R Example}[section] 25 | 26 | % Some colors for the links 27 | \definecolor{darkblue}{rgb}{0,0,0.5} 28 | \definecolor{darkmagenta}{rgb}{0.5,0,0.5} 29 | \definecolor{darkgreen}{rgb}{0,0.4,0} 30 | \definecolor{darkred}{rgb}{0.7,0,0} 31 | 32 | \hypersetup{ 33 | draft=false, 34 | colorlinks=true,linkcolor=darkblue,citecolor=darkred,urlcolor=darkgreen, 35 | breaklinks=true, bookmarksnumbered=true 36 | } 37 | 38 | % % Headers 39 | \usepackage{fancyhdr} 40 | \pagestyle{fancy} 41 | \fancyhf{} 42 | \fancyhead[OL]{\leftmark}% odd page, left 43 | \fancyhead[OR]{\thepage}% odd page, right 44 | \fancyhead[EL]{\thepage}% even page, left 45 | \fancyhead[ER]{\leftmark}% even page, right 46 | \renewcommand{\headrulewidth}{0.4pt} 47 | \fancyheadoffset[R]{1pt} 48 | 49 | % captions in bold 50 | \usepackage[font = {small}, labelfont = bf]{caption} 51 | 52 | % format page borders 53 | \usepackage[a4paper]{geometry} 54 | \geometry{verbose,tmargin=2.5cm,bmargin=2.3cm,lmargin=2.5cm,rmargin=3cm,headheight=1.7cm,headsep=0.9cm,footskip=1.5cm} 55 | 56 | % no indents 57 | \setlength\parindent{0pt} 58 | \setlength{\parskip}{3pt} 59 | 60 | 61 | % Top aling logos on title page 62 | \def\imagebox#1#2{\vtop to #1{\null\hbox{#2}\vfill}} 63 | 64 | \newcommand{\bskip}{\vspace{0.7cm}} 65 | 66 | \begin{document} 67 | 68 | % % Logos 69 | \begin{figure} 70 | \centering 71 | \begin{subfigure}[t]{0.27\textwidth} 72 | % \includegraphics[width=\textwidth]{BFH-Logo.pdf} 73 | \imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/BFH-Logo.pdf}} 74 | \end{subfigure} 75 | \hfill 76 | \begin{subfigure}[t]{0.3\textwidth} 77 | % \includegraphics[width=\textwidth]{3455_original_isric.png} 78 | \imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/logo-opengeohub.png}} 79 | \end{subfigure} 80 | \end{figure} 81 | 82 | % Title 83 | \vspace{5cm} 84 | {\LARGE\textsf{OpenGeoHub Summer School}} 85 | 86 | \vspace{0.7cm} 87 | {\Large\textbf{\textsf{Mastering Machine Learning for Spatial Prediction I} }} 88 | 89 | \vspace{0.3cm} 90 | 91 | {\Large\textsf{Practical training} } 92 | 93 | \vspace{0.5cm} 94 | \textsf{Madlene Nussbaum, 4/5 September 2019} 95 | 96 | { \small \textsf{\copyright~ CC-BY-NC-SA } } 97 | \bigskip 98 | 99 | 100 | % Table of contents (with empty back page() 101 | \setlength{\parskip}{0pt} 102 | \tableofcontents 103 | \thispagestyle{empty} 104 | \setlength{\parskip}{4pt} 105 | 106 | % \newpage 107 | % \mbox{} 108 | % \thispagestyle{empty} 109 | % \newpage 110 | 111 | % -------- 112 | 113 | 114 | <>= 115 | 116 | library(knitr) 117 | # output code, but no warnings 118 | opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE,cache=TRUE) 119 | # auto check dependencies (of cached chunks, its an approximation only) 120 | opts_chunk$set(autodep = TRUE) 121 | # dep_auto() # print dependencies 122 | 123 | @ 124 | 125 | 126 | 127 | \section*{Preparation} 128 | 129 | Load needed packages: 130 | 131 | <>= 132 | # 
install.packages(c("grpgreg", "glmnet", "kernlab", "caret", "randomForest", "mboost", 133 | # "gbm", "geoGAM", "raster")) 134 | library(grpreg) # for grouped lasso 135 | library(glmnet) # for general lasso 136 | library(kernlab) # for support vector machines 137 | library(caret) # for model tuning 138 | library(randomForest) # to fit random forest 139 | library(mboost) # for the boosting models with linear and spline terms 140 | library(gbm) # for the boosting model with trees 141 | library(geoGAM) # for the berne dataset 142 | library(raster) # for plotting as a raster 143 | library(parallel) # for parallel computing 144 | @ 145 | 146 | As an example you can work with the Berne soil mapping study area: dataset \texttt{berne} in R package \texttt{geoGAM}, contains continuous, binary and a multinomial/ordered response and a spatial data \texttt{berne.grid} for prediction. 147 | 148 | Feel free to work with your own data! 149 | 150 | Hint: The processing of this code is quite time consuming on a laptop. Normaly, one uses high performance computing facilities for machine learning. 151 | 152 | Load the data, select the calibration set and remove missing values in covariates: 153 | 154 | <>= 155 | data(berne) 156 | dim(berne) 157 | # Continuous response 158 | d.ph10 <- berne[berne$dataset == "calibration" & !is.na(berne$ph.0.10), ] 159 | d.ph10 <- d.ph10[complete.cases(d.ph10[13:ncol(d.ph10)]), ] 160 | # select validation data for subsequent validation 161 | d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ] 162 | d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[13:ncol(d.ph10)]), ] 163 | # Binary response 164 | d.wlog100 <- berne[berne$dataset=="calibration"&!is.na(berne$waterlog.100), ] 165 | d.wlog100 <- d.wlog100[complete.cases(d.wlog100[13:ncol(d.wlog100)]), ] 166 | # Ordered/multinomial tesponse 167 | d.drain <- berne[berne$dataset == "calibration" & !is.na(berne$dclass), ] 168 | d.drain <- d.drain[complete.cases(d.drain[13:ncol(d.drain)]), ] 169 | # covariates start at col 13 170 | l.covar <- names(d.ph10[, 13:ncol(d.ph10)]) 171 | @ 172 | 173 | 174 | 175 | \section{Lasso -- linear shrinkage method} 176 | \markboth{Lasso -- linear shrinkage method}{Lasso -- linear shrinkage method} 177 | 178 | \paragraph{Lasso for continuous response} \mbox{} \nolinebreak 179 | 180 | The \texttt{berne} dataset contains categorical covariates (factors, e.g. geological map with different substrate classes). The group lasso (R package \texttt{grpreg}) ensures that all dummy covariates of one factor are excluded (coefficients set to 0) together or remain in the model as a group. 181 | 182 | The main tuning parameter $\lambda$ is selected by cross validation. $\lambda$ determines the degree of shrinkage that is applied to the coefficients. 183 | 184 | 185 | \bigskip 186 | 187 | HINT for R newbies: the \texttt{apply}-functions in R are replacements for loops (\texttt{sapply}: loop over a sequence of numbers, \texttt{lapply}: loop over a list). Compared to \texttt{for}, an \texttt{apply} is much faster and general coding style, though a bit more tricky to program. 
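A minimal illustration of the difference: \texttt{lapply} always returns a list, while \texttt{sapply} simplifies the result to a vector where possible.

<<apply-lapply-example>>=
# lapply returns a list with one element per input value ...
t.result.list <- lapply(1:10, function(ii){ ii^2 })
# ... sapply simplifies the same result to a numeric vector
t.result.vec <- sapply(1:10, function(ii){ ii^2 })
identical(unlist(t.result.list), t.result.vec)
@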
188 | 189 | Example, how to replace a \texttt{for} by a \texttt{sapply}: 190 | <>= 191 | # loop 192 | # first create a vector to save the results 193 | t.result <- c() 194 | for( ii in 1:10 ){ t.result <- c(t.result, ii^2) } 195 | # the same as apply 196 | t.result <- sapply(1:10, function(ii){ ii^2 }) 197 | # of course, this example is even shorter using: 198 | t.result <- (1:10)^2 199 | @ 200 | 201 | 202 | Now we create the setup using \texttt{apply} and fit the grouped lasso: 203 | <>= 204 | 205 | # define groups: dummy coding of a factor is treated as group 206 | # find factors 207 | l.factors <- names(d.ph10[l.covar])[ 208 | t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ] 209 | l.numeric <- names(t.f[ !t.f ]) 210 | 211 | # create a vector that labels the groups with the same number 212 | g.groups <- c( 1:length(l.numeric), 213 | unlist( 214 | sapply(1:length(l.factors), function(n){ 215 | rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1) 216 | }) 217 | ) 218 | ) 219 | # grpreg needs model matrix as input 220 | XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1] 221 | 222 | # cross validation (CV) to find lambda 223 | ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10, 224 | group = g.groups, 225 | penalty = "grLasso", 226 | returnY = T) # access CV results 227 | @ 228 | 229 | 230 | Compute predictions for the validation set with optimal number of groups chosen by lasso: 231 | <>= 232 | # choose optimal lambda: CV minimum error + 1 SE (see glmnet) 233 | l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ] 234 | idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1 235 | 236 | # create model matrix for validation set 237 | newXX <- model.matrix( ~., d.ph10.val[, c(l.factors, l.numeric), F])[,-1] 238 | t.pred.val <- predict(ph.cvfit, X = newXX, 239 | type = "response", 240 | lambda = ph.cvfit$lambda[idx.se]) 241 | # get CV predictions, e.g. to compute R2 242 | ph.lasso.cv.pred <- ph.cvfit$Y[,idx.se] 243 | @ 244 | 245 | Get the lasso (non-zero) coefficients of the optimal model: 246 | <>= 247 | # get the non-zero coefficients: 248 | t.coef <- ph.cvfit$fit$beta[, idx.se ] 249 | t.coef[ t.coef > 0 ] 250 | @ 251 | 252 | 253 | 254 | <>= 255 | 256 | plot(ph.cvfit) 257 | abline( h = l.se, col = "grey", lty = "dotted") 258 | abline( v = log( ph.cvfit$lambda[ idx.se ]), col = "grey30", lty = "dotted") 259 | @ 260 | 261 | 262 | \paragraph{Lasso for multinomial response} \mbox{} \nolinebreak 263 | 264 | I am not aware of a lasso implementation for multinomial responses 265 | that can handle groups of factors. Therefore, we use ``standard'' lasso from R package \texttt{glmnet} (the option \texttt{type.multinomial = "grouped"} does only ensure all coefficients of the multinomial model for the same covariate are treated as groups). 266 | 267 | 268 | <>= 269 | 270 | # create model matrix for drainage classes 271 | # use a subset of covariates only, because model optimization for 272 | # multinomial takes long otherwise 273 | 274 | set.seed(42) # makes sample() reproducible 275 | XX <- model.matrix(~.,d.drain[, l.covar[sample(1:length(l.covar), 30)]])[,-1] 276 | 277 | drain.cvfit <- cv.glmnet( XX, d.drain$dclass, nfold = 10, 278 | keep = T, # access CV results 279 | family = "multinomial", 280 | type.multinomial = "grouped") 281 | @ 282 | 283 | For getting the coefficients of the final model you run the \texttt{glmnet} function again with the selected $\lambda$. 284 | Please note: The multinomial fit results in a coefficient for each covariate and response level. 
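In addition to the refit shown in the next chunk, class predictions for new sites can be obtained directly from the cross-validated object \texttt{drain.cvfit}. The following sketch is not evaluated: \texttt{newXX.val} is a placeholder for a model matrix that you would first have to build from the validation data, using exactly the same sampled covariate subset (and factor levels) as in \texttt{XX} above.

<<lasso-multinomial-class-predictions,eval=FALSE>>=
# newXX.val: model matrix of the validation sites (same 30 covariates as XX)
t.pred.class <- predict(drain.cvfit, newx = newXX.val,
                        s = "lambda.min", # or "lambda.1se"
                        type = "class")
table(t.pred.class)
@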
285 | 286 | <>= 287 | 288 | drain.fit <- glmnet( XX, d.drain$dclass, 289 | family = "multinomial", 290 | type.multinomial = "grouped", 291 | lambda = drain.cvfit$lambda.min) 292 | # The coeffs are here: 293 | # drain.fit$beta$well 294 | # drain.fit$beta$moderate 295 | # drain.fit$beta$poor 296 | @ 297 | 298 | 299 | \paragraph{Please continue:} 300 | 301 | \begin{itemize} 302 | \item Select the lasso for a binary response (e.g. presence/absence of waterlogging \texttt{waterlog.100}). Use \texttt{family = "binomial"} in \texttt{cv.grpreg} and make sure your response is coded as 0/1. 303 | \item For the multinomial lasso fit of drainage class: compute predictions for the validation set (\texttt{predict} with \texttt{s="lambda.1se"} or \texttt{s="lambda.min"}). Then, evaluate prediction accuracy by e.g. using Pierce Skill Score, see function \texttt{verify} or \texttt{multi.cont} in R package \texttt{verification}. 304 | % t.pred.val <- predict(drain.cvfit, newx = newXX, s="lambda.min", type = "class") 305 | \end{itemize} 306 | 307 | 308 | \clearpage 309 | \section{Support vector machines} 310 | 311 | 312 | We use support vector machines (SVM) for regression from the package \texttt{kernlab} with radial kernel basis functions that fit local relations in feature space. The tuning parameter \texttt{C} defines the flexibility of the SVM to allow for wrongly predicted data points and the parameter $\sigma$ the degree of non-linearity of the radial kernel. Here we apply a two step approach to find optimal tuning parameters. \texttt{C} and $\sigma$ of a first pass are used as starting points to find optimal parameters around the first estimates. 313 | 314 | We tune the SVM with \texttt{caret} with a cross-validation. \texttt{caret} is a meta package that provides a homogenous interface to about 80 machine learning methods. 315 | 316 | <>= 317 | 318 | # We have to set up the design matrix ourselfs 319 | # (without intercept, hence remove first column) 320 | XX <- model.matrix( ~., d.ph10[, c(l.covar), F])[,-1] 321 | 322 | # set seed for random numbers to split cross-valiation sets 323 | set.seed(31) 324 | # Setup for 10fold cross-validation 325 | ctrl <- trainControl(method="cv", 326 | number=10, 327 | savePredictions = "final") 328 | 329 | # 1. pass of training - find region of C and lambda 330 | svm.tune1 <- train(x = XX, 331 | y = d.ph10[, "ph.0.10"], 332 | method = "svmRadial", # radial kernel function 333 | tuneLength = 9, # check 9 values of the cost function 334 | preProc = c("center","scale"), # center and scale data 335 | trControl=ctrl) 336 | 337 | # 2. 
pass of training - find best value for C and lambda 338 | # setup a tuning grid with values around the result of the first pass 339 | sig <- svm.tune1$bestTune$sigma 340 | t.sigma <- sort( unique( round(abs( c(sig, sig + seq(0, sig*2, by = sig/1), 341 | sig - seq(0, sig*2, by = sig/1)) ), 6))) 342 | tune.grid <- expand.grid( 343 | sigma = t.sigma[t.sigma>0], # sigma must be positive 344 | C = sort( unique( abs( c(svm.tune1$bestTune$C, 345 | svm.tune1$bestTune$C - seq(0, 0.3, by = 0.1), 346 | svm.tune1$bestTune$C + seq(0, 0.3, by = 0.1) )) )) 347 | ) 348 | #Train and Tune the SVM 349 | svm.model <- train(x = XX, 350 | y = d.ph10[, "ph.0.10"], 351 | method = "svmRadial", 352 | preProc = c("center","scale"), 353 | tuneGrid = tune.grid, 354 | trControl = ctrl) 355 | 356 | # -> if this takes too long: take a short cut with 357 | # svm.model <- svm.tune1 358 | 359 | @ 360 | 361 | 362 | <>= 363 | # create validation plots with lowess scatterplot smoothers 364 | # for cross-validation 365 | par(mfrow = c(1,2)) 366 | plot(svm.model$pred$pred, svm.model$pred$obs, 367 | xlab = "cross-validation predictions", 368 | ylab = "observed", 369 | asp = 1) 370 | abline(0,1, lty = "dashed", col = "grey") 371 | lines(lowess(svm.model$pred$pred, svm.model$pred$obs), col = "darkgreen", lwd = 2) 372 | 373 | # for independent validation set 374 | # calculate predictions for the validation set 375 | newXX <- model.matrix( ~., d.ph10.val[, l.covar, F])[,-1] 376 | t.pred.val <- predict.train(svm.model, newdata = newXX) 377 | plot(t.pred.val, d.ph10.val[, "ph.0.10"], 378 | xlab = "predictions on validation set", 379 | ylab = "observed", 380 | asp = 1) 381 | abline(0,1, lty = "dashed", col = "grey") 382 | lines(lowess(t.pred.val, d.ph10.val[, "ph.0.10"]), col = "darkgreen", lwd = 2) 383 | @ 384 | 385 | 386 | \clearpage 387 | \section{Random forest} 388 | 389 | Here we fit a random forest with the package \texttt{randomForest}. If you work with very large datasets consider using the package \texttt{ranger}. More important: use paralell computing as demonstrated here with the function \texttt{mclapply} (does not work on Windows, then use \texttt{mc.cores = 1}). 390 | 391 | This is advanced programming with functions and \texttt{apply}. But, I wanted to show how you can do the tuning of a ML method yourself without using a meta-function like \texttt{train} from package \texttt{caret} (see below). 
Like this you can control it does what it should (and the code is not much longer) 392 | 393 | <>= 394 | 395 | # Fit a random forest with default parameters 396 | # (often results are already quite good) 397 | set.seed(1) 398 | rf.model.basic <- randomForest(x = d.ph10[, l.covar ], 399 | y = d.ph10[, "ph.0.10"]) 400 | 401 | # tune main tuning parameter "mtry" 402 | # (the number of covariates that are randomly selected to try at each split) 403 | 404 | # define function to use below 405 | f.tune.randomforest <- function(test.mtry, # mtry to test 406 | d.cal, # calibration data 407 | l.covariates ){ # list of covariates 408 | # set seed 409 | set.seed(1) 410 | # fit random forest with mtry = test.mtry 411 | rf.tune <- randomForest(x = d.cal[, l.covariates ], 412 | y = d.cal[, "ph.0.10"], 413 | mtry = test.mtry) 414 | # return the mean squared error (mse) of this model fit 415 | return( tail(rf.tune$mse, n=1) ) 416 | } 417 | 418 | # vector of mtry to test 419 | seq.mtry <- c(1:(length(l.covar) - 1)) 420 | # Only take every fifth for speed reasons 421 | seq.mtry <- seq.mtry[ seq.mtry %% 5 == 0 ] 422 | 423 | # Apply function to sequence. 424 | t.OOBe <- mclapply(seq.mtry, # give sequence 425 | FUN = f.tune.randomforest, # give function name 426 | mc.cores = 1, ## number of CPUs 427 | mc.set.seed = FALSE, # do not use new seed each time 428 | # now here giv the arguments to the function: 429 | d.cal = d.ph10, 430 | l.covar = l.covar ) 431 | 432 | # Hint: Who is not comfortable with "mclapply" 433 | # the same could be achieved with 434 | # for(test.mtry in 1:m.end){ 435 | # .. content of function + vector to collect result... } 436 | 437 | # create a dataframe of the results 438 | mtry.oob <- data.frame(mtry.n = seq.mtry, mtry.OOBe = unlist(t.OOBe)) 439 | 440 | # get the mtry with the minimum MSE 441 | s.mtry <- mtry.oob$mtry.n[ which.min(mtry.oob$mtry.OOBe) ] 442 | 443 | # compute random forest with optimal mtry 444 | set.seed(1) 445 | rf.model.tuned <- randomForest(x = d.ph10[, l.covar ], 446 | y = d.ph10[, "ph.0.10"], 447 | mtry = s.mtry) 448 | @ 449 | 450 | 451 | <>= 452 | plot( mtry.oob$mtry.n, mtry.oob$mtry.OOBe, pch = 4, 453 | ylab = "out-of-bag MSE error", xlab = "mtry") 454 | abline(v = s.mtry, lty = "dashed", col = "darkgrey") 455 | lines( lowess( mtry.oob$mtry.n, mtry.oob$mtry.OOBe ), lwd = 1.5, col = "darkgrey") 456 | @ 457 | 458 | 459 | \paragraph{Please continue:} 460 | 461 | \begin{itemize} 462 | \item Compute the predictions for the validation set with the tuned and the model with default values (\texttt{predict(rf.model.tuned, newdata = ...)}) and compute a root mean squared error. Was the tuning effort worthwhile? 463 | \item Implement the same tunging with the package \texttt{caret}. Check the option \texttt{methofd="oob"} of \texttt{trainControl}. This function is handed to \texttt{train}. 464 | \end{itemize} 465 | 466 | 467 | 468 | \clearpage 469 | \section{Gradient boosting} 470 | 471 | \subsection{Boosting with trees as baselearners} 472 | 473 | There are various R packages to fit boosting models (e.g. \texttt{mboost}, \texttt{xgboost}). We use \texttt{gbm} here. We can again tune it with \texttt{caret}. \texttt{caret} is a meta package that provides a homogenous interface to about 80 machine learning methods. 
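To see which tuning parameters \texttt{caret} exposes for a given method, and which methods are registered at all, you can query its model registry:

<<caret-model-lookup>>=
# tuning parameters caret expects for method = "gbm"
modelLookup("gbm")
# number of methods known to the installed caret version
length(getModelInfo())
@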
474 | 475 | 476 | Fit gradient boosting with trees: 477 | 478 | <>= 479 | 480 | # create a grid of the tuning parameters to be tested, 481 | # main tuning parameters are: 482 | gbm.grid <- expand.grid( 483 | # how many splits does each tree have 484 | interaction.depth = c(2,5,10,15,20), 485 | # how many trees do we add (number of iterations of boosting algorithm) 486 | n.trees = seq(2,250, by = 5), 487 | # put the shrinkage factor to 0.1 (=10% updates as used 488 | # in package mboost), the default (0.1%) is a bit too small, 489 | # makes model selection too slow. 490 | # minimum number of observations per node can be left as is 491 | shrinkage = 0.1, n.minobsinnode = 10) 492 | 493 | # make tuning reproducible (there are random samples for the cross validation) 494 | set.seed(291201945) 495 | 496 | # train the gbm model 497 | # Remove "ge_caco3" throws an error since Package gbm 2.1.5, 498 | # this bug is reported: https://github.com/gbm-developers/gbm/issues/40 499 | gbm.model <- train(x=d.ph10[, l.covar[-c(50)] ], 500 | y=d.ph10[, "ph.0.10"], 501 | method = "gbm", # choose "generalized boosted regression model" 502 | tuneGrid = gbm.grid, 503 | verbose = FALSE, 504 | trControl = trainControl( 505 | # use 10fold cross validation (CV) 506 | method = "cv", number = 10, 507 | # save fitted values (e.g. to calculate RMSE of the CV) 508 | savePredictions = "final")) 509 | 510 | # print optimal tuning parameter 511 | gbm.model$bestTune 512 | @ 513 | 514 | 515 | <>= 516 | 517 | # compute predictions for the small part of the study area 518 | # (agricultural land, the empty pixels are streets, forests etc.) 519 | data("berne.grid") 520 | 521 | berne.grid$pred <- predict.train(gbm.model, newdata = berne.grid ) 522 | 523 | # create a spatial object for a proper spatial plot 524 | coordinates(berne.grid) <- ~x+y 525 | # add the Swiss projection (see ?berne.grid) 526 | # see https://epsg.io for details on projections 527 | proj4string(berne.grid) <- CRS("+init=epsg:21781") 528 | # create a raster object from the spatial point dataframe 529 | gridded(berne.grid) <- TRUE 530 | plot(raster(berne.grid, layer = "pred")) 531 | 532 | @ 533 | 534 | 535 | Lets check the partial dependencies of the 4 most important covariates: 536 | 537 | <>= 538 | 539 | # get variable importance 540 | t.imp <- varImp(gbm.model$finalModel) 541 | 542 | # check how many covariates were never selected 543 | sum( t.imp$Overall == 0 ) 544 | 545 | # order and select 4 most important covariates 546 | t.names <- dimnames(t.imp)[[1]][ order(t.imp$Overall, decreasing = T)[1:4] ] 547 | 548 | par(mfrow = c(2,2)) 549 | for( name in t.names ){ 550 | # select index of covariate 551 | ix <- which( gbm.model$finalModel$var.names == name ) 552 | plot(gbm.model$finalModel, i.var = ix) 553 | } 554 | 555 | # -> improve the plots by using the same y-axis (e.g. ylim=c(..,..)) 556 | # for all of them, and try to add labels (xlab = , ylab = ) 557 | # or a title (main = ) 558 | 559 | @ 560 | 561 | 562 | 563 | 564 | 565 | \subsection{Boosting with linear baselearners (advanced task)} 566 | 567 | Boosting algorithm can be used with any kind of base procedures / baselearners. Many packages (e.g. \texttt{gbm}, \texttt{xgboost}) use trees. Here we try linear and splines baselearners. 
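The principle of boosting with linear baselearners (componentwise L$_2$-boosting) is simple: in every iteration each covariate is fitted separately to the current residuals, and only a small fraction $\nu$ of the best-fitting one is added, which shrinks the coefficients. A purely illustrative sketch in plain R, where \texttt{X} (a centered and scaled numeric covariate matrix) and \texttt{y} (a centered response) are placeholders; this is not the \texttt{mboost} implementation used below:

<<componentwise-boosting-sketch,eval=FALSE>>=
# illustrative componentwise L2-boosting; X and y are placeholder objects
nu <- 0.1                            # step length (shrinkage)
beta <- rep(0, ncol(X))              # start with all coefficients at zero
f.hat <- rep(0, length(y))           # current fit
for( m in 1:200 ){                   # mstop = 200 boosting iterations
  r <- y - f.hat                     # current residuals
  # fit every covariate separately to the residuals, keep the best one
  rss <- sapply(1:ncol(X), function(jj){
    sum( lm.fit(X[, jj, drop = FALSE], r)$residuals^2 ) })
  jj.best <- which.min(rss)
  b <- lm.fit(X[, jj.best, drop = FALSE], r)$coefficients
  # update only the selected coefficient by the small fraction nu
  beta[jj.best] <- beta[jj.best] + nu * b
  f.hat <- f.hat + nu * X[, jj.best] * b
}
@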
568 | 569 | 570 | For details on \texttt{mboost} see the hands-on tutorial in the vignette to the package: 571 | \url{https://cran.r-project.org/web/packages/mboost/vignettes/mboost_tutorial.pdf} 572 | 573 | Select a boosting model with linear baselearners (this results in shrunken coefficients, similar to the lasso, see Hastie et al. 2009): 574 | 575 | <>= 576 | # Fit model 577 | ph.glmboost <- glmboost(ph.0.10 ~., data = d.ph10[ c("ph.0.10", l.covar)], 578 | control = boost_control(mstop = 200), 579 | center = TRUE) 580 | 581 | # Find tuning parameter: mstop = number of boosting itertations 582 | set.seed(42) 583 | ph.glmboost.cv <- cvrisk(ph.glmboost, 584 | folds = mboost::cv(model.weights(ph.glmboost), 585 | type = "kfold")) 586 | 587 | # print optimal mstop 588 | mstop(ph.glmboost.cv) 589 | 590 | ## print model with fitted coefficents 591 | # ph.glmboost[ mstop(ph.glmboost.cv)] 592 | @ 593 | 594 | 595 | <>= 596 | plot(ph.glmboost.cv) 597 | @ 598 | 599 | 600 | \subsection{Boosting with splines baselearners (advanced task)} 601 | 602 | To model non-linear relationships we use splines baselearners. Spatial autocorrelation can be captured by adding a smooth spatial surface. This type of model needs a bit more setup. Each covariate type has its own specification. All baselearners should have the same degrees of freedom, otherwise biased model selection might be the result. 603 | 604 | <>= 605 | 606 | # quick set up formula 607 | 608 | # Response 609 | f.resp <- "ph.0.10 ~ " 610 | 611 | # Intercept, add to dataframe 612 | f.int <- "bols(int, intercept = F, df = 1)" 613 | d.ph10$int <- rep(1, nrow(d.ph10)) 614 | 615 | # Smooth spatial surface (needs > 4 degrees of freedom) 616 | f.spat <- "bspatial(x, y, df = 5, knots = 12)" 617 | 618 | # Linear baselearners for factors, maybe use df = 5 619 | f.fact <- paste( 620 | paste( "bols(", l.factors, ", intercept = F)" ), 621 | collapse = "+" 622 | ) 623 | 624 | # Splines baselearners for continuous covariates 625 | f.num <- paste( 626 | paste( "bbs(", l.numeric, ", center = T, df = 5)" ), 627 | collapse = "+" 628 | ) 629 | 630 | # create complete formula 631 | ph.form <- as.formula( paste( f.resp, 632 | paste( c(f.int, f.num, f.spat, f.fact), 633 | collapse = "+")) ) 634 | # fit the boosting model 635 | ph.gamboost <- gamboost(ph.form, data = d.ph10, 636 | control = boost_control(mstop = 200)) 637 | 638 | # Find tuning parameter 639 | ph.gamboost.cv <- cvrisk(ph.gamboost, 640 | folds = mboost::cv(model.weights(ph.gamboost), 641 | type = "kfold")) 642 | @ 643 | 644 | Analyse boosting model: 645 | 646 | <>= 647 | # print optimal mstop 648 | mstop(ph.gamboost.cv) 649 | 650 | ## print model info 651 | ph.gamboost[ mstop(ph.glmboost.cv)] 652 | ## print number of chosen baselearners 653 | length( t.sel <- summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob ) 654 | 655 | # Most often selected were: 656 | summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob[1:5] 657 | @ 658 | 659 | <>= 660 | par(mfrow=c(2,2) ) 661 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = names(t.sel[1:4]) ) 662 | @ 663 | 664 | <>= 665 | par(mfrow=c(1,1) ) 666 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = grep("bspat", names(t.sel), value = T) ) 667 | @ 668 | 669 | \clearpage 670 | 671 | 672 | \section{Model averaging} 673 | 674 | So far we calibrated several models to predict topsoil pH. With model averaging we can combine these predictions computing a simple \texttt{mean}. 
Besides simple averaging, we could use weights like $\frac{1}{MSE}$ (make sure they sum up to 1). 675 | 676 | Compute validation statistics (e.g. root mean squared error, R$^2$) on the validation set for the predictions of each model and the (weighted) averaged predictions. Is the prediction accuracy improved? 677 | 678 | You could now add models computed from random forest, support vector machines or gradient boosted trees. Does this improve model accuracy? 679 | 680 | \bigskip 681 | 682 | Note: Be aware not to select the final model based on the validation data. If you start tuning your predictions on your validation data, you loose the independent estimate of prediction accuracy... better choose your method for the final predictions based on cross validation (e.g. on the same sets). 683 | 684 | 685 | 686 | 687 | \bigskip 688 | \section*{R session information} 689 | 690 | \footnotesize 691 | This document was generated with: 692 | <>= 693 | toLatex(sessionInfo(), locale = FALSE) 694 | @ 695 | \normalsize 696 | 697 | <>= 698 | # purl("OpenGeoHub-machine-learning-training-1.Rnw") 699 | @ 700 | 701 | 702 | \end{document} 703 | -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/exercises/OpenGeoHub-machine-learning-training-1.pdf -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-2.R: -------------------------------------------------------------------------------- 1 | ## ----general-options,echo=FALSE------------------------------------------ 2 | # This code is used to generate the PDF (knitr report) 3 | library(knitr) 4 | # output code, but no warnings 5 | opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE) 6 | # auto check dependencies (of cached chunks, its an approximation only) 7 | opts_chunk$set(autodep = TRUE) 8 | # dep_auto() # print dependencies 9 | 10 | ## ----load-packages,message=FALSE----------------------------------------- 11 | library(randomForest) # for random forest models 12 | library(quantregForest) # for quantile random forest 13 | library(grpreg) # for group lasso 14 | library(geoGAM) # for the Berne test data set 15 | 16 | ## ----read-in-data-------------------------------------------------------- 17 | dim(berne) 18 | # Select soil pH in 0-10 cm as continuous response, 19 | # select calibration data and remove rows with missing pH 20 | d.ph10 <- berne[ berne$dataset == "calibration" & !is.na(berne$ph.0.10), ] 21 | d.ph10 <- d.ph10[ complete.cases(d.ph10[13:ncol(d.ph10)]), ] 22 | # covariates start at col 13 23 | l.covar <- names(d.ph10[, 13:ncol(d.ph10)]) 24 | 25 | ## ----fit-random-forest,cache=TRUE---------------------------------------- 26 | set.seed(17) 27 | rf.ph <- randomForest(x = d.ph10[, l.covar], 28 | y = d.ph10$ph.0.10) 29 | 30 | ## ----plot-covar-importance, fig.width=5, fig.height=6, fig.align='center', fig.pos="!hb", out.width='0.5\\textwidth', fig.cap="Covariate importance of 20 most important covariates for topsoil pH (before selection)."---- 31 | varImpPlot(rf.ph, n.var = 20, main = "") 32 | 33 | ## ----select-random-forest,cache=TRUE------------------------------------- 34 | # speed up the process by removing 5-10 covariates at a time 35 | s.seq <- sort( c( seq(5, 95, by = 5), 36 | seq(100, 
length(l.covar), by = 10) ), 37 | decreasing = T) 38 | 39 | # collect results in list 40 | qrf.elim <- oob.mse <- list() 41 | 42 | # save model and OOB error of current fit 43 | qrf.elim[[1]] <- rf.ph 44 | oob.mse[[1]] <- tail(qrf.elim[[1]]$mse, n=1) 45 | l.covar.sel <- l.covar 46 | 47 | # Iterate through number of retained covariates 48 | for( ii in 1:length(s.seq) ){ 49 | t.imp <- importance(qrf.elim[[ii]], type = 2) 50 | t.imp <- t.imp[ order(t.imp[,1], decreasing = T),] 51 | 52 | qrf.elim[[ii+1]] <- randomForest(x = d.ph10[, names(t.imp[1:s.seq[ii]])], 53 | y = d.ph10$ph.0.10 ) 54 | oob.mse[[ii+1]] <- tail(qrf.elim[[ii+1]]$mse,n=1) 55 | 56 | } 57 | 58 | 59 | # Prepare a data frame for plot 60 | elim.oob <- data.frame(elim.n = c(length(l.covar), s.seq[1:length(s.seq)]), 61 | elim.OOBe = unlist(oob.mse) ) 62 | 63 | ## ----plot-selection-path,fig.align='center',echo=FALSE,fig.height = 5,out.width='0.8\\textwidth',fig.cap = "Path of out-of-bag mean squared error as covariates are removed. Minimum is found at 55 covariates."---- 64 | 65 | plot(elim.oob$elim.n, elim.oob$elim.OOBe, 66 | ylab = "OOB error (MSE)", 67 | xlab = "n covariates", 68 | pch = 20) 69 | abline(v = elim.oob$elim.n[ which.min(elim.oob$elim.OOBe)], lty = "dotted") 70 | 71 | ## ----partial-residual-plots-lm-lasso,fig.width=7,fig.height=4, fig.align='center', out.width='0.9\\textwidth',fig.cap = "Partial residual plots for a climate covariate in the ordinary least squares fit and the lasso."---- 72 | # create a linear model (example, with covariates from lasso) 73 | ols <- lm( ph.0.10 ~ timeset + ge_geo500h3id + cl_mt_gh_4 + 74 | tr_se_curvplan2m_std_25c, data = d.ph10 ) 75 | par(mfrow = c(1,2)) # two plots on same figure 76 | # residual plot for covariate cl_mt_gh_4 77 | termplot(ols, partial.resid = T, terms = "cl_mt_gh_4", 78 | ylim = c(-2,2), 79 | main = "Ordinary Least Squares") 80 | abline(h=0, lty = 2) 81 | 82 | ## Create partial residual plot for lasso 83 | # there is no direct function available, but we can easily 84 | # construct the plot with 85 | # y-axis: residuals + effect of term (XBi), scaled 86 | # x-axis: values covariate 87 | # regression line: model fit of axis y~x 88 | 89 | ## First setup and fit the model 90 | l.factors <- names(d.ph10[l.covar])[ 91 | t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ] 92 | l.numeric <- names(t.f[ !t.f ]) 93 | # create a vector that labels the groups with the same number 94 | g.groups <- c( 1:length(l.numeric), 95 | unlist( 96 | sapply(1:length(l.factors), function(n){ 97 | rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1) 98 | }) 99 | ) 100 | ) 101 | # grpreg needs model matrix as input 102 | XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1] 103 | # cross validation (CV) to find lambda 104 | ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10, 105 | group = g.groups, 106 | penalty = "grLasso", 107 | returnY = T) # access CV results 108 | # choose optimal lambda: CV minimum error + 1 SE (see glmnet) 109 | l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ] 110 | idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1 111 | 112 | # get the non-zero coefficients: 113 | t.coef <- ph.cvfit$fit$beta[, idx.se ] 114 | 115 | # get the index of the covariate 116 | idx <- which( names(t.coef) == "cl_mt_gh_4" ) 117 | 118 | # residuals of lasso model chosen above 119 | residuals <- d.ph10$ph.0.10 - ph.cvfit$Y[,idx.se] 120 | # prediction for this covariate XBi 121 | Xbeta <- ph.cvfit$fit$beta[idx, idx.se] * d.ph10$cl_mt_gh_4 122 | # calculate 
partial residuals and center with mean 123 | part.resid <- scale(residuals + Xbeta, scale = F)[,1] 124 | 125 | # plot with similar settings 126 | plot(d.ph10$cl_mt_gh_4, 127 | part.resid, pch = 1, col = "grey", 128 | ylim = c(-2,2), 129 | ylab = "partial residuals [%]", xlab = "cl_mt_gh_4", 130 | main = "Lasso") 131 | abline(lm(part.resid ~ d.ph10$cl_mt_gh_4), col = "red") 132 | abline(h=0, lty = 2) 133 | 134 | ## ----partial-dep-rf,fig.width=7,fig.height=8, fig.align='center', out.width='0.9\\textwidth',fig.cap = "Partial dependence plots for the 4 most important covariates."---- 135 | # select the model with minimum OOB error 136 | rf.selected <- qrf.elim[[ which.min(elim.oob$elim.OOBe)]] 137 | 138 | t.imp <- importance(rf.selected, type = 2) 139 | t.imp <- t.imp[ order(t.imp[,1], decreasing = T),] 140 | 141 | # 4 most important covariates 142 | ( t.3 <- names( t.imp[ 1:4 ] ) ) 143 | 144 | par( mfrow = c(2,2)) 145 | 146 | # Bug in partialPlot(): function does not allow a variable for the 147 | # covariate name (e. g. x.var = name) in a loop 148 | partialPlot(x = rf.selected, 149 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 150 | x.var = "cl_mt_rr_3", ylab = "ph [-]", main = "") 151 | partialPlot(x = rf.selected, 152 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 153 | x.var = "cl_mt_rr_11", ylab = "ph [-]", main = "" ) 154 | partialPlot(x = rf.selected, 155 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 156 | x.var = "timeset", ylab = "ph [-]", main = "" ) 157 | partialPlot(x = rf.selected, 158 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 159 | x.var = "cl_mt_rr_y", ylab = "ph [-]", main = "" ) 160 | 161 | 162 | ## ----quantRF,cache=TRUE-------------------------------------------------- 163 | # Fit quantile regression forest 164 | ph.quantRF <- quantregForest(x = d.ph10[, l.covar[1:30]], 165 | y = d.ph10$ph.0.10) 166 | 167 | # select validation data 168 | d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ] 169 | d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[l.covar]), ] 170 | 171 | # compute predictions (mean) for each validation site 172 | # (use function from random forest package) 173 | ph.pred <- randomForest:::predict.randomForest(ph.quantRF, 174 | newdata = d.ph10.val) 175 | 176 | ## ----investigate-single-point,echo=FALSE,fig.pos='!h',fig.height=5,fig.width=4,fig.align='center', out.width='0.4\\textwidth',fig.cap= "Histogram of predictive distribution for one single prediction point (dotted lines: 90 \\% prediction interval, dashed line: mean prediction)."---- 177 | 178 | ## predict 0.01, 0.02,..., 0.99 quantiles for validation data 179 | ph.pred.distribution <- predict(ph.quantRF, 180 | newdata = d.ph10.val, 181 | what = seq(0.01, 0.99, by = 0.01)) 182 | 183 | # plot predictive distribution for one site 184 | sel.site <- 12 185 | hist( ph.pred.distribution[sel.site,], 186 | col = "grey", main = "", 187 | xlab = "predicted pH [-]", breaks = 12) 188 | 189 | # add 90 % prediction interval to plot 190 | abline(v = c( ph.pred.distribution[sel.site, "quantile= 0.05"], 191 | ph.pred.distribution[sel.site, "quantile= 0.95"]), 192 | lty = "dotted") 193 | abline(v = ph.pred[sel.site], lty = "dashed") 194 | 195 | ## ----create-intervall-plot,fig.height=5,fig.align='center',echo=FALSE, out.width='0.8\\textwidth',fig.cap= "Coverage of 90 \\%-prediction intervals computed by model-based boostrap."---- 196 | 197 | # get 90% quantiles for each point 198 | t.quant90 <- cbind( 199 | ph.pred.distribution[, "quantile= 
0.05"], 200 | ph.pred.distribution[, "quantile= 0.95"]) 201 | 202 | # get index for ranking in the plot 203 | t.ix <- sort( ph.pred, index.return = T )$ix 204 | 205 | # plot predictions in increasing order 206 | plot( 207 | ph.pred[t.ix], type = "n", 208 | ylim = range(c(t.quant90, ph.pred, d.ph10.val$ph.0.10)), 209 | xlab = "rank of predictions", 210 | ylab = "ph [-]" 211 | ) 212 | 213 | # add prediction intervals 214 | segments( 215 | 1:nrow( d.ph10.val ), 216 | t.lower <- (t.quant90[,1])[t.ix], 217 | 1:nrow( d.ph10.val ), 218 | t.upper <- (t.quant90[,2])[t.ix], 219 | col = "grey" 220 | ) 221 | 222 | # select colour for dots outside of intervals 223 | t.col <- sapply( 224 | 1:length( t.ix ), 225 | function( i, x, lower, upper ){ 226 | as.integer( cut( x[i], c( -Inf, lower[i]-0.000001, 227 | x[i], upper[i]+0.000001, Inf ) ) ) 228 | }, 229 | x = d.ph10.val$ph.0.10[t.ix], 230 | lower = t.lower, upper = t.upper 231 | ) 232 | 233 | # add observed values on top 234 | points( 235 | 1:nrow( d.ph10.val ), 236 | d.ph10.val$ph.0.10[t.ix], cex = 0.7, 237 | pch = c( 16, 1, 16)[t.col], 238 | col = c( "darkgreen", "black", "darkgreen" )[t.col] 239 | ) 240 | points(ph.pred[t.ix], pch = 16, cex = 0.6, col = "grey60") 241 | 242 | # Add meaningfull legend 243 | legend( "topleft", 244 | bty = "n", cex = 0.85, 245 | pch = c( NA, 16, 1, 16 ), pt.cex = 0.6, lwd = 1, 246 | lty = c( 1, NA, NA, NA ), 247 | col = c( "grey", "grey60", "black", "darkgreen" ), 248 | seg.len = 0.8, 249 | legend = c( 250 | "90 %-prediction interval", 251 | paste0("prediction (n = ", nrow(d.ph10.val), ")"), 252 | paste0("observation within interval (n = ", 253 | sum( t.col %in% c(2) ), ")" ), 254 | paste0("observation outside interval (n = ", 255 | sum( t.col %in% c(1,3)), ", ", 256 | round(sum(t.col %in% c(1,3)) / 257 | nrow(d.ph10.val)*100,1), "%)") ) 258 | ) 259 | 260 | ## ----create-coverage-probabilty-plots,fig.align='center', fig.pos = "h", fig.width=4,fig.height=4.5, out.width='0.45\\textwidth',fig.cap="Coverage probabilities of one-sided prediction intervals computed for the validation data set of topsoil pH of the Berne study area."---- 261 | 262 | # Coverage probabilities plot 263 | # create sequence of nominal probabilities 264 | ss <- seq(0,1,0.01) 265 | # compute coverage for sequence 266 | t.prop.inside <- sapply(ss, function(ii){ 267 | boot.quantile <- t( apply(ph.pred.distribution, 1, quantile, 268 | probs = c(0,ii) ) )[,2] 269 | return( sum(boot.quantile <= d.ph10.val$ph.0.10)/nrow(d.ph10.val) ) 270 | }) 271 | 272 | plot(x = ss, y = t.prop.inside[length(ss):1], 273 | type = "l", asp = 1, 274 | ylab = "coverage probabilities", 275 | xlab="nominal probabilities" ) 276 | # add 1:1-line 277 | abline(0,1, lty = 2, col = "grey60") 278 | # add lines of the two-sided 90 %-prediction interval 279 | abline(v = c(0.05, 0.95), lty = "dotted", col = "grey20") 280 | 281 | ## ----session-info,results='asis'----------------------------------------- 282 | toLatex(sessionInfo(), locale = FALSE) 283 | 284 | ## ----export-r-code,echo=FALSE-------------------------------------------- 285 | #purl("GEOSTAT-machine-learning-training-2.Rnw") 286 | 287 | -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-2.Rnw: -------------------------------------------------------------------------------- 1 | % !TeX spellcheck = en_US 2 | %#------------------------------------------------------------------------------ 3 | %# Name: OpenGeoHub-machine-learning-training-2.Rnw 4 | %# 
%#
%# Content: Exercises for OpenGeoHub Summer School, Series 2
%#
%# Author: Madlene Nussbaum, BFH-HAFL
%# Date: May 2018
%# Licence: CC-BY-NC-SA
%#------------------------------------------------------------------------------

\documentclass[11pt,a4paper,twoside]{article}

\usepackage[utf8]{inputenc}
\usepackage{blindtext} % for blind text
\usepackage{hyperref} % links in table of contents
\usepackage[english]{babel}
\usepackage{amsthm} % for the rexample environment
\usepackage{natbib}
\usepackage[iso,german]{isodate}
\usepackage{subcaption}

\newtheorem{rexample}{R Example}[section]

% Some colors for the links
\definecolor{darkblue}{rgb}{0,0,0.5}
\definecolor{darkmagenta}{rgb}{0.5,0,0.5}
\definecolor{darkgreen}{rgb}{0,0.4,0}
\definecolor{darkred}{rgb}{0.7,0,0}

\hypersetup{
draft=false,
colorlinks=true,linkcolor=darkblue,citecolor=darkred,urlcolor=darkgreen,
breaklinks=true, bookmarksnumbered=true
}

% % Headers
\usepackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[OL]{\leftmark}% odd page, left
\fancyhead[OR]{\thepage}% odd page, right
\fancyhead[EL]{\thepage}% even page, left
\fancyhead[ER]{\leftmark}% even page, right
\renewcommand{\headrulewidth}{0.4pt}
\fancyheadoffset[R]{1pt}

% captions in bold
\usepackage[font = {small}, labelfont = bf]{caption}

% format page borders
\usepackage[a4paper]{geometry}
\geometry{verbose,tmargin=2.5cm,bmargin=2.3cm,lmargin=2.5cm,rmargin=3cm,headheight=1.7cm,headsep=0.9cm,footskip=1.5cm}

% no indents
\setlength\parindent{0pt}
\setlength{\parskip}{3pt}

% Top align logos on title page
\def\imagebox#1#2{\vtop to #1{\null\hbox{#2}\vfill}}

\newcommand{\bskip}{\vspace{0.7cm}}

\begin{document}

% % Logos
\begin{figure}
\centering
\begin{subfigure}[t]{0.27\textwidth}
% \includegraphics[width=\textwidth]{BFH-Logo.pdf}
\imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/BFH-Logo.pdf}}
\end{subfigure}
\hfill
\begin{subfigure}[t]{0.3\textwidth}
% \includegraphics[width=\textwidth]{3455_original_isric.png}
\imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/logo-opengeohub.png}}
\end{subfigure}
\end{figure}

% Title
\vspace{5cm}
{\LARGE\textsf{OpenGeoHub Summer School}}

\vspace{0.7cm}
{\Large\textbf{\textsf{Mastering Machine Learning for Spatial Prediction II} }}

\vspace{0.3cm}

{\Large\textsf{Practical training} }

\vspace{0.5cm}
\textsf{Madlene Nussbaum, 4/5 September 2019}

{ \small \textsf{\copyright~ CC-BY-NC-SA } }
\bigskip


% Table of contents (with empty back page)
\setlength{\parskip}{0pt}
\tableofcontents
\thispagestyle{empty}
\setlength{\parskip}{4pt}

% \newpage
% \mbox{}
% \thispagestyle{empty}
% \newpage

% --------


<<>>=
# This code is used to generate the PDF (knitr report)
library(knitr)
# output code, but no warnings
opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE)
# auto check dependencies (of cached chunks, it's an approximation only)
opts_chunk$set(autodep = TRUE)
# dep_auto() # print dependencies
@


\section*{Preparation}

Load needed packages:

<<>>=
library(randomForest) # for random forest models
library(quantregForest) # for quantile random forest
library(grpreg) # for group lasso
library(geoGAM) # for the Berne test data set
@


Load the \textsl{Berne} data again, select the calibration set and remove observations with missing values in the covariates.

<<>>=
dim(berne)
# Select soil pH in 0-10 cm as continuous response,
# select calibration data and remove rows with missing pH
d.ph10 <- berne[ berne$dataset == "calibration" & !is.na(berne$ph.0.10), ]
d.ph10 <- d.ph10[ complete.cases(d.ph10[13:ncol(d.ph10)]), ]
# covariates start at col 13
l.covar <- names(d.ph10[, 13:ncol(d.ph10)])
@


\section{Selection of covariates}

For tree-based ensemble methods, covariate importance can be computed. Based on this measure, irrelevant covariates can be excluded, which may improve model performance.

Fit a random forest model:

<<>>=
set.seed(17)
rf.ph <- randomForest(x = d.ph10[, l.covar],
                      y = d.ph10$ph.0.10)
@

Create the importance plot:

<<>>=
varImpPlot(rf.ph, n.var = 20, main = "")
@


Then, reduce the covariates by recursive backward elimination based on covariate importance (mean decrease in node impurity, \texttt{type = 2} in \texttt{importance()}):

<<>>=
# speed up the process by removing 5-10 covariates at a time
s.seq <- sort( c( seq(5, 95, by = 5),
                  seq(100, length(l.covar), by = 10) ),
               decreasing = T)

# collect results in list
qrf.elim <- oob.mse <- list()

# save model and OOB error of current fit
qrf.elim[[1]] <- rf.ph
oob.mse[[1]] <- tail(qrf.elim[[1]]$mse, n=1)
l.covar.sel <- l.covar

# Iterate through number of retained covariates
for( ii in 1:length(s.seq) ){
  t.imp <- importance(qrf.elim[[ii]], type = 2)
  t.imp <- t.imp[ order(t.imp[,1], decreasing = T),]

  qrf.elim[[ii+1]] <- randomForest(x = d.ph10[, names(t.imp[1:s.seq[ii]])],
                                   y = d.ph10$ph.0.10 )
  oob.mse[[ii+1]] <- tail(qrf.elim[[ii+1]]$mse,n=1)

}


# Prepare a data frame for plot
elim.oob <- data.frame(elim.n = c(length(l.covar), s.seq[1:length(s.seq)]),
                       elim.OOBe = unlist(oob.mse) )
@

<<>>=

plot(elim.oob$elim.n, elim.oob$elim.OOBe,
     ylab = "OOB error (MSE)",
     xlab = "n covariates",
     pch = 20)
abline(v = elim.oob$elim.n[ which.min(elim.oob$elim.OOBe)], lty = "dotted")
@


\paragraph{Please continue:}

\begin{itemize}
\item Optimize $m_{try}$ before you start the covariate selection (function \texttt{train}, package \texttt{caret}). How much does the OOB error decrease? Are both steps (tuning, selection) worth the effort in terms of prediction performance?
\item Implement the same covariate selection for gradient boosting with trees as base learners (package \texttt{gbm} or \texttt{caret}). Do you find the same covariates in the final set? Why do you expect differences?
\end{itemize}


\section{Model interpretation}
\subsection{Partial residual plots}

To demonstrate the principle, we first create a partial residual plot for an ordinary least squares fit.
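As a reminder (this is the generic textbook definition, not tied to a particular package), the partial residuals of a covariate $x_j$ are the model residuals $e_i$ plus the estimated contribution of that covariate,
\[
r_{ij} = e_i + \hat{\beta}_j \, x_{ij} ,
\]
so plotting them against $x_j$ shows the modelled effect of $x_j$ together with the scatter that remains after accounting for all other covariates. For the least squares fit, \texttt{termplot()} produces such a plot directly.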
Then, we add the same plot for the lasso fitted to the topsoil pH data above, constructing the partial residuals by hand:

<<>>=
# create a linear model (example, with covariates from lasso)
ols <- lm( ph.0.10 ~ timeset + ge_geo500h3id + cl_mt_gh_4 +
             tr_se_curvplan2m_std_25c, data = d.ph10 )
par(mfrow = c(1,2)) # two plots on same figure
# residual plot for covariate cl_mt_gh_4
termplot(ols, partial.resid = T, terms = "cl_mt_gh_4",
         ylim = c(-2,2),
         main = "Ordinary Least Squares")
abline(h=0, lty = 2)

## Create partial residual plot for lasso
# there is no ready-made function available, but we can easily
# construct the plot with
# y-axis: residuals + effect of the term (Xbeta), centered
# x-axis: covariate values
# regression line: linear fit of y ~ x

## First set up and fit the model
l.factors <- names(d.ph10[l.covar])[
  t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ]
l.numeric <- names(t.f[ !t.f ])
# create a vector that labels the groups with the same number
g.groups <- c( 1:length(l.numeric),
               unlist(
                 sapply(1:length(l.factors), function(n){
                   rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1)
                 })
               )
)
# grpreg needs model matrix as input
XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1]
# cross validation (CV) to find lambda
ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10,
                      group = g.groups,
                      penalty = "grLasso",
                      returnY = T) # access CV results
# choose optimal lambda: CV minimum error + 1 SE (see glmnet)
l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ]
idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1

# get the non-zero coefficients:
t.coef <- ph.cvfit$fit$beta[, idx.se ]

# get the index of the covariate
idx <- which( names(t.coef) == "cl_mt_gh_4" )

# residuals of lasso model chosen above
residuals <- d.ph10$ph.0.10 - ph.cvfit$Y[,idx.se]
# prediction for this covariate (Xbeta)
Xbeta <- ph.cvfit$fit$beta[idx, idx.se] * d.ph10$cl_mt_gh_4
# calculate partial residuals and center with mean
part.resid <- scale(residuals + Xbeta, scale = F)[,1]

# plot with similar settings
plot(d.ph10$cl_mt_gh_4,
     part.resid, pch = 1, col = "grey",
     ylim = c(-2,2),
     ylab = "partial residuals [-]", xlab = "cl_mt_gh_4",
     main = "Lasso")
abline(lm(part.resid ~ d.ph10$cl_mt_gh_4), col = "red")
abline(h=0, lty = 2)
@



\clearpage
\subsection{Partial dependence plots}

The most important covariates of a random forest model can be interpreted with partial dependence plots. Keep in mind, however, that the covariates remaining after model selection might still be multi-collinear; hence, covariates might be exchangeable.

Partial dependence plots are a general method and can also be applied to other supervised machine learning methods.

<<partial-dep-rf,fig.width=7,fig.height=8, fig.align='center', out.width='0.9\\textwidth',fig.cap = "Partial dependence plots for the 4 most important covariates.">>=
# select the model with minimum OOB error
rf.selected <- qrf.elim[[ which.min(elim.oob$elim.OOBe)]]

t.imp <- importance(rf.selected, type = 2)
t.imp <- t.imp[ order(t.imp[,1], decreasing = T),]

# 4 most important covariates
( t.3 <- names( t.imp[ 1:4 ] ) )

par( mfrow = c(2,2))

# Bug in partialPlot(): function does not allow a variable for the
# covariate name (e.g. x.var = name) in a loop
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "cl_mt_rr_3", ylab = "ph [-]", main = "")
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "cl_mt_rr_11", ylab = "ph [-]", main = "" )
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "timeset", ylab = "ph [-]", main = "" )
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "cl_mt_rr_y", ylab = "ph [-]", main = "" )

@


\paragraph{Please continue:}

\begin{itemize}
\item Create partial dependence plots for the boosted trees model (\texttt{?plot.gbm}, \texttt{plot(.., i.var = ..)}). Do you find the same relationships?
\item What do you conclude from the plots? Are the plotted covariates good predictors?
\end{itemize}


\clearpage
\section{Prediction uncertainty with quantile regression forest}

When reporting predictions, it is important to provide their uncertainty along with them. For any method that does not yield uncertainty estimates from the method itself (e.g. kriging variances), a non-parametric or a model-based bootstrap approach can be used. Quantile regression forest computes quantiles of the predictions directly from the bootstrap (bootstrap aggregation, bagging) that is done within random forest.

<<quantRF,cache=TRUE>>=
# Fit quantile regression forest
ph.quantRF <- quantregForest(x = d.ph10[, l.covar[1:30]],
                             y = d.ph10$ph.0.10)

# select validation data
d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ]
d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[l.covar]), ]

# compute predictions (mean) for each validation site
# (use function from random forest package)
ph.pred <- randomForest:::predict.randomForest(ph.quantRF,
                                               newdata = d.ph10.val)
@

<<investigate-single-point,echo=FALSE,fig.pos='!h',fig.height=5,fig.width=4,fig.align='center', out.width='0.4\\textwidth',fig.cap= "Histogram of the predictive distribution for a single prediction point (dotted lines: 90 \\% prediction interval, dashed line: mean prediction).">>=

## predict 0.01, 0.02,..., 0.99 quantiles for validation data
ph.pred.distribution <- predict(ph.quantRF,
                                newdata = d.ph10.val,
                                what = seq(0.01, 0.99, by = 0.01))

# plot predictive distribution for one site
sel.site <- 12
hist( ph.pred.distribution[sel.site,],
      col = "grey", main = "",
      xlab = "predicted pH [-]", breaks = 12)

# add 90 % prediction interval to plot
abline(v = c( ph.pred.distribution[sel.site, "quantile= 0.05"],
              ph.pred.distribution[sel.site, "quantile= 0.95"]),
       lty = "dotted")
abline(v = ph.pred[sel.site], lty = "dashed")
@


Plot for the evaluation of the prediction intervals (as shown in the presentation):

<<create-intervall-plot,fig.height=5,fig.align='center',echo=FALSE, out.width='0.8\\textwidth',fig.cap= "Coverage of 90 \\%-prediction intervals computed with quantile regression forest.">>=

# get 90% quantiles for each point
t.quant90 <- cbind(
  ph.pred.distribution[, "quantile= 0.05"],
  ph.pred.distribution[, "quantile= 0.95"])

# get index for ranking in the plot
t.ix <- sort( ph.pred, index.return = T )$ix

# plot predictions in increasing order
plot(
  ph.pred[t.ix], type = "n",
  ylim = range(c(t.quant90, ph.pred, d.ph10.val$ph.0.10)),
  xlab = "rank of predictions",
  ylab = "ph [-]"
)

# add prediction intervals
segments(
  1:nrow( d.ph10.val ),
  t.lower <- (t.quant90[,1])[t.ix],
  1:nrow( d.ph10.val ),
  t.upper <- (t.quant90[,2])[t.ix],
  col = "grey"
)

# select colour for dots outside of intervals
t.col <- sapply(
  1:length( t.ix ),
  function( i, x, lower, upper ){
    as.integer( cut( x[i], c( -Inf, lower[i]-0.000001,
                              x[i], upper[i]+0.000001, Inf ) ) )
  },
  x = d.ph10.val$ph.0.10[t.ix],
  lower = t.lower, upper = t.upper
)

# add observed values on top
points(
  1:nrow( d.ph10.val ),
  d.ph10.val$ph.0.10[t.ix], cex = 0.7,
  pch = c( 16, 1, 16)[t.col],
  col = c( "darkgreen", "black", "darkgreen" )[t.col]
)
points(ph.pred[t.ix], pch = 16, cex = 0.6, col = "grey60")

# Add meaningful legend
legend( "topleft",
        bty = "n", cex = 0.85,
        pch = c( NA, 16, 1, 16 ), pt.cex = 0.6, lwd = 1,
        lty = c( 1, NA, NA, NA ),
        col = c( "grey", "grey60", "black", "darkgreen" ),
        seg.len = 0.8,
        legend = c(
          "90 %-prediction interval",
          paste0("prediction (n = ", nrow(d.ph10.val), ")"),
          paste0("observation within interval (n = ",
                 sum( t.col %in% c(2) ), ")" ),
          paste0("observation outside interval (n = ",
                 sum( t.col %in% c(1,3)), ", ",
                 round(sum(t.col %in% c(1,3)) /
                       nrow(d.ph10.val)*100,1), "%)") )
)
@




<<create-coverage-probabilty-plots,fig.align='center', fig.pos = "h", fig.width=4,fig.height=4.5, out.width='0.45\\textwidth',fig.cap="Coverage probabilities of one-sided prediction intervals computed for the validation data set of topsoil pH of the Berne study area.">>=

# Coverage probabilities plot
# create sequence of nominal probabilities
ss <- seq(0,1,0.01)
# compute coverage for sequence
t.prop.inside <- sapply(ss, function(ii){
  boot.quantile <- t( apply(ph.pred.distribution, 1, quantile,
                            probs = c(0,ii) ) )[,2]
  return( sum(boot.quantile <= d.ph10.val$ph.0.10)/nrow(d.ph10.val) )
})

plot(x = ss, y = t.prop.inside[length(ss):1],
     type = "l", asp = 1,
     ylab = "coverage probabilities",
     xlab="nominal probabilities" )
# add 1:1-line
abline(0,1, lty = 2, col = "grey60")
# add lines of the two-sided 90 %-prediction interval
abline(v = c(0.05, 0.95), lty = "dotted", col = "grey20")
@


\paragraph{Please continue:}

\begin{itemize}
\item Are you satisfied with the prediction intervals?
\item Create maps of the prediction intervals for the \texttt{berne.grid} data. Is there much spatial structure in the uncertainty?
\end{itemize}



\bigskip
\section*{R session information}

\footnotesize
This document was generated with:
<<session-info,results='asis'>>=
toLatex(sessionInfo(), locale = FALSE)
@
\normalsize

<<export-r-code,echo=FALSE>>=
purl("OpenGeoHub-machine-learning-training-2.Rnw")
@


\end{document}

--------------------------------------------------------------------------------
/exercises/OpenGeoHub-machine-learning-training-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/exercises/OpenGeoHub-machine-learning-training-2.pdf
--------------------------------------------------------------------------------
/presentation/OpenGeoHub-machine-learning-lecture-1-handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/presentation/OpenGeoHub-machine-learning-lecture-1-handout.pdf
--------------------------------------------------------------------------------
/presentation/OpenGeoHub-machine-learning-lecture-2-handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/presentation/OpenGeoHub-machine-learning-lecture-2-handout.pdf
--------------------------------------------------------------------------------
/presentation/OpenGeoHub-machine-learning-lecture-2-teaser_overview-soil-mapping-study.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/presentation/OpenGeoHub-machine-learning-lecture-2-teaser_overview-soil-mapping-study.pdf
--------------------------------------------------------------------------------