├── exercises ├── OpenGeoHub-machine-learning-training-1.R ├── OpenGeoHub-machine-learning-training-1.Rnw ├── OpenGeoHub-machine-learning-training-1.pdf ├── OpenGeoHub-machine-learning-training-2.R ├── OpenGeoHub-machine-learning-training-2.Rnw └── OpenGeoHub-machine-learning-training-2.pdf └── presentation ├── OpenGeoHub-machine-learning-lecture-1-handout.pdf ├── OpenGeoHub-machine-learning-lecture-2-handout.pdf └── OpenGeoHub-machine-learning-lecture-2-teaser_overview-soil-mapping-study.pdf /exercises/OpenGeoHub-machine-learning-training-1.R: -------------------------------------------------------------------------------- 1 | ## ----general-options,echo=FALSE----------------------------------------- 2 | 3 | library(knitr) 4 | # output code, but no warnings 5 | opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE,cache=TRUE) 6 | # auto check dependencies (of cached chunks, its an approximation only) 7 | opts_chunk$set(autodep = TRUE) 8 | # dep_auto() # print dependencies 9 | 10 | 11 | 12 | ## ----load-packages,message=FALSE---------------------------------------- 13 | # install.packages(c("grpgreg", "glmnet", "kernlab", "caret", "randomForest", "mboost", 14 | # "gbm", "geoGAM", "raster")) 15 | library(grpreg) # for grouped lasso 16 | library(glmnet) # for general lasso 17 | library(kernlab) # for support vector machines 18 | library(caret) # for model tuning 19 | library(randomForest) # to fit random forest 20 | library(mboost) # for the boosting models with linear and spline terms 21 | library(gbm) # for the boosting model with trees 22 | library(geoGAM) # for the berne dataset 23 | library(raster) # for plotting as a raster 24 | library(parallel) # for parallel computing 25 | 26 | 27 | ## ----read-in-data------------------------------------------------------- 28 | data(berne) 29 | dim(berne) 30 | # Continuous response 31 | d.ph10 <- berne[berne$dataset == "calibration" & !is.na(berne$ph.0.10), ] 32 | d.ph10 <- d.ph10[complete.cases(d.ph10[13:ncol(d.ph10)]), ] 33 | # select validation data for subsequent validation 34 | d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ] 35 | d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[13:ncol(d.ph10)]), ] 36 | # Binary response 37 | d.wlog100 <- berne[berne$dataset=="calibration"&!is.na(berne$waterlog.100), ] 38 | d.wlog100 <- d.wlog100[complete.cases(d.wlog100[13:ncol(d.wlog100)]), ] 39 | # Ordered/multinomial tesponse 40 | d.drain <- berne[berne$dataset == "calibration" & !is.na(berne$dclass), ] 41 | d.drain <- d.drain[complete.cases(d.drain[13:ncol(d.drain)]), ] 42 | # covariates start at col 13 43 | l.covar <- names(d.ph10[, 13:ncol(d.ph10)]) 44 | 45 | 46 | ## ----apply-example------------------------------------------------------ 47 | # loop 48 | # first create a vector to save the results 49 | t.result <- c() 50 | for( ii in 1:10 ){ t.result <- c(t.result, ii^2) } 51 | # the same as apply 52 | t.result <- sapply(1:10, function(ii){ ii^2 }) 53 | # of course, this example is even shorter using: 54 | t.result <- (1:10)^2 55 | 56 | 57 | ## ----lasso-continuous-response,cache=TRUE------------------------------- 58 | 59 | # define groups: dummy coding of a factor is treated as group 60 | # find factors 61 | l.factors <- names(d.ph10[l.covar])[ 62 | t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ] 63 | l.numeric <- names(t.f[ !t.f ]) 64 | 65 | # create a vector that labels the groups with the same number 66 | g.groups <- c( 1:length(l.numeric), 67 | unlist( 68 | sapply(1:length(l.factors), function(n){ 69 | 
rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1) 70 | }) 71 | ) 72 | ) 73 | # grpreg needs model matrix as input 74 | XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1] 75 | 76 | # cross validation (CV) to find lambda 77 | ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10, 78 | group = g.groups, 79 | penalty = "grLasso", 80 | returnY = T) # access CV results 81 | 82 | 83 | ## ----lasso-predictions-------------------------------------------------- 84 | # choose optimal lambda: CV minimum error + 1 SE (see glmnet) 85 | l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ] 86 | idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1 87 | 88 | # create model matrix for validation set 89 | newXX <- model.matrix( ~., d.ph10.val[, c(l.factors, l.numeric), F])[,-1] 90 | t.pred.val <- predict(ph.cvfit, X = newXX, 91 | type = "response", 92 | lambda = ph.cvfit$lambda[idx.se]) 93 | # get CV predictions, e.g. to compute R2 94 | ph.lasso.cv.pred <- ph.cvfit$Y[,idx.se] 95 | 96 | 97 | ## ----lasso-get-model---------------------------------------------------- 98 | # get the non-zero coefficients: 99 | t.coef <- ph.cvfit$fit$beta[, idx.se ] 100 | t.coef[ t.coef > 0 ] 101 | 102 | 103 | ## ----lasso-plot-cv,echo=FALSE,fig.width=7,fig.height=4.5, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Cross validation error plotted against the tuning parameter lambda. The dashed line indicates lambda at minimal error, the dotted darkgrey line is the optimal lambda with minimal error + 1 SE."---- 104 | 105 | plot(ph.cvfit) 106 | abline( h = l.se, col = "grey", lty = "dotted") 107 | abline( v = log( ph.cvfit$lambda[ idx.se ]), col = "grey30", lty = "dotted") 108 | 109 | 110 | ## ----lasso-multinomial-response,cache = TRUE---------------------------- 111 | 112 | # create model matrix for drainage classes 113 | # use a subset of covariates only, because model optimization for 114 | # multinomial takes long otherwise 115 | 116 | set.seed(42) # makes sample() reproducible 117 | XX <- model.matrix(~.,d.drain[, l.covar[sample(1:length(l.covar), 30)]])[,-1] 118 | 119 | drain.cvfit <- cv.glmnet( XX, d.drain$dclass, nfold = 10, 120 | keep = T, # access CV results 121 | family = "multinomial", 122 | type.multinomial = "grouped") 123 | 124 | 125 | ## ----lasso-multinomial-response-coeffs,cache=TRUE----------------------- 126 | 127 | drain.fit <- glmnet( XX, d.drain$dclass, 128 | family = "multinomial", 129 | type.multinomial = "grouped", 130 | lambda = drain.cvfit$lambda.min) 131 | # The coeffs are here: 132 | # drain.fit$beta$well 133 | # drain.fit$beta$moderate 134 | # drain.fit$beta$poor 135 | 136 | 137 | ## ----svm,cache=TRUE----------------------------------------------------- 138 | 139 | # We have to set up the design matrix ourselfs 140 | # (without intercept, hence remove first column) 141 | XX <- model.matrix( ~., d.ph10[, c(l.covar), F])[,-1] 142 | 143 | # set seed for random numbers to split cross-valiation sets 144 | set.seed(31) 145 | # Setup for 10fold cross-validation 146 | ctrl <- trainControl(method="cv", 147 | number=10, 148 | savePredictions = "final") 149 | 150 | # 1. pass of training - find region of C and lambda 151 | svm.tune1 <- train(x = XX, 152 | y = d.ph10[, "ph.0.10"], 153 | method = "svmRadial", # radial kernel function 154 | tuneLength = 9, # check 9 values of the cost function 155 | preProc = c("center","scale"), # center and scale data 156 | trControl=ctrl) 157 | 158 | # 2. 
pass of training - find best value for C and lambda 159 | # setup a tuning grid with values around the result of the first pass 160 | sig <- svm.tune1$bestTune$sigma 161 | t.sigma <- sort( unique( round(abs( c(sig, sig + seq(0, sig*2, by = sig/1), 162 | sig - seq(0, sig*2, by = sig/1)) ), 6))) 163 | tune.grid <- expand.grid( 164 | sigma = t.sigma[t.sigma>0], # sigma must be positive 165 | C = sort( unique( abs( c(svm.tune1$bestTune$C, 166 | svm.tune1$bestTune$C - seq(0, 0.3, by = 0.1), 167 | svm.tune1$bestTune$C + seq(0, 0.3, by = 0.1) )) )) 168 | ) 169 | #Train and Tune the SVM 170 | svm.model <- train(x = XX, 171 | y = d.ph10[, "ph.0.10"], 172 | method = "svmRadial", 173 | preProc = c("center","scale"), 174 | tuneGrid = tune.grid, 175 | trControl = ctrl) 176 | 177 | # -> if this takes too long: take a short cut with 178 | # svm.model <- svm.tune1 179 | 180 | 181 | 182 | ## ----svm-validation-plots,fig.width=10,fig.height=5, fig.align='center', out.width='0.85\\textwidth',fig.cap = "Predictions from cross-validation (left) and the validation dataset (right) plotted against the observed values (dashed: 1:1-line, green: lowess scatterplott smoother)."---- 183 | # create validation plots with lowess scatterplot smoothers 184 | # for cross-validation 185 | par(mfrow = c(1,2)) 186 | plot(svm.model$pred$pred, svm.model$pred$obs, 187 | xlab = "cross-validation predictions", 188 | ylab = "observed", 189 | asp = 1) 190 | abline(0,1, lty = "dashed", col = "grey") 191 | lines(lowess(svm.model$pred$pred, svm.model$pred$obs), col = "darkgreen", lwd = 2) 192 | 193 | # for independent validation set 194 | # calculate predictions for the validation set 195 | newXX <- model.matrix( ~., d.ph10.val[, l.covar, F])[,-1] 196 | t.pred.val <- predict.train(svm.model, newdata = newXX) 197 | plot(t.pred.val, d.ph10.val[, "ph.0.10"], 198 | xlab = "predictions on validation set", 199 | ylab = "observed", 200 | asp = 1) 201 | abline(0,1, lty = "dashed", col = "grey") 202 | lines(lowess(t.pred.val, d.ph10.val[, "ph.0.10"]), col = "darkgreen", lwd = 2) 203 | 204 | 205 | ## ----random-forest,cache=TRUE------------------------------------------- 206 | 207 | # Fit a random forest with default parameters 208 | # (often results are already quite good) 209 | set.seed(1) 210 | rf.model.basic <- randomForest(x = d.ph10[, l.covar ], 211 | y = d.ph10[, "ph.0.10"]) 212 | 213 | # tune main tuning parameter "mtry" 214 | # (the number of covariates that are randomly selected to try at each split) 215 | 216 | # define function to use below 217 | f.tune.randomforest <- function(test.mtry, # mtry to test 218 | d.cal, # calibration data 219 | l.covariates ){ # list of covariates 220 | # set seed 221 | set.seed(1) 222 | # fit random forest with mtry = test.mtry 223 | rf.tune <- randomForest(x = d.cal[, l.covariates ], 224 | y = d.cal[, "ph.0.10"], 225 | mtry = test.mtry) 226 | # return the mean squared error (mse) of this model fit 227 | return( tail(rf.tune$mse, n=1) ) 228 | } 229 | 230 | # vector of mtry to test 231 | seq.mtry <- c(1:(length(l.covar) - 1)) 232 | # Only take every fifth for speed reasons 233 | seq.mtry <- seq.mtry[ seq.mtry %% 5 == 0 ] 234 | 235 | # Apply function to sequence. 
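# Note (added): mclapply() from the "parallel" package is a parallel version of
# lapply(). It relies on forking, which is not available on Windows, so keep
# mc.cores = 1 there; on Linux/macOS you can set it to the number of available
# cores, e.g. parallel::detectCores().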
236 | t.OOBe <- mclapply(seq.mtry, # give sequence 237 | FUN = f.tune.randomforest, # give function name 238 | mc.cores = 1, ## number of CPUs 239 | mc.set.seed = FALSE, # do not use new seed each time 240 | # now here giv the arguments to the function: 241 | d.cal = d.ph10, 242 | l.covar = l.covar ) 243 | 244 | # Hint: Who is not comfortable with "mclapply" 245 | # the same could be achieved with 246 | # for(test.mtry in 1:m.end){ 247 | # .. content of function + vector to collect result... } 248 | 249 | # create a dataframe of the results 250 | mtry.oob <- data.frame(mtry.n = seq.mtry, mtry.OOBe = unlist(t.OOBe)) 251 | 252 | # get the mtry with the minimum MSE 253 | s.mtry <- mtry.oob$mtry.n[ which.min(mtry.oob$mtry.OOBe) ] 254 | 255 | # compute random forest with optimal mtry 256 | set.seed(1) 257 | rf.model.tuned <- randomForest(x = d.ph10[, l.covar ], 258 | y = d.ph10[, "ph.0.10"], 259 | mtry = s.mtry) 260 | 261 | 262 | ## ----random-forest-plot-mtry,fig.width=6,fig.height=4.7, fig.align='center',fig.pos='!h',out.width='0.6\\textwidth',fig.cap = "Tuning parameter mtry plotted against the out-of-bag mean squared error (grey line: lowess smoothing line, dashed line: mtry at minimum MSE)."---- 263 | plot( mtry.oob$mtry.n, mtry.oob$mtry.OOBe, pch = 4, 264 | ylab = "out-of-bag MSE error", xlab = "mtry") 265 | abline(v = s.mtry, lty = "dashed", col = "darkgrey") 266 | lines( lowess( mtry.oob$mtry.n, mtry.oob$mtry.OOBe ), lwd = 1.5, col = "darkgrey") 267 | 268 | 269 | ## ----boosted-trees-tuning,cache=TRUE------------------------------------ 270 | 271 | # create a grid of the tuning parameters to be tested, 272 | # main tuning parameters are: 273 | gbm.grid <- expand.grid( 274 | # how many splits does each tree have 275 | interaction.depth = c(2,5,10,15,20), 276 | # how many trees do we add (number of iterations of boosting algorithm) 277 | n.trees = seq(2,250, by = 5), 278 | # put the shrinkage factor to 0.1 (=10% updates as used 279 | # in package mboost), the default (0.1%) is a bit too small, 280 | # makes model selection too slow. 281 | # minimum number of observations per node can be left as is 282 | shrinkage = 0.1, n.minobsinnode = 10) 283 | 284 | # make tuning reproducible (there are random samples for the cross validation) 285 | set.seed(291201945) 286 | 287 | # train the gbm model 288 | # Remove "ge_caco3" throws an error since Package gbm 2.1.5, 289 | # this bug is reported: https://github.com/gbm-developers/gbm/issues/40 290 | gbm.model <- train(x=d.ph10[, l.covar[-c(50)] ], 291 | y=d.ph10[, "ph.0.10"], 292 | method = "gbm", # choose "generalized boosted regression model" 293 | tuneGrid = gbm.grid, 294 | verbose = FALSE, 295 | trControl = trainControl( 296 | # use 10fold cross validation (CV) 297 | method = "cv", number = 10, 298 | # save fitted values (e.g. to calculate RMSE of the CV) 299 | savePredictions = "final")) 300 | 301 | # print optimal tuning parameter 302 | gbm.model$bestTune 303 | 304 | 305 | ## ----boosted-trees-map,fig.width=5,fig.height=5, fig.align='center', out.width='0.7\\textwidth',fig.cap = "Predictions computed with an optimized boosted trees model of topsoil pH (0--10 cm) for a very small part of the Berne study region (white areas are streets, developped areas or forests, CRAN does not accept larger datasets)."---- 306 | 307 | # compute predictions for the small part of the study area 308 | # (agricultural land, the empty pixels are streets, forests etc.) 
309 | data("berne.grid") 310 | 311 | berne.grid$pred <- predict.train(gbm.model, newdata = berne.grid ) 312 | 313 | # create a spatial object for a proper spatial plot 314 | coordinates(berne.grid) <- ~x+y 315 | # add the Swiss projection (see ?berne.grid) 316 | # see https://epsg.io for details on projections 317 | proj4string(berne.grid) <- CRS("+init=epsg:21781") 318 | # create a raster object from the spatial point dataframe 319 | gridded(berne.grid) <- TRUE 320 | plot(raster(berne.grid, layer = "pred")) 321 | 322 | 323 | 324 | ## ----boosted-trees-partial-dependencies,fig.pos="h",fig.width=7,fig.height=7, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Partial dependence plots of boosted trees model for the four most important covariates."---- 325 | 326 | # get variable importance 327 | t.imp <- varImp(gbm.model$finalModel) 328 | 329 | # check how many covariates were never selected 330 | sum( t.imp$Overall == 0 ) 331 | 332 | # order and select 4 most important covariates 333 | t.names <- dimnames(t.imp)[[1]][ order(t.imp$Overall, decreasing = T)[1:4] ] 334 | 335 | par(mfrow = c(2,2)) 336 | for( name in t.names ){ 337 | # select index of covariate 338 | ix <- which( gbm.model$finalModel$var.names == name ) 339 | plot(gbm.model$finalModel, i.var = ix) 340 | } 341 | 342 | # -> improve the plots by using the same y-axis (e.g. ylim=c(..,..)) 343 | # for all of them, and try to add labels (xlab = , ylab = ) 344 | # or a title (main = ) 345 | 346 | 347 | 348 | ## ----glmboost,cache=TRUE------------------------------------------------ 349 | # Fit model 350 | ph.glmboost <- glmboost(ph.0.10 ~., data = d.ph10[ c("ph.0.10", l.covar)], 351 | control = boost_control(mstop = 200), 352 | center = TRUE) 353 | 354 | # Find tuning parameter: mstop = number of boosting itertations 355 | set.seed(42) 356 | ph.glmboost.cv <- cvrisk(ph.glmboost, 357 | folds = mboost::cv(model.weights(ph.glmboost), 358 | type = "kfold")) 359 | 360 | # print optimal mstop 361 | mstop(ph.glmboost.cv) 362 | 363 | ## print model with fitted coefficents 364 | # ph.glmboost[ mstop(ph.glmboost.cv)] 365 | 366 | 367 | ## ----glmboost-plot,fig.width=7,fig.height=5, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Path of cross validation error along the boosting iterations.", echo = FALSE---- 368 | plot(ph.glmboost.cv) 369 | 370 | 371 | ## ----gamboost,cache=TRUE,message=FALSE---------------------------------- 372 | 373 | # quick set up formula 374 | 375 | # Response 376 | f.resp <- "ph.0.10 ~ " 377 | 378 | # Intercept, add to dataframe 379 | f.int <- "bols(int, intercept = F, df = 1)" 380 | d.ph10$int <- rep(1, nrow(d.ph10)) 381 | 382 | # Smooth spatial surface (needs > 4 degrees of freedom) 383 | f.spat <- "bspatial(x, y, df = 5, knots = 12)" 384 | 385 | # Linear baselearners for factors, maybe use df = 5 386 | f.fact <- paste( 387 | paste( "bols(", l.factors, ", intercept = F)" ), 388 | collapse = "+" 389 | ) 390 | 391 | # Splines baselearners for continuous covariates 392 | f.num <- paste( 393 | paste( "bbs(", l.numeric, ", center = T, df = 5)" ), 394 | collapse = "+" 395 | ) 396 | 397 | # create complete formula 398 | ph.form <- as.formula( paste( f.resp, 399 | paste( c(f.int, f.num, f.spat, f.fact), 400 | collapse = "+")) ) 401 | # fit the boosting model 402 | ph.gamboost <- gamboost(ph.form, data = d.ph10, 403 | control = boost_control(mstop = 200)) 404 | 405 | # Find tuning parameter 406 | ph.gamboost.cv <- cvrisk(ph.gamboost, 407 | folds = mboost::cv(model.weights(ph.gamboost), 408 | type = "kfold")) 409 
| 410 | 411 | ## ----gamboost-results--------------------------------------------------- 412 | # print optimal mstop 413 | mstop(ph.gamboost.cv) 414 | 415 | ## print model info 416 | ph.gamboost[ mstop(ph.glmboost.cv)] 417 | ## print number of chosen baselearners 418 | length( t.sel <- summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob ) 419 | 420 | # Most often selected were: 421 | summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob[1:5] 422 | 423 | 424 | ## ----gamboost-partial-plots,echo=FALSE,fig.width=7,fig.height=6, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Residual plots of the 4 covariates with highest selection frequency."---- 425 | par(mfrow=c(2,2) ) 426 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = names(t.sel[1:4]) ) 427 | 428 | 429 | ## ----gamboost-partial-plots-spatial,echo=FALSE,fig.width=7,fig.height=5, fig.align='center', out.width='0.8\\textwidth',fig.cap = "Modelled smooth spatial surface based on the coordinates."---- 430 | par(mfrow=c(1,1) ) 431 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = grep("bspat", names(t.sel), value = T) ) 432 | 433 | 434 | ## ----session-info,results='asis'---------------------------------------- 435 | toLatex(sessionInfo(), locale = FALSE) 436 | 437 | 438 | ## ----export-r-code,echo=FALSE,result="hide"----------------------------- 439 | # purl("OpenGeoHub-machine-learning-training-1.Rnw") 440 | 441 | -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-1.Rnw: -------------------------------------------------------------------------------- 1 | % !TeX spellcheck = en_US 2 | %#------------------------------------------------------------------------------ 3 | %# Name: OpenGeoHub-machine-learning-training.Rnw 4 | %# (knitr document: R + Latex) 5 | %# 6 | %# Inhalt: Exercises for OpenGeoHub Summer School, 1. 
Series 7 | %# 8 | %# Autorin: Madlene Nussbaum, BFH-HAFL 9 | %# Datum: August 2018 10 | %# Licence: GNU General Public License 11 | %#------------------------------------------------------------------------------ 12 | 13 | \documentclass[11pt,a4paper,twoside]{article} 14 | 15 | \usepackage[utf8]{inputenc} 16 | \usepackage{blindtext} % for blind text 17 | \usepackage{hyperref} % links in table of contents 18 | \usepackage[english]{babel} 19 | \usepackage{amsthm} % for renvironment 20 | \usepackage{natbib} 21 | \usepackage[iso,german]{isodate} 22 | \usepackage{subcaption} 23 | 24 | \newtheorem{rexample}{R Example}[section] 25 | 26 | % Some colors for the links 27 | \definecolor{darkblue}{rgb}{0,0,0.5} 28 | \definecolor{darkmagenta}{rgb}{0.5,0,0.5} 29 | \definecolor{darkgreen}{rgb}{0,0.4,0} 30 | \definecolor{darkred}{rgb}{0.7,0,0} 31 | 32 | \hypersetup{ 33 | draft=false, 34 | colorlinks=true,linkcolor=darkblue,citecolor=darkred,urlcolor=darkgreen, 35 | breaklinks=true, bookmarksnumbered=true 36 | } 37 | 38 | % % Headers 39 | \usepackage{fancyhdr} 40 | \pagestyle{fancy} 41 | \fancyhf{} 42 | \fancyhead[OL]{\leftmark}% odd page, left 43 | \fancyhead[OR]{\thepage}% odd page, right 44 | \fancyhead[EL]{\thepage}% even page, left 45 | \fancyhead[ER]{\leftmark}% even page, right 46 | \renewcommand{\headrulewidth}{0.4pt} 47 | \fancyheadoffset[R]{1pt} 48 | 49 | % captions in bold 50 | \usepackage[font = {small}, labelfont = bf]{caption} 51 | 52 | % format page borders 53 | \usepackage[a4paper]{geometry} 54 | \geometry{verbose,tmargin=2.5cm,bmargin=2.3cm,lmargin=2.5cm,rmargin=3cm,headheight=1.7cm,headsep=0.9cm,footskip=1.5cm} 55 | 56 | % no indents 57 | \setlength\parindent{0pt} 58 | \setlength{\parskip}{3pt} 59 | 60 | 61 | % Top aling logos on title page 62 | \def\imagebox#1#2{\vtop to #1{\null\hbox{#2}\vfill}} 63 | 64 | \newcommand{\bskip}{\vspace{0.7cm}} 65 | 66 | \begin{document} 67 | 68 | % % Logos 69 | \begin{figure} 70 | \centering 71 | \begin{subfigure}[t]{0.27\textwidth} 72 | % \includegraphics[width=\textwidth]{BFH-Logo.pdf} 73 | \imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/BFH-Logo.pdf}} 74 | \end{subfigure} 75 | \hfill 76 | \begin{subfigure}[t]{0.3\textwidth} 77 | % \includegraphics[width=\textwidth]{3455_original_isric.png} 78 | \imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/logo-opengeohub.png}} 79 | \end{subfigure} 80 | \end{figure} 81 | 82 | % Title 83 | \vspace{5cm} 84 | {\LARGE\textsf{OpenGeoHub Summer School}} 85 | 86 | \vspace{0.7cm} 87 | {\Large\textbf{\textsf{Mastering Machine Learning for Spatial Prediction I} }} 88 | 89 | \vspace{0.3cm} 90 | 91 | {\Large\textsf{Practical training} } 92 | 93 | \vspace{0.5cm} 94 | \textsf{Madlene Nussbaum, 4/5 September 2019} 95 | 96 | { \small \textsf{\copyright~ CC-BY-NC-SA } } 97 | \bigskip 98 | 99 | 100 | % Table of contents (with empty back page() 101 | \setlength{\parskip}{0pt} 102 | \tableofcontents 103 | \thispagestyle{empty} 104 | \setlength{\parskip}{4pt} 105 | 106 | % \newpage 107 | % \mbox{} 108 | % \thispagestyle{empty} 109 | % \newpage 110 | 111 | % -------- 112 | 113 | 114 | <>= 115 | 116 | library(knitr) 117 | # output code, but no warnings 118 | opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE,cache=TRUE) 119 | # auto check dependencies (of cached chunks, its an approximation only) 120 | opts_chunk$set(autodep = TRUE) 121 | # dep_auto() # print dependencies 122 | 123 | @ 124 | 125 | 126 | 127 | \section*{Preparation} 128 | 129 | Load needed packages: 130 | 131 | <>= 132 | # 
install.packages(c("grpgreg", "glmnet", "kernlab", "caret", "randomForest", "mboost", 133 | # "gbm", "geoGAM", "raster")) 134 | library(grpreg) # for grouped lasso 135 | library(glmnet) # for general lasso 136 | library(kernlab) # for support vector machines 137 | library(caret) # for model tuning 138 | library(randomForest) # to fit random forest 139 | library(mboost) # for the boosting models with linear and spline terms 140 | library(gbm) # for the boosting model with trees 141 | library(geoGAM) # for the berne dataset 142 | library(raster) # for plotting as a raster 143 | library(parallel) # for parallel computing 144 | @ 145 | 146 | As an example you can work with the Berne soil mapping study area: dataset \texttt{berne} in R package \texttt{geoGAM}, contains continuous, binary and a multinomial/ordered response and a spatial data \texttt{berne.grid} for prediction. 147 | 148 | Feel free to work with your own data! 149 | 150 | Hint: The processing of this code is quite time consuming on a laptop. Normaly, one uses high performance computing facilities for machine learning. 151 | 152 | Load the data, select the calibration set and remove missing values in covariates: 153 | 154 | <>= 155 | data(berne) 156 | dim(berne) 157 | # Continuous response 158 | d.ph10 <- berne[berne$dataset == "calibration" & !is.na(berne$ph.0.10), ] 159 | d.ph10 <- d.ph10[complete.cases(d.ph10[13:ncol(d.ph10)]), ] 160 | # select validation data for subsequent validation 161 | d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ] 162 | d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[13:ncol(d.ph10)]), ] 163 | # Binary response 164 | d.wlog100 <- berne[berne$dataset=="calibration"&!is.na(berne$waterlog.100), ] 165 | d.wlog100 <- d.wlog100[complete.cases(d.wlog100[13:ncol(d.wlog100)]), ] 166 | # Ordered/multinomial tesponse 167 | d.drain <- berne[berne$dataset == "calibration" & !is.na(berne$dclass), ] 168 | d.drain <- d.drain[complete.cases(d.drain[13:ncol(d.drain)]), ] 169 | # covariates start at col 13 170 | l.covar <- names(d.ph10[, 13:ncol(d.ph10)]) 171 | @ 172 | 173 | 174 | 175 | \section{Lasso -- linear shrinkage method} 176 | \markboth{Lasso -- linear shrinkage method}{Lasso -- linear shrinkage method} 177 | 178 | \paragraph{Lasso for continuous response} \mbox{} \nolinebreak 179 | 180 | The \texttt{berne} dataset contains categorical covariates (factors, e.g. geological map with different substrate classes). The group lasso (R package \texttt{grpreg}) ensures that all dummy covariates of one factor are excluded (coefficients set to 0) together or remain in the model as a group. 181 | 182 | The main tuning parameter $\lambda$ is selected by cross validation. $\lambda$ determines the degree of shrinkage that is applied to the coefficients. 183 | 184 | 185 | \bigskip 186 | 187 | HINT for R newbies: the \texttt{apply}-functions in R are replacements for loops (\texttt{sapply}: loop over a sequence of numbers, \texttt{lapply}: loop over a list). Compared to \texttt{for}, an \texttt{apply} is much faster and general coding style, though a bit more tricky to program. 
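A minimal illustration of the difference: \texttt{lapply} always returns a list, while \texttt{sapply} simplifies the result to a vector where possible.

<<apply-lapply-example>>=
# lapply returns a list with one element per input value ...
t.result.list <- lapply(1:10, function(ii){ ii^2 })
# ... sapply simplifies the same result to a numeric vector
t.result.vec <- sapply(1:10, function(ii){ ii^2 })
identical(unlist(t.result.list), t.result.vec)
@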
188 | 189 | Example, how to replace a \texttt{for} by a \texttt{sapply}: 190 | <>= 191 | # loop 192 | # first create a vector to save the results 193 | t.result <- c() 194 | for( ii in 1:10 ){ t.result <- c(t.result, ii^2) } 195 | # the same as apply 196 | t.result <- sapply(1:10, function(ii){ ii^2 }) 197 | # of course, this example is even shorter using: 198 | t.result <- (1:10)^2 199 | @ 200 | 201 | 202 | Now we create the setup using \texttt{apply} and fit the grouped lasso: 203 | <>= 204 | 205 | # define groups: dummy coding of a factor is treated as group 206 | # find factors 207 | l.factors <- names(d.ph10[l.covar])[ 208 | t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ] 209 | l.numeric <- names(t.f[ !t.f ]) 210 | 211 | # create a vector that labels the groups with the same number 212 | g.groups <- c( 1:length(l.numeric), 213 | unlist( 214 | sapply(1:length(l.factors), function(n){ 215 | rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1) 216 | }) 217 | ) 218 | ) 219 | # grpreg needs model matrix as input 220 | XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1] 221 | 222 | # cross validation (CV) to find lambda 223 | ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10, 224 | group = g.groups, 225 | penalty = "grLasso", 226 | returnY = T) # access CV results 227 | @ 228 | 229 | 230 | Compute predictions for the validation set with optimal number of groups chosen by lasso: 231 | <>= 232 | # choose optimal lambda: CV minimum error + 1 SE (see glmnet) 233 | l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ] 234 | idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1 235 | 236 | # create model matrix for validation set 237 | newXX <- model.matrix( ~., d.ph10.val[, c(l.factors, l.numeric), F])[,-1] 238 | t.pred.val <- predict(ph.cvfit, X = newXX, 239 | type = "response", 240 | lambda = ph.cvfit$lambda[idx.se]) 241 | # get CV predictions, e.g. to compute R2 242 | ph.lasso.cv.pred <- ph.cvfit$Y[,idx.se] 243 | @ 244 | 245 | Get the lasso (non-zero) coefficients of the optimal model: 246 | <>= 247 | # get the non-zero coefficients: 248 | t.coef <- ph.cvfit$fit$beta[, idx.se ] 249 | t.coef[ t.coef > 0 ] 250 | @ 251 | 252 | 253 | 254 | <>= 255 | 256 | plot(ph.cvfit) 257 | abline( h = l.se, col = "grey", lty = "dotted") 258 | abline( v = log( ph.cvfit$lambda[ idx.se ]), col = "grey30", lty = "dotted") 259 | @ 260 | 261 | 262 | \paragraph{Lasso for multinomial response} \mbox{} \nolinebreak 263 | 264 | I am not aware of a lasso implementation for multinomial responses 265 | that can handle groups of factors. Therefore, we use ``standard'' lasso from R package \texttt{glmnet} (the option \texttt{type.multinomial = "grouped"} does only ensure all coefficients of the multinomial model for the same covariate are treated as groups). 266 | 267 | 268 | <>= 269 | 270 | # create model matrix for drainage classes 271 | # use a subset of covariates only, because model optimization for 272 | # multinomial takes long otherwise 273 | 274 | set.seed(42) # makes sample() reproducible 275 | XX <- model.matrix(~.,d.drain[, l.covar[sample(1:length(l.covar), 30)]])[,-1] 276 | 277 | drain.cvfit <- cv.glmnet( XX, d.drain$dclass, nfold = 10, 278 | keep = T, # access CV results 279 | family = "multinomial", 280 | type.multinomial = "grouped") 281 | @ 282 | 283 | For getting the coefficients of the final model you run the \texttt{glmnet} function again with the selected $\lambda$. 284 | Please note: The multinomial fit results in a coefficient for each covariate and response level. 
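In addition to the refit shown in the next chunk, class predictions for new sites can be obtained directly from the cross-validated object \texttt{drain.cvfit}. The following sketch is not evaluated: \texttt{newXX.val} is a placeholder for a model matrix that you would first have to build from the validation data, using exactly the same sampled covariate subset (and factor levels) as in \texttt{XX} above.

<<lasso-multinomial-class-predictions,eval=FALSE>>=
# newXX.val: model matrix of the validation sites (same 30 covariates as XX)
t.pred.class <- predict(drain.cvfit, newx = newXX.val,
                        s = "lambda.min", # or "lambda.1se"
                        type = "class")
table(t.pred.class)
@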
285 | 286 | <>= 287 | 288 | drain.fit <- glmnet( XX, d.drain$dclass, 289 | family = "multinomial", 290 | type.multinomial = "grouped", 291 | lambda = drain.cvfit$lambda.min) 292 | # The coeffs are here: 293 | # drain.fit$beta$well 294 | # drain.fit$beta$moderate 295 | # drain.fit$beta$poor 296 | @ 297 | 298 | 299 | \paragraph{Please continue:} 300 | 301 | \begin{itemize} 302 | \item Select the lasso for a binary response (e.g. presence/absence of waterlogging \texttt{waterlog.100}). Use \texttt{family = "binomial"} in \texttt{cv.grpreg} and make sure your response is coded as 0/1. 303 | \item For the multinomial lasso fit of drainage class: compute predictions for the validation set (\texttt{predict} with \texttt{s="lambda.1se"} or \texttt{s="lambda.min"}). Then, evaluate prediction accuracy by e.g. using Pierce Skill Score, see function \texttt{verify} or \texttt{multi.cont} in R package \texttt{verification}. 304 | % t.pred.val <- predict(drain.cvfit, newx = newXX, s="lambda.min", type = "class") 305 | \end{itemize} 306 | 307 | 308 | \clearpage 309 | \section{Support vector machines} 310 | 311 | 312 | We use support vector machines (SVM) for regression from the package \texttt{kernlab} with radial kernel basis functions that fit local relations in feature space. The tuning parameter \texttt{C} defines the flexibility of the SVM to allow for wrongly predicted data points and the parameter $\sigma$ the degree of non-linearity of the radial kernel. Here we apply a two step approach to find optimal tuning parameters. \texttt{C} and $\sigma$ of a first pass are used as starting points to find optimal parameters around the first estimates. 313 | 314 | We tune the SVM with \texttt{caret} with a cross-validation. \texttt{caret} is a meta package that provides a homogenous interface to about 80 machine learning methods. 315 | 316 | <>= 317 | 318 | # We have to set up the design matrix ourselfs 319 | # (without intercept, hence remove first column) 320 | XX <- model.matrix( ~., d.ph10[, c(l.covar), F])[,-1] 321 | 322 | # set seed for random numbers to split cross-valiation sets 323 | set.seed(31) 324 | # Setup for 10fold cross-validation 325 | ctrl <- trainControl(method="cv", 326 | number=10, 327 | savePredictions = "final") 328 | 329 | # 1. pass of training - find region of C and lambda 330 | svm.tune1 <- train(x = XX, 331 | y = d.ph10[, "ph.0.10"], 332 | method = "svmRadial", # radial kernel function 333 | tuneLength = 9, # check 9 values of the cost function 334 | preProc = c("center","scale"), # center and scale data 335 | trControl=ctrl) 336 | 337 | # 2. 
pass of training - find best value for C and lambda 338 | # setup a tuning grid with values around the result of the first pass 339 | sig <- svm.tune1$bestTune$sigma 340 | t.sigma <- sort( unique( round(abs( c(sig, sig + seq(0, sig*2, by = sig/1), 341 | sig - seq(0, sig*2, by = sig/1)) ), 6))) 342 | tune.grid <- expand.grid( 343 | sigma = t.sigma[t.sigma>0], # sigma must be positive 344 | C = sort( unique( abs( c(svm.tune1$bestTune$C, 345 | svm.tune1$bestTune$C - seq(0, 0.3, by = 0.1), 346 | svm.tune1$bestTune$C + seq(0, 0.3, by = 0.1) )) )) 347 | ) 348 | #Train and Tune the SVM 349 | svm.model <- train(x = XX, 350 | y = d.ph10[, "ph.0.10"], 351 | method = "svmRadial", 352 | preProc = c("center","scale"), 353 | tuneGrid = tune.grid, 354 | trControl = ctrl) 355 | 356 | # -> if this takes too long: take a short cut with 357 | # svm.model <- svm.tune1 358 | 359 | @ 360 | 361 | 362 | <>= 363 | # create validation plots with lowess scatterplot smoothers 364 | # for cross-validation 365 | par(mfrow = c(1,2)) 366 | plot(svm.model$pred$pred, svm.model$pred$obs, 367 | xlab = "cross-validation predictions", 368 | ylab = "observed", 369 | asp = 1) 370 | abline(0,1, lty = "dashed", col = "grey") 371 | lines(lowess(svm.model$pred$pred, svm.model$pred$obs), col = "darkgreen", lwd = 2) 372 | 373 | # for independent validation set 374 | # calculate predictions for the validation set 375 | newXX <- model.matrix( ~., d.ph10.val[, l.covar, F])[,-1] 376 | t.pred.val <- predict.train(svm.model, newdata = newXX) 377 | plot(t.pred.val, d.ph10.val[, "ph.0.10"], 378 | xlab = "predictions on validation set", 379 | ylab = "observed", 380 | asp = 1) 381 | abline(0,1, lty = "dashed", col = "grey") 382 | lines(lowess(t.pred.val, d.ph10.val[, "ph.0.10"]), col = "darkgreen", lwd = 2) 383 | @ 384 | 385 | 386 | \clearpage 387 | \section{Random forest} 388 | 389 | Here we fit a random forest with the package \texttt{randomForest}. If you work with very large datasets consider using the package \texttt{ranger}. More important: use paralell computing as demonstrated here with the function \texttt{mclapply} (does not work on Windows, then use \texttt{mc.cores = 1}). 390 | 391 | This is advanced programming with functions and \texttt{apply}. But, I wanted to show how you can do the tuning of a ML method yourself without using a meta-function like \texttt{train} from package \texttt{caret} (see below). 
Like this you can control it does what it should (and the code is not much longer) 392 | 393 | <>= 394 | 395 | # Fit a random forest with default parameters 396 | # (often results are already quite good) 397 | set.seed(1) 398 | rf.model.basic <- randomForest(x = d.ph10[, l.covar ], 399 | y = d.ph10[, "ph.0.10"]) 400 | 401 | # tune main tuning parameter "mtry" 402 | # (the number of covariates that are randomly selected to try at each split) 403 | 404 | # define function to use below 405 | f.tune.randomforest <- function(test.mtry, # mtry to test 406 | d.cal, # calibration data 407 | l.covariates ){ # list of covariates 408 | # set seed 409 | set.seed(1) 410 | # fit random forest with mtry = test.mtry 411 | rf.tune <- randomForest(x = d.cal[, l.covariates ], 412 | y = d.cal[, "ph.0.10"], 413 | mtry = test.mtry) 414 | # return the mean squared error (mse) of this model fit 415 | return( tail(rf.tune$mse, n=1) ) 416 | } 417 | 418 | # vector of mtry to test 419 | seq.mtry <- c(1:(length(l.covar) - 1)) 420 | # Only take every fifth for speed reasons 421 | seq.mtry <- seq.mtry[ seq.mtry %% 5 == 0 ] 422 | 423 | # Apply function to sequence. 424 | t.OOBe <- mclapply(seq.mtry, # give sequence 425 | FUN = f.tune.randomforest, # give function name 426 | mc.cores = 1, ## number of CPUs 427 | mc.set.seed = FALSE, # do not use new seed each time 428 | # now here giv the arguments to the function: 429 | d.cal = d.ph10, 430 | l.covar = l.covar ) 431 | 432 | # Hint: Who is not comfortable with "mclapply" 433 | # the same could be achieved with 434 | # for(test.mtry in 1:m.end){ 435 | # .. content of function + vector to collect result... } 436 | 437 | # create a dataframe of the results 438 | mtry.oob <- data.frame(mtry.n = seq.mtry, mtry.OOBe = unlist(t.OOBe)) 439 | 440 | # get the mtry with the minimum MSE 441 | s.mtry <- mtry.oob$mtry.n[ which.min(mtry.oob$mtry.OOBe) ] 442 | 443 | # compute random forest with optimal mtry 444 | set.seed(1) 445 | rf.model.tuned <- randomForest(x = d.ph10[, l.covar ], 446 | y = d.ph10[, "ph.0.10"], 447 | mtry = s.mtry) 448 | @ 449 | 450 | 451 | <>= 452 | plot( mtry.oob$mtry.n, mtry.oob$mtry.OOBe, pch = 4, 453 | ylab = "out-of-bag MSE error", xlab = "mtry") 454 | abline(v = s.mtry, lty = "dashed", col = "darkgrey") 455 | lines( lowess( mtry.oob$mtry.n, mtry.oob$mtry.OOBe ), lwd = 1.5, col = "darkgrey") 456 | @ 457 | 458 | 459 | \paragraph{Please continue:} 460 | 461 | \begin{itemize} 462 | \item Compute the predictions for the validation set with the tuned and the model with default values (\texttt{predict(rf.model.tuned, newdata = ...)}) and compute a root mean squared error. Was the tuning effort worthwhile? 463 | \item Implement the same tunging with the package \texttt{caret}. Check the option \texttt{methofd="oob"} of \texttt{trainControl}. This function is handed to \texttt{train}. 464 | \end{itemize} 465 | 466 | 467 | 468 | \clearpage 469 | \section{Gradient boosting} 470 | 471 | \subsection{Boosting with trees as baselearners} 472 | 473 | There are various R packages to fit boosting models (e.g. \texttt{mboost}, \texttt{xgboost}). We use \texttt{gbm} here. We can again tune it with \texttt{caret}. \texttt{caret} is a meta package that provides a homogenous interface to about 80 machine learning methods. 
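To see which tuning parameters \texttt{caret} exposes for a given method, and which methods are registered at all, you can query its model registry:

<<caret-model-lookup>>=
# tuning parameters caret expects for method = "gbm"
modelLookup("gbm")
# number of methods known to the installed caret version
length(getModelInfo())
@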
474 | 475 | 476 | Fit gradient boosting with trees: 477 | 478 | <>= 479 | 480 | # create a grid of the tuning parameters to be tested, 481 | # main tuning parameters are: 482 | gbm.grid <- expand.grid( 483 | # how many splits does each tree have 484 | interaction.depth = c(2,5,10,15,20), 485 | # how many trees do we add (number of iterations of boosting algorithm) 486 | n.trees = seq(2,250, by = 5), 487 | # put the shrinkage factor to 0.1 (=10% updates as used 488 | # in package mboost), the default (0.1%) is a bit too small, 489 | # makes model selection too slow. 490 | # minimum number of observations per node can be left as is 491 | shrinkage = 0.1, n.minobsinnode = 10) 492 | 493 | # make tuning reproducible (there are random samples for the cross validation) 494 | set.seed(291201945) 495 | 496 | # train the gbm model 497 | # Remove "ge_caco3" throws an error since Package gbm 2.1.5, 498 | # this bug is reported: https://github.com/gbm-developers/gbm/issues/40 499 | gbm.model <- train(x=d.ph10[, l.covar[-c(50)] ], 500 | y=d.ph10[, "ph.0.10"], 501 | method = "gbm", # choose "generalized boosted regression model" 502 | tuneGrid = gbm.grid, 503 | verbose = FALSE, 504 | trControl = trainControl( 505 | # use 10fold cross validation (CV) 506 | method = "cv", number = 10, 507 | # save fitted values (e.g. to calculate RMSE of the CV) 508 | savePredictions = "final")) 509 | 510 | # print optimal tuning parameter 511 | gbm.model$bestTune 512 | @ 513 | 514 | 515 | <>= 516 | 517 | # compute predictions for the small part of the study area 518 | # (agricultural land, the empty pixels are streets, forests etc.) 519 | data("berne.grid") 520 | 521 | berne.grid$pred <- predict.train(gbm.model, newdata = berne.grid ) 522 | 523 | # create a spatial object for a proper spatial plot 524 | coordinates(berne.grid) <- ~x+y 525 | # add the Swiss projection (see ?berne.grid) 526 | # see https://epsg.io for details on projections 527 | proj4string(berne.grid) <- CRS("+init=epsg:21781") 528 | # create a raster object from the spatial point dataframe 529 | gridded(berne.grid) <- TRUE 530 | plot(raster(berne.grid, layer = "pred")) 531 | 532 | @ 533 | 534 | 535 | Lets check the partial dependencies of the 4 most important covariates: 536 | 537 | <>= 538 | 539 | # get variable importance 540 | t.imp <- varImp(gbm.model$finalModel) 541 | 542 | # check how many covariates were never selected 543 | sum( t.imp$Overall == 0 ) 544 | 545 | # order and select 4 most important covariates 546 | t.names <- dimnames(t.imp)[[1]][ order(t.imp$Overall, decreasing = T)[1:4] ] 547 | 548 | par(mfrow = c(2,2)) 549 | for( name in t.names ){ 550 | # select index of covariate 551 | ix <- which( gbm.model$finalModel$var.names == name ) 552 | plot(gbm.model$finalModel, i.var = ix) 553 | } 554 | 555 | # -> improve the plots by using the same y-axis (e.g. ylim=c(..,..)) 556 | # for all of them, and try to add labels (xlab = , ylab = ) 557 | # or a title (main = ) 558 | 559 | @ 560 | 561 | 562 | 563 | 564 | 565 | \subsection{Boosting with linear baselearners (advanced task)} 566 | 567 | Boosting algorithm can be used with any kind of base procedures / baselearners. Many packages (e.g. \texttt{gbm}, \texttt{xgboost}) use trees. Here we try linear and splines baselearners. 
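The principle of boosting with linear baselearners (componentwise L$_2$-boosting) is simple: in every iteration each covariate is fitted separately to the current residuals, and only a small fraction $\nu$ of the best-fitting one is added, which shrinks the coefficients. A purely illustrative sketch in plain R, where \texttt{X} (a centered and scaled numeric covariate matrix) and \texttt{y} (a centered response) are placeholders; this is not the \texttt{mboost} implementation used below:

<<componentwise-boosting-sketch,eval=FALSE>>=
# illustrative componentwise L2-boosting; X and y are placeholder objects
nu <- 0.1                            # step length (shrinkage)
beta <- rep(0, ncol(X))              # start with all coefficients at zero
f.hat <- rep(0, length(y))           # current fit
for( m in 1:200 ){                   # mstop = 200 boosting iterations
  r <- y - f.hat                     # current residuals
  # fit every covariate separately to the residuals, keep the best one
  rss <- sapply(1:ncol(X), function(jj){
    sum( lm.fit(X[, jj, drop = FALSE], r)$residuals^2 ) })
  jj.best <- which.min(rss)
  b <- lm.fit(X[, jj.best, drop = FALSE], r)$coefficients
  # update only the selected coefficient by the small fraction nu
  beta[jj.best] <- beta[jj.best] + nu * b
  f.hat <- f.hat + nu * X[, jj.best] * b
}
@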
568 | 569 | 570 | For details on \texttt{mboost} see the hands-on tutorial in the vignette to the package: 571 | \url{https://cran.r-project.org/web/packages/mboost/vignettes/mboost_tutorial.pdf} 572 | 573 | Select a boosting model with linear baselearners (this results in shrunken coefficients, similar to the lasso, see Hastie et al. 2009): 574 | 575 | <>= 576 | # Fit model 577 | ph.glmboost <- glmboost(ph.0.10 ~., data = d.ph10[ c("ph.0.10", l.covar)], 578 | control = boost_control(mstop = 200), 579 | center = TRUE) 580 | 581 | # Find tuning parameter: mstop = number of boosting itertations 582 | set.seed(42) 583 | ph.glmboost.cv <- cvrisk(ph.glmboost, 584 | folds = mboost::cv(model.weights(ph.glmboost), 585 | type = "kfold")) 586 | 587 | # print optimal mstop 588 | mstop(ph.glmboost.cv) 589 | 590 | ## print model with fitted coefficents 591 | # ph.glmboost[ mstop(ph.glmboost.cv)] 592 | @ 593 | 594 | 595 | <>= 596 | plot(ph.glmboost.cv) 597 | @ 598 | 599 | 600 | \subsection{Boosting with splines baselearners (advanced task)} 601 | 602 | To model non-linear relationships we use splines baselearners. Spatial autocorrelation can be captured by adding a smooth spatial surface. This type of model needs a bit more setup. Each covariate type has its own specification. All baselearners should have the same degrees of freedom, otherwise biased model selection might be the result. 603 | 604 | <>= 605 | 606 | # quick set up formula 607 | 608 | # Response 609 | f.resp <- "ph.0.10 ~ " 610 | 611 | # Intercept, add to dataframe 612 | f.int <- "bols(int, intercept = F, df = 1)" 613 | d.ph10$int <- rep(1, nrow(d.ph10)) 614 | 615 | # Smooth spatial surface (needs > 4 degrees of freedom) 616 | f.spat <- "bspatial(x, y, df = 5, knots = 12)" 617 | 618 | # Linear baselearners for factors, maybe use df = 5 619 | f.fact <- paste( 620 | paste( "bols(", l.factors, ", intercept = F)" ), 621 | collapse = "+" 622 | ) 623 | 624 | # Splines baselearners for continuous covariates 625 | f.num <- paste( 626 | paste( "bbs(", l.numeric, ", center = T, df = 5)" ), 627 | collapse = "+" 628 | ) 629 | 630 | # create complete formula 631 | ph.form <- as.formula( paste( f.resp, 632 | paste( c(f.int, f.num, f.spat, f.fact), 633 | collapse = "+")) ) 634 | # fit the boosting model 635 | ph.gamboost <- gamboost(ph.form, data = d.ph10, 636 | control = boost_control(mstop = 200)) 637 | 638 | # Find tuning parameter 639 | ph.gamboost.cv <- cvrisk(ph.gamboost, 640 | folds = mboost::cv(model.weights(ph.gamboost), 641 | type = "kfold")) 642 | @ 643 | 644 | Analyse boosting model: 645 | 646 | <>= 647 | # print optimal mstop 648 | mstop(ph.gamboost.cv) 649 | 650 | ## print model info 651 | ph.gamboost[ mstop(ph.glmboost.cv)] 652 | ## print number of chosen baselearners 653 | length( t.sel <- summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob ) 654 | 655 | # Most often selected were: 656 | summary( ph.gamboost[ mstop(ph.glmboost.cv)] )$selprob[1:5] 657 | @ 658 | 659 | <>= 660 | par(mfrow=c(2,2) ) 661 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = names(t.sel[1:4]) ) 662 | @ 663 | 664 | <>= 665 | par(mfrow=c(1,1) ) 666 | plot(ph.gamboost[ mstop(ph.glmboost.cv)], which = grep("bspat", names(t.sel), value = T) ) 667 | @ 668 | 669 | \clearpage 670 | 671 | 672 | \section{Model averaging} 673 | 674 | So far we calibrated several models to predict topsoil pH. With model averaging we can combine these predictions computing a simple \texttt{mean}. 
Besides simple averaging, we could use weights like $\frac{1}{MSE}$ (make sure they sum up to 1). 675 | 676 | Compute validation statistics (e.g. root mean squared error, R$^2$) on the validation set for the predictions of each model and the (weighted) averaged predictions. Is the prediction accuracy improved? 677 | 678 | You could now add models computed from random forest, support vector machines or gradient boosted trees. Does this improve model accuracy? 679 | 680 | \bigskip 681 | 682 | Note: Be aware not to select the final model based on the validation data. If you start tuning your predictions on your validation data, you loose the independent estimate of prediction accuracy... better choose your method for the final predictions based on cross validation (e.g. on the same sets). 683 | 684 | 685 | 686 | 687 | \bigskip 688 | \section*{R session information} 689 | 690 | \footnotesize 691 | This document was generated with: 692 | <>= 693 | toLatex(sessionInfo(), locale = FALSE) 694 | @ 695 | \normalsize 696 | 697 | <>= 698 | # purl("OpenGeoHub-machine-learning-training-1.Rnw") 699 | @ 700 | 701 | 702 | \end{document} 703 | -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/exercises/OpenGeoHub-machine-learning-training-1.pdf -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-2.R: -------------------------------------------------------------------------------- 1 | ## ----general-options,echo=FALSE------------------------------------------ 2 | # This code is used to generate the PDF (knitr report) 3 | library(knitr) 4 | # output code, but no warnings 5 | opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE) 6 | # auto check dependencies (of cached chunks, its an approximation only) 7 | opts_chunk$set(autodep = TRUE) 8 | # dep_auto() # print dependencies 9 | 10 | ## ----load-packages,message=FALSE----------------------------------------- 11 | library(randomForest) # for random forest models 12 | library(quantregForest) # for quantile random forest 13 | library(grpreg) # for group lasso 14 | library(geoGAM) # for the Berne test data set 15 | 16 | ## ----read-in-data-------------------------------------------------------- 17 | dim(berne) 18 | # Select soil pH in 0-10 cm as continuous response, 19 | # select calibration data and remove rows with missing pH 20 | d.ph10 <- berne[ berne$dataset == "calibration" & !is.na(berne$ph.0.10), ] 21 | d.ph10 <- d.ph10[ complete.cases(d.ph10[13:ncol(d.ph10)]), ] 22 | # covariates start at col 13 23 | l.covar <- names(d.ph10[, 13:ncol(d.ph10)]) 24 | 25 | ## ----fit-random-forest,cache=TRUE---------------------------------------- 26 | set.seed(17) 27 | rf.ph <- randomForest(x = d.ph10[, l.covar], 28 | y = d.ph10$ph.0.10) 29 | 30 | ## ----plot-covar-importance, fig.width=5, fig.height=6, fig.align='center', fig.pos="!hb", out.width='0.5\\textwidth', fig.cap="Covariate importance of 20 most important covariates for topsoil pH (before selection)."---- 31 | varImpPlot(rf.ph, n.var = 20, main = "") 32 | 33 | ## ----select-random-forest,cache=TRUE------------------------------------- 34 | # speed up the process by removing 5-10 covariates at a time 35 | s.seq <- sort( c( seq(5, 95, by = 5), 36 | seq(100, 
length(l.covar), by = 10) ), 37 | decreasing = T) 38 | 39 | # collect results in list 40 | qrf.elim <- oob.mse <- list() 41 | 42 | # save model and OOB error of current fit 43 | qrf.elim[[1]] <- rf.ph 44 | oob.mse[[1]] <- tail(qrf.elim[[1]]$mse, n=1) 45 | l.covar.sel <- l.covar 46 | 47 | # Iterate through number of retained covariates 48 | for( ii in 1:length(s.seq) ){ 49 | t.imp <- importance(qrf.elim[[ii]], type = 2) 50 | t.imp <- t.imp[ order(t.imp[,1], decreasing = T),] 51 | 52 | qrf.elim[[ii+1]] <- randomForest(x = d.ph10[, names(t.imp[1:s.seq[ii]])], 53 | y = d.ph10$ph.0.10 ) 54 | oob.mse[[ii+1]] <- tail(qrf.elim[[ii+1]]$mse,n=1) 55 | 56 | } 57 | 58 | 59 | # Prepare a data frame for plot 60 | elim.oob <- data.frame(elim.n = c(length(l.covar), s.seq[1:length(s.seq)]), 61 | elim.OOBe = unlist(oob.mse) ) 62 | 63 | ## ----plot-selection-path,fig.align='center',echo=FALSE,fig.height = 5,out.width='0.8\\textwidth',fig.cap = "Path of out-of-bag mean squared error as covariates are removed. Minimum is found at 55 covariates."---- 64 | 65 | plot(elim.oob$elim.n, elim.oob$elim.OOBe, 66 | ylab = "OOB error (MSE)", 67 | xlab = "n covariates", 68 | pch = 20) 69 | abline(v = elim.oob$elim.n[ which.min(elim.oob$elim.OOBe)], lty = "dotted") 70 | 71 | ## ----partial-residual-plots-lm-lasso,fig.width=7,fig.height=4, fig.align='center', out.width='0.9\\textwidth',fig.cap = "Partial residual plots for a climate covariate in the ordinary least squares fit and the lasso."---- 72 | # create a linear model (example, with covariates from lasso) 73 | ols <- lm( ph.0.10 ~ timeset + ge_geo500h3id + cl_mt_gh_4 + 74 | tr_se_curvplan2m_std_25c, data = d.ph10 ) 75 | par(mfrow = c(1,2)) # two plots on same figure 76 | # residual plot for covariate cl_mt_gh_4 77 | termplot(ols, partial.resid = T, terms = "cl_mt_gh_4", 78 | ylim = c(-2,2), 79 | main = "Ordinary Least Squares") 80 | abline(h=0, lty = 2) 81 | 82 | ## Create partial residual plot for lasso 83 | # there is no direct function available, but we can easily 84 | # construct the plot with 85 | # y-axis: residuals + effect of term (XBi), scaled 86 | # x-axis: values covariate 87 | # regression line: model fit of axis y~x 88 | 89 | ## First setup and fit the model 90 | l.factors <- names(d.ph10[l.covar])[ 91 | t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ] 92 | l.numeric <- names(t.f[ !t.f ]) 93 | # create a vector that labels the groups with the same number 94 | g.groups <- c( 1:length(l.numeric), 95 | unlist( 96 | sapply(1:length(l.factors), function(n){ 97 | rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1) 98 | }) 99 | ) 100 | ) 101 | # grpreg needs model matrix as input 102 | XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1] 103 | # cross validation (CV) to find lambda 104 | ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10, 105 | group = g.groups, 106 | penalty = "grLasso", 107 | returnY = T) # access CV results 108 | # choose optimal lambda: CV minimum error + 1 SE (see glmnet) 109 | l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ] 110 | idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1 111 | 112 | # get the non-zero coefficients: 113 | t.coef <- ph.cvfit$fit$beta[, idx.se ] 114 | 115 | # get the index of the covariate 116 | idx <- which( names(t.coef) == "cl_mt_gh_4" ) 117 | 118 | # residuals of lasso model chosen above 119 | residuals <- d.ph10$ph.0.10 - ph.cvfit$Y[,idx.se] 120 | # prediction for this covariate XBi 121 | Xbeta <- ph.cvfit$fit$beta[idx, idx.se] * d.ph10$cl_mt_gh_4 122 | # calculate 
partial residuals and center with mean 123 | part.resid <- scale(residuals + Xbeta, scale = F)[,1] 124 | 125 | # plot with similar settings 126 | plot(d.ph10$cl_mt_gh_4, 127 | part.resid, pch = 1, col = "grey", 128 | ylim = c(-2,2), 129 | ylab = "partial residuals [%]", xlab = "cl_mt_gh_4", 130 | main = "Lasso") 131 | abline(lm(part.resid ~ d.ph10$cl_mt_gh_4), col = "red") 132 | abline(h=0, lty = 2) 133 | 134 | ## ----partial-dep-rf,fig.width=7,fig.height=8, fig.align='center', out.width='0.9\\textwidth',fig.cap = "Partial dependence plots for the 4 most important covariates."---- 135 | # select the model with minimum OOB error 136 | rf.selected <- qrf.elim[[ which.min(elim.oob$elim.OOBe)]] 137 | 138 | t.imp <- importance(rf.selected, type = 2) 139 | t.imp <- t.imp[ order(t.imp[,1], decreasing = T),] 140 | 141 | # 4 most important covariates 142 | ( t.3 <- names( t.imp[ 1:4 ] ) ) 143 | 144 | par( mfrow = c(2,2)) 145 | 146 | # Bug in partialPlot(): function does not allow a variable for the 147 | # covariate name (e. g. x.var = name) in a loop 148 | partialPlot(x = rf.selected, 149 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 150 | x.var = "cl_mt_rr_3", ylab = "ph [-]", main = "") 151 | partialPlot(x = rf.selected, 152 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 153 | x.var = "cl_mt_rr_11", ylab = "ph [-]", main = "" ) 154 | partialPlot(x = rf.selected, 155 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 156 | x.var = "timeset", ylab = "ph [-]", main = "" ) 157 | partialPlot(x = rf.selected, 158 | pred.data = d.ph10[, names(rf.selected$forest$xlevels)], 159 | x.var = "cl_mt_rr_y", ylab = "ph [-]", main = "" ) 160 | 161 | 162 | ## ----quantRF,cache=TRUE-------------------------------------------------- 163 | # Fit quantile regression forest 164 | ph.quantRF <- quantregForest(x = d.ph10[, l.covar[1:30]], 165 | y = d.ph10$ph.0.10) 166 | 167 | # select validation data 168 | d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ] 169 | d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[l.covar]), ] 170 | 171 | # compute predictions (mean) for each validation site 172 | # (use function from random forest package) 173 | ph.pred <- randomForest:::predict.randomForest(ph.quantRF, 174 | newdata = d.ph10.val) 175 | 176 | ## ----investigate-single-point,echo=FALSE,fig.pos='!h',fig.height=5,fig.width=4,fig.align='center', out.width='0.4\\textwidth',fig.cap= "Histogram of predictive distribution for one single prediction point (dotted lines: 90 \\% prediction interval, dashed line: mean prediction)."---- 177 | 178 | ## predict 0.01, 0.02,..., 0.99 quantiles for validation data 179 | ph.pred.distribution <- predict(ph.quantRF, 180 | newdata = d.ph10.val, 181 | what = seq(0.01, 0.99, by = 0.01)) 182 | 183 | # plot predictive distribution for one site 184 | sel.site <- 12 185 | hist( ph.pred.distribution[sel.site,], 186 | col = "grey", main = "", 187 | xlab = "predicted pH [-]", breaks = 12) 188 | 189 | # add 90 % prediction interval to plot 190 | abline(v = c( ph.pred.distribution[sel.site, "quantile= 0.05"], 191 | ph.pred.distribution[sel.site, "quantile= 0.95"]), 192 | lty = "dotted") 193 | abline(v = ph.pred[sel.site], lty = "dashed") 194 | 195 | ## ----create-intervall-plot,fig.height=5,fig.align='center',echo=FALSE, out.width='0.8\\textwidth',fig.cap= "Coverage of 90 \\%-prediction intervals computed by model-based boostrap."---- 196 | 197 | # get 90% quantiles for each point 198 | t.quant90 <- cbind( 199 | ph.pred.distribution[, "quantile= 
0.05"], 200 | ph.pred.distribution[, "quantile= 0.95"]) 201 | 202 | # get index for ranking in the plot 203 | t.ix <- sort( ph.pred, index.return = T )$ix 204 | 205 | # plot predictions in increasing order 206 | plot( 207 | ph.pred[t.ix], type = "n", 208 | ylim = range(c(t.quant90, ph.pred, d.ph10.val$ph.0.10)), 209 | xlab = "rank of predictions", 210 | ylab = "ph [-]" 211 | ) 212 | 213 | # add prediction intervals 214 | segments( 215 | 1:nrow( d.ph10.val ), 216 | t.lower <- (t.quant90[,1])[t.ix], 217 | 1:nrow( d.ph10.val ), 218 | t.upper <- (t.quant90[,2])[t.ix], 219 | col = "grey" 220 | ) 221 | 222 | # select colour for dots outside of intervals 223 | t.col <- sapply( 224 | 1:length( t.ix ), 225 | function( i, x, lower, upper ){ 226 | as.integer( cut( x[i], c( -Inf, lower[i]-0.000001, 227 | x[i], upper[i]+0.000001, Inf ) ) ) 228 | }, 229 | x = d.ph10.val$ph.0.10[t.ix], 230 | lower = t.lower, upper = t.upper 231 | ) 232 | 233 | # add observed values on top 234 | points( 235 | 1:nrow( d.ph10.val ), 236 | d.ph10.val$ph.0.10[t.ix], cex = 0.7, 237 | pch = c( 16, 1, 16)[t.col], 238 | col = c( "darkgreen", "black", "darkgreen" )[t.col] 239 | ) 240 | points(ph.pred[t.ix], pch = 16, cex = 0.6, col = "grey60") 241 | 242 | # Add meaningfull legend 243 | legend( "topleft", 244 | bty = "n", cex = 0.85, 245 | pch = c( NA, 16, 1, 16 ), pt.cex = 0.6, lwd = 1, 246 | lty = c( 1, NA, NA, NA ), 247 | col = c( "grey", "grey60", "black", "darkgreen" ), 248 | seg.len = 0.8, 249 | legend = c( 250 | "90 %-prediction interval", 251 | paste0("prediction (n = ", nrow(d.ph10.val), ")"), 252 | paste0("observation within interval (n = ", 253 | sum( t.col %in% c(2) ), ")" ), 254 | paste0("observation outside interval (n = ", 255 | sum( t.col %in% c(1,3)), ", ", 256 | round(sum(t.col %in% c(1,3)) / 257 | nrow(d.ph10.val)*100,1), "%)") ) 258 | ) 259 | 260 | ## ----create-coverage-probabilty-plots,fig.align='center', fig.pos = "h", fig.width=4,fig.height=4.5, out.width='0.45\\textwidth',fig.cap="Coverage probabilities of one-sided prediction intervals computed for the validation data set of topsoil pH of the Berne study area."---- 261 | 262 | # Coverage probabilities plot 263 | # create sequence of nominal probabilities 264 | ss <- seq(0,1,0.01) 265 | # compute coverage for sequence 266 | t.prop.inside <- sapply(ss, function(ii){ 267 | boot.quantile <- t( apply(ph.pred.distribution, 1, quantile, 268 | probs = c(0,ii) ) )[,2] 269 | return( sum(boot.quantile <= d.ph10.val$ph.0.10)/nrow(d.ph10.val) ) 270 | }) 271 | 272 | plot(x = ss, y = t.prop.inside[length(ss):1], 273 | type = "l", asp = 1, 274 | ylab = "coverage probabilities", 275 | xlab="nominal probabilities" ) 276 | # add 1:1-line 277 | abline(0,1, lty = 2, col = "grey60") 278 | # add lines of the two-sided 90 %-prediction interval 279 | abline(v = c(0.05, 0.95), lty = "dotted", col = "grey20") 280 | 281 | ## ----session-info,results='asis'----------------------------------------- 282 | toLatex(sessionInfo(), locale = FALSE) 283 | 284 | ## ----export-r-code,echo=FALSE-------------------------------------------- 285 | #purl("GEOSTAT-machine-learning-training-2.Rnw") 286 | 287 | -------------------------------------------------------------------------------- /exercises/OpenGeoHub-machine-learning-training-2.Rnw: -------------------------------------------------------------------------------- 1 | % !TeX spellcheck = en_US 2 | %#------------------------------------------------------------------------------ 3 | %# Name: OpenGeoHub-machine-learning-training-2.Rnw 4 | %# 
%#
%# Content: Exercises for OpenGeoHub Summer School, Series 2
%#
%# Author: Madlene Nussbaum, BFH-HAFL
%# Date: May 2018
%# Licence: CC-BY-NC-SA
%#------------------------------------------------------------------------------

\documentclass[11pt,a4paper,twoside]{article}

\usepackage[utf8]{inputenc}
\usepackage{blindtext} % for blind text
\usepackage{hyperref} % links in table of contents
\usepackage[english]{babel}
\usepackage{amsthm} % for the rexample environment
\usepackage{natbib}
\usepackage[iso,german]{isodate}
\usepackage{subcaption}

\newtheorem{rexample}{R Example}[section]

% Some colors for the links
\definecolor{darkblue}{rgb}{0,0,0.5}
\definecolor{darkmagenta}{rgb}{0.5,0,0.5}
\definecolor{darkgreen}{rgb}{0,0.4,0}
\definecolor{darkred}{rgb}{0.7,0,0}

\hypersetup{
draft=false,
colorlinks=true,linkcolor=darkblue,citecolor=darkred,urlcolor=darkgreen,
breaklinks=true, bookmarksnumbered=true
}

% % Headers
\usepackage{fancyhdr}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[OL]{\leftmark}% odd page, left
\fancyhead[OR]{\thepage}% odd page, right
\fancyhead[EL]{\thepage}% even page, left
\fancyhead[ER]{\leftmark}% even page, right
\renewcommand{\headrulewidth}{0.4pt}
\fancyheadoffset[R]{1pt}

% captions in bold
\usepackage[font = {small}, labelfont = bf]{caption}

% format page borders
\usepackage[a4paper]{geometry}
\geometry{verbose,tmargin=2.5cm,bmargin=2.3cm,lmargin=2.5cm,rmargin=3cm,headheight=1.7cm,headsep=0.9cm,footskip=1.5cm}

% no indents
\setlength\parindent{0pt}
\setlength{\parskip}{3pt}

% Top align logos on title page
\def\imagebox#1#2{\vtop to #1{\null\hbox{#2}\vfill}}

\newcommand{\bskip}{\vspace{0.7cm}}

\begin{document}

% % Logos
\begin{figure}
\centering
\begin{subfigure}[t]{0.27\textwidth}
% \includegraphics[width=\textwidth]{BFH-Logo.pdf}
\imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/BFH-Logo.pdf}}
\end{subfigure}
\hfill
\begin{subfigure}[t]{0.3\textwidth}
% \includegraphics[width=\textwidth]{3455_original_isric.png}
\imagebox{37.5mm}{\includegraphics[width=\textwidth]{figure/logo-opengeohub.png}}
\end{subfigure}
\end{figure}

% Title
\vspace{5cm}
{\LARGE\textsf{OpenGeoHub Summer School}}

\vspace{0.7cm}
{\Large\textbf{\textsf{Mastering Machine Learning for Spatial Prediction II} }}

\vspace{0.3cm}

{\Large\textsf{Practical training} }

\vspace{0.5cm}
\textsf{Madlene Nussbaum, 4/5 September 2019}

{ \small \textsf{\copyright~ CC-BY-NC-SA } }
\bigskip


% Table of contents (with empty back page)
\setlength{\parskip}{0pt}
\tableofcontents
\thispagestyle{empty}
\setlength{\parskip}{4pt}

% \newpage
% \mbox{}
% \thispagestyle{empty}
% \newpage

% --------


<<>>=
# This code is used to generate the PDF (knitr report)
library(knitr)
# output code, but no warnings
opts_chunk$set(echo = TRUE,eval=TRUE,warning=FALSE)
# auto check dependencies (of cached chunks, it's an approximation only)
opts_chunk$set(autodep = TRUE)
# dep_auto() # print dependencies
@


\section*{Preparation}

Load needed packages:

<<>>=
library(randomForest) # for random forest models
library(quantregForest) # for quantile random forest
library(grpreg) # for group lasso
library(geoGAM) # for the Berne test data set
@


Load the \textsl{Berne} data again, select the calibration set and remove observations with missing values in the covariates.

<<>>=
dim(berne)
# Select soil pH in 0-10 cm as continuous response,
# select calibration data and remove rows with missing pH
d.ph10 <- berne[ berne$dataset == "calibration" & !is.na(berne$ph.0.10), ]
d.ph10 <- d.ph10[ complete.cases(d.ph10[13:ncol(d.ph10)]), ]
# covariates start at col 13
l.covar <- names(d.ph10[, 13:ncol(d.ph10)])
@


\section{Selection of covariates}

For tree-based ensemble methods, covariate importance can be computed. Based on this measure, irrelevant covariates can be excluded, which may improve model performance.

Fit a random forest model:

<<>>=
set.seed(17)
rf.ph <- randomForest(x = d.ph10[, l.covar],
                      y = d.ph10$ph.0.10)
@

Create the importance plot:

<<>>=
varImpPlot(rf.ph, n.var = 20, main = "")
@


Then, reduce the covariates by recursive backward elimination based on covariate importance (mean decrease in node impurity, \texttt{type = 2} in \texttt{importance()}):

<<>>=
# speed up the process by removing 5-10 covariates at a time
s.seq <- sort( c( seq(5, 95, by = 5),
                  seq(100, length(l.covar), by = 10) ),
               decreasing = T)

# collect results in list
qrf.elim <- oob.mse <- list()

# save model and OOB error of current fit
qrf.elim[[1]] <- rf.ph
oob.mse[[1]] <- tail(qrf.elim[[1]]$mse, n=1)
l.covar.sel <- l.covar

# Iterate through number of retained covariates
for( ii in 1:length(s.seq) ){
  t.imp <- importance(qrf.elim[[ii]], type = 2)
  t.imp <- t.imp[ order(t.imp[,1], decreasing = T),]

  qrf.elim[[ii+1]] <- randomForest(x = d.ph10[, names(t.imp[1:s.seq[ii]])],
                                   y = d.ph10$ph.0.10 )
  oob.mse[[ii+1]] <- tail(qrf.elim[[ii+1]]$mse,n=1)

}


# Prepare a data frame for plot
elim.oob <- data.frame(elim.n = c(length(l.covar), s.seq[1:length(s.seq)]),
                       elim.OOBe = unlist(oob.mse) )
@

<<>>=

plot(elim.oob$elim.n, elim.oob$elim.OOBe,
     ylab = "OOB error (MSE)",
     xlab = "n covariates",
     pch = 20)
abline(v = elim.oob$elim.n[ which.min(elim.oob$elim.OOBe)], lty = "dotted")
@


\paragraph{Please continue:}

\begin{itemize}
\item Optimize $m_{try}$ before you start the covariate selection (function \texttt{train}, package \texttt{caret}). How much does the OOB error decrease? Are both steps (tuning, selection) worth the effort in terms of prediction performance?
\item Implement the same covariate selection for gradient boosting with trees as base learners (package \texttt{gbm} or \texttt{caret}). Do you find the same covariates in the final set? Why do you expect differences?
\end{itemize}


\section{Model interpretation}
\subsection{Partial residual plots}

To demonstrate the principle, we first create a partial residual plot for an ordinary least squares fit.
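As a reminder (this is the generic textbook definition, not tied to a particular package), the partial residuals of a covariate $x_j$ are the model residuals $e_i$ plus the estimated contribution of that covariate,
\[
r_{ij} = e_i + \hat{\beta}_j \, x_{ij} ,
\]
so plotting them against $x_j$ shows the modelled effect of $x_j$ together with the scatter that remains after accounting for all other covariates. For the least squares fit, \texttt{termplot()} produces such a plot directly.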
Then, we add the same plot for the lasso fitted to the topsoil pH data above, constructing the partial residuals by hand:

<<>>=
# create a linear model (example, with covariates from lasso)
ols <- lm( ph.0.10 ~ timeset + ge_geo500h3id + cl_mt_gh_4 +
             tr_se_curvplan2m_std_25c, data = d.ph10 )
par(mfrow = c(1,2)) # two plots on same figure
# residual plot for covariate cl_mt_gh_4
termplot(ols, partial.resid = T, terms = "cl_mt_gh_4",
         ylim = c(-2,2),
         main = "Ordinary Least Squares")
abline(h=0, lty = 2)

## Create partial residual plot for lasso
# there is no ready-made function available, but we can easily
# construct the plot with
# y-axis: residuals + effect of the term (Xbeta), centered
# x-axis: covariate values
# regression line: linear fit of y ~ x

## First set up and fit the model
l.factors <- names(d.ph10[l.covar])[
  t.f <- unlist( lapply(d.ph10[l.covar], is.factor) ) ]
l.numeric <- names(t.f[ !t.f ])
# create a vector that labels the groups with the same number
g.groups <- c( 1:length(l.numeric),
               unlist(
                 sapply(1:length(l.factors), function(n){
                   rep(n+length(l.numeric), nlevels(d.ph10[, l.factors[n]])-1)
                 })
               )
)
# grpreg needs model matrix as input
XX <- model.matrix( ~., d.ph10[, c(l.numeric, l.factors), F])[,-1]
# cross validation (CV) to find lambda
ph.cvfit <- cv.grpreg(X = XX, y = d.ph10$ph.0.10,
                      group = g.groups,
                      penalty = "grLasso",
                      returnY = T) # access CV results
# choose optimal lambda: CV minimum error + 1 SE (see glmnet)
l.se <- ph.cvfit$cvse[ ph.cvfit$min ] + ph.cvfit$cve[ ph.cvfit$min ]
idx.se <- min( which( ph.cvfit$cve < l.se ) ) - 1

# get the non-zero coefficients:
t.coef <- ph.cvfit$fit$beta[, idx.se ]

# get the index of the covariate
idx <- which( names(t.coef) == "cl_mt_gh_4" )

# residuals of lasso model chosen above
residuals <- d.ph10$ph.0.10 - ph.cvfit$Y[,idx.se]
# prediction for this covariate (Xbeta)
Xbeta <- ph.cvfit$fit$beta[idx, idx.se] * d.ph10$cl_mt_gh_4
# calculate partial residuals and center with mean
part.resid <- scale(residuals + Xbeta, scale = F)[,1]

# plot with similar settings
plot(d.ph10$cl_mt_gh_4,
     part.resid, pch = 1, col = "grey",
     ylim = c(-2,2),
     ylab = "partial residuals [-]", xlab = "cl_mt_gh_4",
     main = "Lasso")
abline(lm(part.resid ~ d.ph10$cl_mt_gh_4), col = "red")
abline(h=0, lty = 2)
@



\clearpage
\subsection{Partial dependence plots}

The most important covariates of a random forest model can be interpreted with partial dependence plots. Keep in mind, however, that the covariates remaining after model selection might still be multi-collinear; hence, covariates might be exchangeable.

Partial dependence plots are a general method and can also be applied to other supervised machine learning methods.

<<partial-dep-rf,fig.width=7,fig.height=8, fig.align='center', out.width='0.9\\textwidth',fig.cap = "Partial dependence plots for the 4 most important covariates.">>=
# select the model with minimum OOB error
rf.selected <- qrf.elim[[ which.min(elim.oob$elim.OOBe)]]

t.imp <- importance(rf.selected, type = 2)
t.imp <- t.imp[ order(t.imp[,1], decreasing = T),]

# 4 most important covariates
( t.3 <- names( t.imp[ 1:4 ] ) )

par( mfrow = c(2,2))

# Bug in partialPlot(): function does not allow a variable for the
# covariate name (e.g. x.var = name) in a loop
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "cl_mt_rr_3", ylab = "ph [-]", main = "")
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "cl_mt_rr_11", ylab = "ph [-]", main = "" )
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "timeset", ylab = "ph [-]", main = "" )
partialPlot(x = rf.selected,
            pred.data = d.ph10[, names(rf.selected$forest$xlevels)],
            x.var = "cl_mt_rr_y", ylab = "ph [-]", main = "" )

@


\paragraph{Please continue:}

\begin{itemize}
\item Create partial dependence plots for the boosted trees model (\texttt{?plot.gbm}, \texttt{plot(.., i.var = ..)}). Do you find the same relationships?
\item What do you conclude from the plots? Are the plotted covariates good predictors?
\end{itemize}


\clearpage
\section{Prediction uncertainty with quantile regression forest}

When reporting predictions, it is important to provide their uncertainty along with them. For any method that does not yield uncertainty estimates from the method itself (e.g. kriging variances), a non-parametric or a model-based bootstrap approach can be used. Quantile regression forest computes quantiles of the predictions directly from the bootstrap (bootstrap aggregation, bagging) that is done within random forest.

<<quantRF,cache=TRUE>>=
# Fit quantile regression forest
ph.quantRF <- quantregForest(x = d.ph10[, l.covar[1:30]],
                             y = d.ph10$ph.0.10)

# select validation data
d.ph10.val <- berne[berne$dataset == "validation" & !is.na(berne$ph.0.10), ]
d.ph10.val <- d.ph10.val[complete.cases(d.ph10.val[l.covar]), ]

# compute predictions (mean) for each validation site
# (use function from random forest package)
ph.pred <- randomForest:::predict.randomForest(ph.quantRF,
                                               newdata = d.ph10.val)
@

<<investigate-single-point,echo=FALSE,fig.pos='!h',fig.height=5,fig.width=4,fig.align='center', out.width='0.4\\textwidth',fig.cap= "Histogram of the predictive distribution for a single prediction point (dotted lines: 90 \\% prediction interval, dashed line: mean prediction).">>=

## predict 0.01, 0.02,..., 0.99 quantiles for validation data
ph.pred.distribution <- predict(ph.quantRF,
                                newdata = d.ph10.val,
                                what = seq(0.01, 0.99, by = 0.01))

# plot predictive distribution for one site
sel.site <- 12
hist( ph.pred.distribution[sel.site,],
      col = "grey", main = "",
      xlab = "predicted pH [-]", breaks = 12)

# add 90 % prediction interval to plot
abline(v = c( ph.pred.distribution[sel.site, "quantile= 0.05"],
              ph.pred.distribution[sel.site, "quantile= 0.95"]),
       lty = "dotted")
abline(v = ph.pred[sel.site], lty = "dashed")
@


Plot for the evaluation of the prediction intervals (as shown in the presentation):

<<create-intervall-plot,fig.height=5,fig.align='center',echo=FALSE, out.width='0.8\\textwidth',fig.cap= "Coverage of 90 \\%-prediction intervals computed with quantile regression forest.">>=

# get 90% quantiles for each point
t.quant90 <- cbind(
  ph.pred.distribution[, "quantile= 0.05"],
  ph.pred.distribution[, "quantile= 0.95"])

# get index for ranking in the plot
t.ix <- sort( ph.pred, index.return = T )$ix

# plot predictions in increasing order
plot(
  ph.pred[t.ix], type = "n",
  ylim = range(c(t.quant90, ph.pred, d.ph10.val$ph.0.10)),
  xlab = "rank of predictions",
  ylab = "ph [-]"
)

# add prediction intervals
segments(
  1:nrow( d.ph10.val ),
  t.lower <- (t.quant90[,1])[t.ix],
  1:nrow( d.ph10.val ),
  t.upper <- (t.quant90[,2])[t.ix],
  col = "grey"
)

# select colour for dots outside of intervals
t.col <- sapply(
  1:length( t.ix ),
  function( i, x, lower, upper ){
    as.integer( cut( x[i], c( -Inf, lower[i]-0.000001,
                              x[i], upper[i]+0.000001, Inf ) ) )
  },
  x = d.ph10.val$ph.0.10[t.ix],
  lower = t.lower, upper = t.upper
)

# add observed values on top
points(
  1:nrow( d.ph10.val ),
  d.ph10.val$ph.0.10[t.ix], cex = 0.7,
  pch = c( 16, 1, 16)[t.col],
  col = c( "darkgreen", "black", "darkgreen" )[t.col]
)
points(ph.pred[t.ix], pch = 16, cex = 0.6, col = "grey60")

# Add meaningful legend
legend( "topleft",
        bty = "n", cex = 0.85,
        pch = c( NA, 16, 1, 16 ), pt.cex = 0.6, lwd = 1,
        lty = c( 1, NA, NA, NA ),
        col = c( "grey", "grey60", "black", "darkgreen" ),
        seg.len = 0.8,
        legend = c(
          "90 %-prediction interval",
          paste0("prediction (n = ", nrow(d.ph10.val), ")"),
          paste0("observation within interval (n = ",
                 sum( t.col %in% c(2) ), ")" ),
          paste0("observation outside interval (n = ",
                 sum( t.col %in% c(1,3)), ", ",
                 round(sum(t.col %in% c(1,3)) /
                       nrow(d.ph10.val)*100,1), "%)") )
)
@




<<create-coverage-probabilty-plots,fig.align='center', fig.pos = "h", fig.width=4,fig.height=4.5, out.width='0.45\\textwidth',fig.cap="Coverage probabilities of one-sided prediction intervals computed for the validation data set of topsoil pH of the Berne study area.">>=

# Coverage probabilities plot
# create sequence of nominal probabilities
ss <- seq(0,1,0.01)
# compute coverage for sequence
t.prop.inside <- sapply(ss, function(ii){
  boot.quantile <- t( apply(ph.pred.distribution, 1, quantile,
                            probs = c(0,ii) ) )[,2]
  return( sum(boot.quantile <= d.ph10.val$ph.0.10)/nrow(d.ph10.val) )
})

plot(x = ss, y = t.prop.inside[length(ss):1],
     type = "l", asp = 1,
     ylab = "coverage probabilities",
     xlab="nominal probabilities" )
# add 1:1-line
abline(0,1, lty = 2, col = "grey60")
# add lines of the two-sided 90 %-prediction interval
abline(v = c(0.05, 0.95), lty = "dotted", col = "grey20")
@


\paragraph{Please continue:}

\begin{itemize}
\item Are you satisfied with the prediction intervals?
\item Create maps of the prediction intervals for the \texttt{berne.grid} data. Is there much spatial structure in the uncertainty?
\end{itemize}



\bigskip
\section*{R session information}

\footnotesize
This document was generated with:
<<session-info,results='asis'>>=
toLatex(sessionInfo(), locale = FALSE)
@
\normalsize

<<export-r-code,echo=FALSE>>=
purl("OpenGeoHub-machine-learning-training-2.Rnw")
@


\end{document}

--------------------------------------------------------------------------------
/exercises/OpenGeoHub-machine-learning-training-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/exercises/OpenGeoHub-machine-learning-training-2.pdf
--------------------------------------------------------------------------------
/presentation/OpenGeoHub-machine-learning-lecture-1-handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/presentation/OpenGeoHub-machine-learning-lecture-1-handout.pdf
--------------------------------------------------------------------------------
/presentation/OpenGeoHub-machine-learning-lecture-2-handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/presentation/OpenGeoHub-machine-learning-lecture-2-handout.pdf
--------------------------------------------------------------------------------
/presentation/OpenGeoHub-machine-learning-lecture-2-teaser_overview-soil-mapping-study.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mnocci/2019_OpenGeoHub_machine-learning-madlene/2dcb94fea24e5f3c50e77f04df5b4570e066af25/presentation/OpenGeoHub-machine-learning-lecture-2-teaser_overview-soil-mapping-study.pdf
--------------------------------------------------------------------------------