├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── GWRFC.Rproj
├── NAMESPACE
├── R
    ├── GWRFC.R
    ├── LVIclus.R
    ├── deforestation.R
    └── plotJPEG.R
├── README.md
├── data
    └── deforestation.rda
└── man
    ├── GWRFC.Rd
    ├── LVIclust.Rd
    ├── deforestation.Rd
    └── plotJPEG.Rd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: GWRFC
 2 | Title: Geographycally Weighted Random Forest Classification (GWRFC)
 3 | Version: 0.9.1
 4 | Description: Commonly, spatial data requires non-stationary modelling to capture relationships between variables. This has been satisfied by approaches such as the Geographically Weighted Regression (GWR; Fotheringham, Charlton, and Brunsdon 1998), which uses a moving window weightening technique to apply a linear regression model and determine variables effects. However, such approach has been criticized due to its sensitivity to multicollinearity and noisy data (Wheeler 2007). To overcome these issues, the library GWRFC replaces the linear regression model with the random forest algorithm (RF; Breiman 2001) applying case weights according to the weightening scheme of GWR in the bagging step of RF. As a result, GWRFC  produces spatial representations of variables importance, classification probabilities and accuracy of RF models at local level. Furthermore, the library GWRFC provides an additional function to cluster its outputs and facilitate their analysis and report.
 5 | Author: Fabian Santos [aut, cre]
 6 | Maintainer: Fabian Santos <fabian_santos_@hotmail.com>
 7 | License: GPL-3
 8 | Encoding: UTF-8
 9 | URL: https://github.com/FSantosCodes; http://rpubs.com/schwarzervogel10/526484
10 | Imports:
11 |   caret,
12 |   digest,
13 |   doParallel,
14 |   foreach,
15 |   foreign,
16 |   fpc,
17 |   ggplot2,
18 |   gtools,
19 |   GWmodel,
20 |   gridExtra,
21 |   iterators,
22 |   jpeg,
23 |   kohonen,
24 |   NbClust,
25 |   mclust,
26 |   parallel,
27 |   pdp,
28 |   plyr,
29 |   pracma,
30 |   ranger,
31 |   raster,
32 |   reshape,
33 |   rgdal,
34 |   rgeos,
35 |   scales,
36 |   spdep,
37 |   spgwr,
38 |   stringr,
39 |   zoo
40 | Packaged: 2019-01-07
41 | Repository: GitHub
42 | Publication: 2019-01-07
43 | RoxygenNote: 7.1.0
44 | LazyData: true
45 | Depends: 
46 |     R (>= 2.10)
47 | 


--------------------------------------------------------------------------------
/GWRFC.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: ISO8859-1
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | exportPattern("^[[:alpha:]]+")
 2 | import(caret)
 3 | import(digest)
 4 | import(doParallel)
 5 | importFrom(fpc,calinhara)
 6 | import(foreach)
 7 | import(foreign)
 8 | import(ggplot2)
 9 | importFrom(GWmodel,gw.dist)
10 | importFrom(gtools,mixedsort)
11 | importFrom(gridExtra,grid.arrange)
12 | import(iterators)
13 | import(jpeg)
14 | import(NbClust)
15 | importFrom(mclust,Mclust)
16 | import(parallel)
17 | importFrom(pdp,partial)
18 | import(plyr)
19 | import(pracma)
20 | import(ranger)
21 | import(raster)
22 | importFrom(reshape,melt)
23 | import(rgdal)
24 | import(rgeos)
25 | import(kohonen)
26 | import(scales)
27 | import(spdep)
28 | import(spgwr)
29 | import(stringr)
30 | import(zoo)
31 | 


--------------------------------------------------------------------------------
/R/GWRFC.R:
--------------------------------------------------------------------------------
  1 | #'@title Geographically weighted Random Forest Classification
  2 | #'@description GWRFC is a software for analyze and explore spatial data. It constructs geographically weighted models (GW; Fotheringham et al. 1998) to train random forest (RF; Breiman 2001) and report local models with partial depende plots (PDP, Greenwell, 2019). Prediction results and accurancy metrics (ACC) are also representated accondingly.
  3 | #'@param input_shapefile string or Spatial-class. Input shapefile with dependent and independent variables.  It can be the filename of the shapefile or an object of class {SpatialPolygonsDataFrame} or {SpatialPointsDataFrame}.
  4 | #'@param remove_columns string. Remove specific variables from \strong{input_shapefile}. Variables are identified by column name. NA ignores column remove.
  5 | #'@param dependent_varName string. Dependent variable name. Must exists at \strong{input_shapefile} and should be categorical (with not more than 20 classes).
  6 | #'@param kernel_function string. Kernel type to apply in GWRFC. It can be: 'gaussian', 'exponential', 'bisquare' or 'tricube'.
  7 | #'@param kernel_adaptative logical. Is the kernel adaptative? otherwise it is considered as fixed (larger processing time).
  8 | #'@param kernel_bandwidth numeric. Defines kernel bandwidth. If \strong{kernel_adaptative} is TRUE, then you should define the number of local observations in the kernel, otherwise you should define a distance to specify kernel bandwidth.
  9 | #'@param upsampling logical. If TRUE, upsampling is applied before random forest training, otherwise it is downsampled. Consider that upsampling is a bit more computing demanding but accuracy is improved.
 10 | #'@param save_models logical. If TRUE, random forest models are stored at \strong{output_folder} as a RDS file. Beware it can be large, therefore storage requires hard drive memory and can slow down algorithm exit.
 11 | #'@param enable_pdp logical. --EXPERIMENTAL-- If TRUE, partial dependence plots YHAT maximun, together with its correspondent independent variable value (PDP) are calculated.
 12 | #'@param number_cores numeric. Number of cores for parallel processing. Cores are register and operated via doParallel, foreach and parallel packages. Be careful with increasing numbers of cores, as RAM memory may be not enough.
 13 | #'@param output_folder string. Output folder where GWRFC outputs will be stored.
 14 | #'@return As a result, four shapefiles are created whose prefixes refer to: \enumerate{
 15 | #'                            \item LVI: Local variables importance. Calculated via permutation for each variable.
 16 | #'                            \item PDP: Independent variables local maxima (class or value). Identified when YHAT reach its maximum during RF model marginalization. Calculated for each variable.
 17 | #'                            \item YHAT: Prediction result for \strong{dependent_varName} when PDP local maxima is applied. Calculated for each variable.
 18 | #'                            \item ACC: Prediction and accuracies: predicted class, kappa from Out-of-Bag, classes probabilities and prediction failures.
 19 | #'                           }
 20 | #'        In all shapefiles cases, a column called 'ID_row' refers to rownames of \strong{input_shapefile}. In addition, processing evolution can be monitored at \strong{output_folder} as: data_progress.txt
 21 | #'@examples
 22 | #'
 23 | #'#view deforestation data
 24 | #'
 25 | #'data("deforestation")
 26 | #'tmap_mode("view")
 27 | #'tm_basemap("OpenStreetMap") +
 28 | #'  tm_shape(deforestation) +
 29 | #'  tm_polygons(col="fao",style="cat",title="Annual deforestation rate  2000-2010 (FAO) - categorical (quantiles)",palette="YlOrRd")
 30 | #'
 31 | #'#run GWRFC
 32 | #'
 33 | #'GWRFC(input_shapefile = deforestation, #can be a spatial dataframe (points or polygons) or the complete filename of the shapefile to analyze.
 34 | #'      remove_columns = c("ID_grid","L_oth"), #for remove variables if they are not informative. Put NA to avoid removal.
 35 | #'      dependent_varName = "fao", #the depedent variable to evaluate. It should be of factor or character data type.
 36 | #'      kernel_function = "exponential", #the weightening function. See help for other available functions.
 37 | #'      kernel_adaptative = T, #use TRUE for adaptative kernel distance or FALSE for a fixed kernel distance.
 38 | #'      kernel_bandwidth = 400, #as the kernel is adaptative, 400 refers to the minimun number of observations to use in modelling.
 39 | #'      upsampling = T, #improves accuracy (recommended) but is a bit more computing costly.
 40 | #'      save_models = T, #save RF models. Beware of hard disk space and extra processing time.
 41 | #'      enable_pdp = F, #experimental, use with caution as is sensible to noise.
 42 | #'      number_cores = 3, #defines the number of CPU cores to use
 43 | #'      output_folder = "E:/demo/deforestation") #check this folder for GWRFC outputs.
 44 | #'
 45 | #'@export
 46 | 
 47 | GWRFC <- function(
 48 |   input_shapefile,
 49 |   remove_columns = NA,
 50 |   dependent_varName,
 51 |   kernel_function = "exponential",
 52 |   kernel_adaptative = T,
 53 |   kernel_bandwidth,
 54 |   upsampling = T,
 55 |   save_models = F,
 56 |   enable_pdp = F,
 57 |   number_cores = 1,
 58 |   output_folder
 59 | ){
 60 | 
 61 |   ##### PREPARE DATA #####
 62 | 
 63 |   print("Reading data...")
 64 | 
 65 |   #random
 66 |   set.seed(666)
 67 |   #folder
 68 |   dir.create(output_folder,showWarnings = F, recursive = T)
 69 |   if(file.exists(paste0(output_folder,"/data_progress.txt"))){
 70 |     unlink(paste0(output_folder,"/data_progress.txt"),recursive = T, force = T)
 71 |   }
 72 |   #read shp
 73 |   if(class(input_shapefile)=="SpatialPolygonsDataFrame"|class(input_shapefile)=="SpatialPointsDataFrame"){
 74 |     model.shp <- input_shapefile
 75 |   }else{
 76 |     model.shp <- shapefile(input_shapefile)
 77 |   }
 78 |   #check logical structure
 79 |   if(length(model.shp)<= kernel_bandwidth){
 80 |     stop("kernel_bandwidth too large for input_shapefile features number")
 81 |   }
 82 |   #assign rownames
 83 |   rownames(model.shp@data) <- 1:length(model.shp)
 84 |   #remove columns?
 85 |   if(!is.na(remove_columns)[1]){
 86 |     if(length(grep(paste(remove_columns,collapse="|"),names(model.shp))) != 0){
 87 |       model.shp <- model.shp[,!names(model.shp) %in% remove_columns]
 88 |     }else{
 89 |       stop("remove_columns not found at input_shapefile. Verify its names.")
 90 |     }
 91 |   }
 92 |   #test + get dependent column
 93 |   model.dep <- grep(paste0("^",dependent_varName,"$"),names(model.shp))
 94 |   if(length(model.dep)==0){
 95 |     stop("dependent_varName not found")
 96 |   }else if(length(model.dep)>=2){
 97 |     stop("Found two or more column names at input_shapefile for the specified dependent_varName. Rename it.")
 98 |   }else if(length(unique(model.shp@data[,model.dep]))>=21){
 99 |     warning(paste0("dependent_varName has ",length(unique(model.shp@data[,model.dep])),
100 |                    " classes. Procede with caution or reduce them into around 10 interpretable classes"))
101 |   }
102 |   #get independent columns
103 |   model.ind <- names(model.shp)[!grepl(dependent_varName,names(model.shp))]
104 |   model.ind <- grep(paste(model.ind,collapse="|"),names(model.shp))
105 |   #put as factor
106 |   model.shp <- model.shp[,c(model.dep,model.ind)]
107 |   fac.vars <- sapply(model.shp@data,class)=="character"
108 |   if(any(fac.vars)){
109 |     model.shp@data[fac.vars] <- as.data.frame(lapply(model.shp@data[fac.vars],as.factor))
110 |   }
111 |   #complete cases
112 |   pos.NA <- which(!complete.cases(model.shp@data))
113 |   if(length(pos.NA)!=0){
114 |     model.shp <- model.shp[which(complete.cases(model.shp@data)),]
115 |     warning(paste0("input_shapefile has ",length(pos.NA)," incomplete case(s). Removing it/them..."))
116 |   }
117 |   #number and names of classses
118 |   dep.len <- nlevels(model.shp@data[,1])
119 |   dep.nam <- levels(model.shp@data[,1])
120 | 
121 |   ##### FUNCTIONS ####
122 | 
123 |   save.shp <- function(x,outN){
124 |     #merge
125 |     gwc.data <- lapply(gwc.extract,"[[",x)
126 |     gwc.data <- do.call("rbind.data.frame",gwc.data)
127 |     gwc.data$ID_row <- rownames(model.shp@data)
128 |     gwc.data <- gwc.data[,c(ncol(gwc.data),1:(ncol(gwc.data)-1))]
129 |     #save
130 |     output.shp <- model.shp
131 |     output.shp@data <- gwc.data
132 |     output.name <- paste0(output_folder,"/GWRFC_",
133 |                           ifelse(kernel_adaptative,"ADP_","FIX_"),
134 |                           kernel_bandwidth,"_",
135 |                           conv.name(kernel_function),
136 |                           paste0("_",outN,".shp"))
137 |     shapefile(output.shp,output.name,overwrite=T)
138 |     print(paste0(output.name," * stored sucessfully!"))
139 |   }
140 |   conv.name <- function(x){
141 |     if(x=='gaussian'){
142 |       x <- "GA"
143 |     }else if(x=='exponential'){
144 |       x <- "EX"
145 |     }else if(x=='bisquare'){
146 |       x <- "BI"
147 |     }else if(x=='bisquare'){
148 |       x <- "TR"
149 |     }
150 |     return(x)
151 |   }
152 |   zero.variance <- function(x){
153 |     nzv <- caret::nearZeroVar(x[-1], saveMetrics= TRUE)
154 |     nzv.0 <- rownames(nzv)[nzv$zeroVar]
155 |     if(length(nzv.0)!=0){
156 |       x <- x[,!names(x) %in% nzv.0]
157 |     }
158 |     if(length(nzv)!=0){
159 |       x <- x[,!names(x) %in% nzv]
160 |     }
161 |     return(x)
162 |   }
163 |   corrupted.cases <- function(){
164 |     cell.vars <- sort(names(model.shp@data[,2:ncol(model.shp@data)]))
165 |     #lvi
166 |     cell.lvi <- as.data.frame(matrix(nrow=1,ncol=length(cell.vars)))
167 |     names(cell.lvi) <- cell.vars
168 |     #yhat
169 |     cell.yhat <- as.data.frame(matrix(nrow=1,ncol=length(cell.vars)))
170 |     names(cell.yhat) <- cell.vars
171 |     #pdp
172 |     cell.pdp <- as.data.frame(matrix(nrow=1,ncol=length(cell.vars)))
173 |     names(cell.pdp) <- cell.vars
174 |     #acc
175 |     cell.acc <- as.data.frame(matrix(nrow=1,ncol=length(levels(model.shp@data[,1]))+4))
176 |     names(cell.acc) <- c("DEP","PRED",paste0("P_",levels(model.shp@data[,1])),"FAIL","KAPPA")
177 |     #rf model
178 |     cell.rf <- NA
179 |     #out
180 |     cell.out <- list(cell.lvi,cell.yhat,cell.pdp,cell.acc,cell.rf)
181 |     return(cell.out)
182 |   }
183 |   get.probs <- function(){
184 |     x <- as.data.frame(predict(cell.rf,cell.i[,2:(ncol(cell.i)-1)])$predictions)
185 |     x.pred <- names(x[,which.max(x),drop=F])
186 |     x.prob <- x[,grep(cell.out$DEP,names(x))]
187 |     return(list(x.pred,x.prob))
188 |   }
189 |   get.probsClass <- function(){
190 |     x <- as.data.frame(predict(cell.rf,cell.i[,2:(ncol(cell.i)-1)])$predictions)
191 |     x.pred <- names(x[,which.max(x),drop=F])
192 |     return(list(x.pred,x))
193 |   }
194 |   get.kappa <- function(){
195 |     predictions <- factor(apply(cell.rf$predictions,1,function(x){
196 |       x <- names(x)[which.max(x)]
197 |     }),levels=levels(cell.data[,1]))
198 |     conf.m <- caret::confusionMatrix(table(cell.data[,1],predictions))
199 |     return(conf.m$overall[2])
200 |   }
201 | 
202 |   #### GW RANDOM FOREST ####
203 | 
204 |   print("Start processing...")
205 |   #start cluster
206 |   cl <- makeCluster(number_cores)
207 |   registerDoParallel(cl)
208 |   gwc.extract <- foreach(i=1:length(model.shp),
209 |                          .packages=c("ranger","scales","caret","GWmodel","pdp","raster","sp"),
210 |                          .errorhandling="pass") %dopar% {
211 |     #timer
212 |     init.time <- proc.time()
213 |     #order data by distance
214 |     cell.data <- model.shp@data
215 |     cell.data$dist <- GWmodel::gw.dist(dp.locat=coordinates(model.shp),rp.locat=coordinates(model.shp[i,]))[,1]
216 |     if(kernel_adaptative){
217 |       cell.data <- cell.data[order(cell.data$dist)[1:kernel_bandwidth],]
218 |     }else{
219 |       cell.data <- cell.data[cell.data$dist < kernel_bandwidth,]
220 |     }
221 |     #get 'i' observation & rows index
222 |     cell.i <- cell.data[1,]
223 |     cell.pos <- as.numeric(rownames(cell.data))
224 |     #CORRUPTED CASE 1: only one observation
225 |     if(nrow(cell.data)==1){
226 |       cell.out <- corrupted.cases()
227 |     }else{
228 |       #drop unused levels
229 |       cell.data[,1] <- droplevels(cell.data[,1])
230 |       #balance + set levels in classification
231 |       if(upsampling){
232 |         cell.data <- caret::upSample(x=cell.data[,2:ncol(cell.data)],
233 |                                      y=cell.data[,1],
234 |                                      yname=names(cell.data)[1])
235 |       }else{
236 |         cell.data <- caret::downSample(x=cell.data[,2:ncol(cell.data)],
237 |                                        y=cell.data[,1],
238 |                                        yname=names(cell.data)[1])
239 |       }
240 |       cell.data <- cell.data[,c(ncol(cell.data),1:(ncol(cell.data)-1))]
241 |       #distance weights
242 |       cell.weights <- GWmodel::gw.weight(cell.data$dist,
243 |                                          bw=kernel_bandwidth,
244 |                                          kernel=kernel_function,
245 |                                          adaptive=kernel_adaptative)
246 |       #assign dependent + remove zero variance
247 |       cell.data <- cell.data[,1:(ncol(cell.data)-1)]
248 |       cell.vars <- sort(names(cell.data[,2:ncol(cell.data)]))
249 |       cell.data <- cbind(cell.data[,1],zero.variance(cell.data[,-1]))
250 |       names(cell.data)[1] <- dependent_varName
251 |       #apply ranger: FIRST TIME
252 |       cell.formula <- formula(paste0(dependent_varName," ~ ."))
253 |       cell.rf <- ranger::ranger(formula=cell.formula,
254 |                                 data=cell.data,
255 |                                 replace=T,
256 |                                 min.node.size=1,
257 |                                 scale.permutation.importance=T,
258 |                                 case.weights=cell.weights,
259 |                                 probability=T,
260 |                                 importance="permutation")
261 |       #extract important variables
262 |       cell.best <- names(cell.rf$variable.importance)[cell.rf$variable.importance > 0]
263 |       #CORRUPTED CASE 2: null calculation after run
264 |       if(length(cell.best)==0){
265 |         cell.out <- corrupted.cases()
266 |       }else{
267 |         #apply ranger: SECOND TIME
268 |         if(length(cell.best)!=length(cell.vars)){
269 |           cell.formula <- formula(paste0(dependent_varName," ~ ",paste(cell.best,collapse=" + ")))
270 |           cell.rf <- ranger::ranger(formula=cell.formula,
271 |                                     data=cell.data,
272 |                                     replace=T,
273 |                                     min.node.size=1,
274 |                                     scale.permutation.importance=T,
275 |                                     case.weights=cell.weights,
276 |                                     probability=T,
277 |                                     importance="permutation")
278 |         }
279 |         #extract LVI
280 |         cell.lvi <- data.frame(t(matrix(cell.rf$variable.importance)))
281 |         names(cell.lvi) <- names(cell.rf$variable.importance)
282 |         cell.lvi[,cell.lvi < 0] <- 0
283 |         cell.rem <- cell.vars[!cell.vars %in% names(cell.lvi)]
284 |         if(length(cell.rem!=0)){
285 |           cell.val <- as.data.frame(t(matrix(rep(NA,length(cell.rem)))))
286 |           names(cell.val) <- cell.rem
287 |           cell.lvi <- cbind(cell.lvi,cell.val)
288 |         }
289 |         cell.lvi <- cell.lvi[sort(names(cell.lvi))]
290 |         #apply PDP
291 |         if(enable_pdp){
292 |           cell.pdp <- lapply(names(cell.rf$variable.importance),function(x){
293 |             x <- pdp::partial(cell.rf,
294 |                               pred.var = x,
295 |                               grid.resolution = 5,
296 |                               train = cell.data,
297 |                               type = "classification",
298 |                               parallel = F)
299 |             x <- x[which.max(x[,2]),]
300 |             return(x)
301 |           })
302 |           #extract yhat
303 |           cell.yhat <- as.data.frame(lapply(cell.pdp,"[[",2),stringsAsFactors=F)
304 |           names(cell.yhat) <- names(cell.rf$variable.importance)
305 |           if(length(cell.rem!=0)){
306 |             cell.val <- as.data.frame(t(matrix(rep(NA,length(cell.rem)))))
307 |             names(cell.val) <- cell.rem
308 |             cell.yhat <- cbind(cell.yhat,cell.val)
309 |           }
310 |           cell.yhat <- cell.yhat[sort(names(cell.yhat))]
311 |           #extract max values
312 |           cell.pdp <- as.data.frame(lapply(cell.pdp,"[[",1),stringsAsFactors=F)
313 |           names(cell.pdp) <- names(cell.rf$variable.importance)
314 |           if(length(cell.rem!=0)){
315 |             cell.val <- as.data.frame(t(matrix(rep(NA,length(cell.rem)))))
316 |             names(cell.val) <- cell.rem
317 |             cell.pdp <- cbind(cell.pdp,cell.val)
318 |           }
319 |           cell.pdp <- cell.pdp[sort(names(cell.pdp))]
320 |         }else{
321 |           cell.yhat <- as.data.frame(matrix(nrow=1,ncol=length(cell.vars)))
322 |           names(cell.yhat) <- cell.vars
323 |           cell.pdp <- as.data.frame(matrix(nrow=1,ncol=length(cell.vars)))
324 |           names(cell.pdp) <- cell.vars
325 |         }
326 |         #extract accuracies
327 |         cell.acc <- data.frame(DEP=as.character(cell.i[,1]))
328 |         cell.acc$PRED <- get.probsClass()[[1]]
329 |         cell.acc.prob <- get.probsClass()[[2]]
330 |         if(length(cell.acc.prob)!=dep.len){
331 |           miss.class <- as.data.frame(matrix(rep(0,dep.len),nrow=1,ncol=dep.len))
332 |           names(miss.class) <- dep.nam
333 |           miss.class[which(dep.nam %in% names(cell.acc.prob))] <- cell.acc.prob
334 |           cell.acc.prob <- miss.class
335 |         }
336 |         names(cell.acc.prob) <- paste0("P_", names(cell.acc.prob))
337 |         cell.acc <- cbind(cell.acc,as.data.frame(cell.acc.prob))
338 |         cell.acc$FAIL <- ifelse(cell.acc$DEP!=cell.acc$PRED,"yes","no")
339 |         cell.acc$KAPPA <- get.kappa()
340 |       }
341 |       #close
342 |       cell.out <- list(cell.lvi,cell.yhat,cell.pdp,cell.acc,cell.rf)
343 |     }
344 |     #status text
345 |     init.time <- round(proc.time()-init.time,2)
346 |     cat(paste0(as.character(i)," of ",length(model.shp)," - time elapsed: ",init.time[3],"\n"),
347 |         file=paste0(output_folder,"/data_progress.txt"), append=TRUE)
348 |     #end
349 |     return(cell.out)
350 |   }
351 |   stopCluster(cl)
352 | 
353 |   #### SAVE SHAPEFILES ####
354 | 
355 |   print("Start saving...")
356 | 
357 |   save.shp(1,"LVI")
358 |   if(enable_pdp){
359 |     save.shp(2,"YHAT")
360 |     save.shp(3,"PDP")
361 |   }
362 |   save.shp(4,"ACC")
363 | 
364 |   #### SAVE RF MODELS ####
365 | 
366 |   if(save_models){
367 |     rf.models <- lapply(gwc.extract,"[[",5)
368 |     rf.models <- rf.models[!is.na(rf.models)]
369 |     output.name <- paste0(output_folder,"/GWRFC_",
370 |                           ifelse(kernel_adaptative,"ADP_","FIX_"),
371 |                           kernel_bandwidth,"_",
372 |                           conv.name(kernel_function),
373 |                           paste0("_MODELS.rds"))
374 |     saveRDS(rf.models,output.name)
375 |     print(paste0(output.name," * stored sucessfully!"))
376 |   }
377 | 
378 |   #error warning
379 |   if(length(which(is.na(model.shp@data$PRED)))!=0){
380 |     warning(paste0(length(which(is.na(model.shp@data$PRED)))," observations were not possible to evaluate during Random Forest execution."))
381 |   }
382 |   #end
383 |   print("****GWRFC end sucessfully*****")
384 | }
385 | 


--------------------------------------------------------------------------------
/R/LVIclus.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSantosCodes/GWRFC/407291e21e4db51fd7ef124b95efc5d739b62795/R/LVIclus.R


--------------------------------------------------------------------------------
/R/deforestation.R:
--------------------------------------------------------------------------------
 1 | #' 2000-2010 deforestation rates for the northeastern ecuadorian Amazon.
 2 | #'
 3 | #' This dataset contains the annual deforestation rate acoording FAO (FAO, 2003) from 2000-2010 for 2418 grid cells (each one of 400 ha.). Additionally, 35 variables from landscape, commodities, socioeconomic and sociocultural features are included. Sources are: Instituto Nacional de Estadisticas y Censos (INEC 2001, 2010), Sistema Nacional de Informacion (SNI, 2017), Sistema Nacional de Informacion y Gestion de Tierras Rurales e Infraestructura Tecnologica (SIGTIERRAS, 2015), National Oceanic and Atmospheric Administration (NOAA, 2019) and deforestation maps from Santos et al. 2018.
 4 | #'
 5 | #' @format A SpatialPolygonsDataFrame with 2418 rows and 37 variables:
 6 | #' \describe{
 7 | #'   \item{ID_grid}{an unique numeric identifier for each grid cell}
 8 | #'   \item{fao}{annual deforestation rate (2000-2010, classified as quantiles)}
 9 | #'   \item{A_cao}{Accessibility to coffee and cacao collection centers (minutes ~ scaled 0-1)}
10 | #'   \item{A_fru}{Accessibility to fruits collection centers (minutes ~ scaled 0-1)}
11 | #'   \item{A_mlk}{Accessibility to milk collection centers (minutes ~ scaled 0-1)}
12 | #'   \item{A_plm}{Accessibility to palm oil extraction facilities (minutes ~ scaled 0-1)}
13 | #'   \item{I_min}{Distance to mining blocks assigned between 2000 and 2010 (meters)}
14 | #'   \item{I_ngt}{Stable nightlights trend 2000-2010 (slope)}
15 | #'   \item{I_oil}{Distance to oil wells perforated between 2000 and 2010 (meters)}
16 | #'   \item{B_alt}{Altitude (meters above sea level)}
17 | #'   \item{B_rfl}{Annual rainfall (mm)}
18 | #'   \item{B_fer}{Soil fertility (percentage organic matter)}
19 | #'   \item{C_bsl}{Bare soil (percentage frequency)}
20 | #'   \item{C_fra}{Fractal dimension index (unitless)}
21 | #'   \item{C_pas}{Pasture (percentage frequency)}
22 | #'   \item{C_sze}{Mean patch size (ha)}
23 | #'   \item{D_adt}{Adult population (26-45 yrs)}
24 | #'   \item{D_old}{Older adult population (45-72 yrs)}
25 | #'   \item{D_ygr}{Young population (15-25 yrs)}
26 | #'   \item{E_hgr}{Higher education (>13 yrs)}
27 | #'   \item{E_ilt}{Illiterate}
28 | #'   \item{E_pri}{Primary education (1-6 yrs)}
29 | #'   \item{E_sec}{Secondary education (7-12 yrs)}
30 | #'   \item{G_chf}{Chief female household}
31 | #'   \item{G_chm}{Chief male household}
32 | #'   \item{G_pof}{Female population}
33 | #'   \item{G_pom}{Male population}
34 | #'   \item{H_lar}{Large families (>5 children)}
35 | #'   \item{H_med}{Medium families (3-5 children)}
36 | #'   \item{H_sma}{Small families (1-2 children)}
37 | #'   \item{L_kcw}{Speak Kichwa}
38 | #'   \item{L_oth}{Speak other languages}
39 | #'   \item{L_spa}{Speak Spanish}
40 | #'   \item{L_wao}{Speak Huao Tededo}
41 | #'   \item{W_agr}{Agricultural workers}
42 | #'   \item{W_ind}{Industrial workers}
43 | #'   \item{W_ser}{Service workers}
44 | #'   ...
45 | #' }
46 | #' @source \url{www.sigtierras.gob.ec/; www.sni.gob.ec}
47 | "deforestation"
48 | 


--------------------------------------------------------------------------------
/R/plotJPEG.R:
--------------------------------------------------------------------------------
 1 | #'@title Plot JPEG file using base graphics
 2 | #'@description Internal function to plot a JPEG file in R
 3 | #'@param input_jpeg string. Input filename of JPG picture.
 4 | 
 5 | plotJPEG <- function(input_jpeg,add=FALSE){
 6 |   jpg = readJPEG(input_jpeg, native=T) # read the file
 7 |   res = dim(jpg)[2:1] # get the resolution, [x, y]
 8 |   if (!add) # initialize an empty plot area if add==FALSE
 9 |     plot(1,1,xlim=c(1,res[1]),ylim=c(1,res[2]),asp=1,type='n',xaxs='i',yaxs='i',xaxt='n',yaxt='n',xlab='',ylab='',bty='n')
10 |   rasterImage(jpg,1,1,res[1],res[2])
11 | }
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Geographically weighted Random Forest Classification (GWRFC)
 2 | GWRFC is function that replaces the linear regression model of the Geographically Weighted Regression (GWR; Fotheringham, Charlton, and Brunsdon 1998) with the random forest
 3 | algorithm (RF; Breiman 2001). For this, it applies case weights according to the weightening scheme of GWR in the bagging step of RF. As a result, GWRFC produces spatial
 4 | representations of variables importance, classification probabilities and accuracy of RF models at local level. To improve processing speed, GWRFC uses the ranger package for
 5 | train RF and parallel computing (only available for Windows OS).
 6 | 
 7 | To cite this work, please use: Santos F, Graw V, Bonilla S (2019) A geographically weighted random forest approach for evaluate forest change drivers in the Northern Ecuadorian
 8 | Amazon. PLOS ONE 14(12): e0226224. https://doi.org/10.1371/journal.pone.0226224
 9 | 
10 | To use it with last updates, please run/adapt the code below.
11 | 
12 | ## Installation
13 | 
14 | ```r
15 | #required libraries
16 | list.of.packages <- c("caret","digest","doParallel","foreach","foreign","fpc","ggplot2","gtools","GWmodel","jpeg","kohonen","mclust","NbClust","parallel","plyr","pracma","ranger","raster","reshape","raster",
17 |                       "rgdal","rgeos","scales","spdep","spgwr","stringr","tmap","zoo")
18 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
19 | if(length(new.packages)){install.packages(new.packages)}
20 | lapply(list.of.packages, require, character.only = TRUE)
21 | 
22 | #install GWRFC
23 | require(devtools)
24 | install_github("FSantosCodes/GWRFC")
25 | library(GWRFC)
26 | ```
27 | 
28 | ## Call example data
29 | 
30 | ```r
31 | #view deforestation data
32 | data("deforestation")
33 | tmap_mode("view")
34 | tm_basemap("OpenStreetMap") +
35 |   tm_shape(deforestation) +
36 |   tm_polygons(col="fao",style="cat",title="Annual deforestation rate  2000-2010 (FAO) - categorical (quantiles)",palette="YlOrRd")
37 | ```
38 | 
39 | ## Apply GWRFC
40 | 
41 | ```r
42 | #run GWRFC
43 | GWRFC(input_shapefile = deforestation, #can be a spatial dataframe (points or polygons) or the complete filename of the shapefile to analyze.
44 |       remove_columns = c("ID_grid","L_oth"), #for remove variables if they are not informative. Put NA to avoid removal.
45 |       dependent_varName = "fao", #the depedent variable to evaluate. It should be of factor or character data type.
46 |       kernel_function = "exponential", #the weightening function. See help for other available functions.
47 |       kernel_adaptative = T, #use TRUE for adaptative kernel distance or FALSE for a fixed kernel distance.
48 |       kernel_bandwidth = 400, #as the kernel is adaptative, 400 refers to the minimun number of observations to use in modelling.
49 |       upsampling = T, #improves accuracy (recommended) but is a bit more computing costly.
50 |       save_models = T, #save RF models. Beware of hard disk space and extra processing time.
51 |       enable_pdp = F, #experimental, use with caution as is sensible to noise.
52 |       number_cores = 3, #defines the number of CPU cores to use
53 |       output_folder = "E:/demo/deforestation") #check this folder for GWRFC outputs.
54 | ```
55 | 
56 | ## Cluster LVI
57 | 
58 | ```r
59 | #clustering GWRFC LVI outputs
60 | LVIclust(input_LVI = "E:/demo/deforestation/GWRFC_ADP_400_EX_LVI.shp", #filename of the GWRFC LVI output
61 |          remove_columns=NA,
62 |          method_clustering="ward.D2", #hierarchical clustering is applied here.
63 |          ncluster = 4, #number of clusters.
64 |          plots=T, #available only for all hierarchical clustering methods and kohonen.
65 |          output_folder = "E:/demo/deforestation") #check this folder for outputs generated by the function.
66 | ```
67 | 


--------------------------------------------------------------------------------
/data/deforestation.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSantosCodes/GWRFC/407291e21e4db51fd7ef124b95efc5d739b62795/data/deforestation.rda


--------------------------------------------------------------------------------
/man/GWRFC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/GWRFC.R
 3 | \name{GWRFC}
 4 | \alias{GWRFC}
 5 | \title{Geographically weighted Random Forest Classification}
 6 | \usage{
 7 | GWRFC(
 8 |   input_shapefile,
 9 |   remove_columns = NA,
10 |   dependent_varName,
11 |   kernel_function = "exponential",
12 |   kernel_adaptative = T,
13 |   kernel_bandwidth,
14 |   upsampling = T,
15 |   save_models = F,
16 |   enable_pdp = F,
17 |   number_cores = 1,
18 |   output_folder
19 | )
20 | }
21 | \arguments{
22 | \item{input_shapefile}{string or Spatial-class. Input shapefile with dependent and independent variables.  It can be the filename of the shapefile or an object of class {SpatialPolygonsDataFrame} or {SpatialPointsDataFrame}.}
23 | 
24 | \item{remove_columns}{string. Remove specific variables from \strong{input_shapefile}. Variables are identified by column name. NA ignores column remove.}
25 | 
26 | \item{dependent_varName}{string. Dependent variable name. Must exists at \strong{input_shapefile} and should be categorical (with not more than 20 classes).}
27 | 
28 | \item{kernel_function}{string. Kernel type to apply in GWRFC. It can be: 'gaussian', 'exponential', 'bisquare' or 'tricube'.}
29 | 
30 | \item{kernel_adaptative}{logical. Is the kernel adaptative? otherwise it is considered as fixed (larger processing time).}
31 | 
32 | \item{kernel_bandwidth}{numeric. Defines kernel bandwidth. If \strong{kernel_adaptative} is TRUE, then you should define the number of local observations in the kernel, otherwise you should define a distance to specify kernel bandwidth.}
33 | 
34 | \item{upsampling}{logical. If TRUE, upsampling is applied before random forest training, otherwise it is downsampled. Consider that upsampling is a bit more computing demanding but accuracy is improved.}
35 | 
36 | \item{save_models}{logical. If TRUE, random forest models are stored at \strong{output_folder} as a RDS file. Beware it can be large, therefore storage requires hard drive memory and can slow down algorithm exit.}
37 | 
38 | \item{enable_pdp}{logical. --EXPERIMENTAL-- If TRUE, partial dependence plots YHAT maximun, together with its correspondent independent variable value (PDP) are calculated.}
39 | 
40 | \item{number_cores}{numeric. Number of cores for parallel processing. Cores are register and operated via doParallel, foreach and parallel packages. Be careful with increasing numbers of cores, as RAM memory may be not enough.}
41 | 
42 | \item{output_folder}{string. Output folder where GWRFC outputs will be stored.}
43 | }
44 | \value{
45 | As a result, four shapefiles are created whose prefixes refer to: \enumerate{
46 |                            \item LVI: Local variables importance. Calculated via permutation for each variable.
47 |                            \item PDP: Independent variables local maxima (class or value). Identified when YHAT reach its maximum during RF model marginalization. Calculated for each variable.
48 |                            \item YHAT: Prediction result for \strong{dependent_varName} when PDP local maxima is applied. Calculated for each variable.
49 |                            \item ACC: Prediction and accuracies: predicted class, kappa from Out-of-Bag, classes probabilities and prediction failures.
50 |                           }
51 |        In all shapefiles cases, a column called 'ID_row' refers to rownames of \strong{input_shapefile}. In addition, processing evolution can be monitored at \strong{output_folder} as: data_progress.txt
52 | }
53 | \description{
54 | GWRFC is a software for analyze and explore spatial data. It constructs geographically weighted models (GW; Fotheringham et al. 1998) to train random forest (RF; Breiman 2001) and report local models with partial depende plots (PDP, Greenwell, 2019). Prediction results and accurancy metrics (ACC) are also representated accondingly.
55 | }
56 | \examples{
57 | 
58 | #view deforestation data
59 | 
60 | data("deforestation")
61 | tmap_mode("view")
62 | tm_basemap("OpenStreetMap") +
63 |  tm_shape(deforestation) +
64 |  tm_polygons(col="fao",style="cat",title="Annual deforestation rate  2000-2010 (FAO) - categorical (quantiles)",palette="YlOrRd")
65 | 
66 | #run GWRFC
67 | 
68 | GWRFC(input_shapefile = deforestation, #can be a spatial dataframe (points or polygons) or the complete filename of the shapefile to analyze.
69 |      remove_columns = c("ID_grid","L_oth"), #for remove variables if they are not informative. Put NA to avoid removal.
70 |      dependent_varName = "fao", #the depedent variable to evaluate. It should be of factor or character data type.
71 |      kernel_function = "exponential", #the weightening function. See help for other available functions.
72 |      kernel_adaptative = T, #use TRUE for adaptative kernel distance or FALSE for a fixed kernel distance.
73 |      kernel_bandwidth = 400, #as the kernel is adaptative, 400 refers to the minimun number of observations to use in modelling.
74 |      upsampling = T, #improves accuracy (recommended) but is a bit more computing costly.
75 |      save_models = T, #save RF models. Beware of hard disk space and extra processing time.
76 |      enable_pdp = F, #experimental, use with caution as is sensible to noise.
77 |      number_cores = 3, #defines the number of CPU cores to use
78 |      output_folder = "E:/demo/deforestation") #check this folder for GWRFC outputs.
79 | 
80 | }
81 | 


--------------------------------------------------------------------------------
/man/LVIclust.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/LVIclus.R
 3 | \name{LVIclust}
 4 | \alias{LVIclust}
 5 | \title{Local variables importance (LVI) clustering from GWRFC outputs}
 6 | \usage{
 7 | LVIclust(
 8 |   input_LVI,
 9 |   remove_columns = NA,
10 |   method_clustering = "ward.D2",
11 |   ncluster = 2,
12 |   plots = T,
13 |   output_folder
14 | )
15 | }
16 | \arguments{
17 | \item{input_LVI}{string or Spatial-class. Input shapefile of GWRFC LVI output. It can be the filename of the shapefile or an object of class SpatialPolygonsDataFrame or SpatialPointsDataFrame.}
18 | 
19 | \item{remove_columns}{string. Remove specific variables from \strong{input_LVI}. Variables are identified by column name. NA ignores column remove. By default, column "ID_row" is removed.}
20 | 
21 | \item{method_clustering}{string. A method to use for clustering. It can be:"ward.D","ward.D2","single","complete","average","mcquitty","median", "centroid", "mclust" or "SOM". The latter, is calculated with the kohonen library.}
22 | 
23 | \item{ncluster}{numeric. Number of clusters to apply. Should be more than 2.}
24 | 
25 | \item{plots}{logical. If TRUE, plots from clustering libraries are generated and stored at \strong{output_folder} (except for mclust).}
26 | 
27 | \item{output_folder}{string. Output folder where LVIclust outputs will be stored.}
28 | }
29 | \value{
30 | The output of this function is a shapefile with the resulting clusters and its plot if it was specified.
31 | }
32 | \description{
33 | This function clusters local variables importance (LVI) output with different methods, including: Gaussian Mixture Modelling (mclust), Kohonen's Self-Organizing Maps (kohonen), or hierarchical clustering (hclust)
34 | }
35 | \examples{
36 | 
37 | #based in the example showed with the execution of GWRFC
38 | 
39 | LVIclust(input_LVI = "E:/demo/deforestation/GWRFC_ADP_400_EX_LVI.shp", #filename of the GWRFC LVI output
40 | remove_columns=NA,
41 | method_clustering="mclust", #hierarchical clustering is applied here.
42 | ncluster = 2, #number of clusters.
43 | plots=T, #available only for all hierarchical clustering methods and kohonen.
44 | output_folder = "E:/demo/deforestation") #check this folder for outputs generated by the function.
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/man/deforestation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deforestation.R
 3 | \docType{data}
 4 | \name{deforestation}
 5 | \alias{deforestation}
 6 | \title{2000-2010 deforestation rates for the northeastern ecuadorian Amazon.}
 7 | \format{
 8 | A SpatialPolygonsDataFrame with 2418 rows and 37 variables:
 9 | \describe{
10 |   \item{ID_grid}{an unique numeric identifier for each grid cell}
11 |   \item{fao}{annual deforestation rate (2000-2010, classified as quantiles)}
12 |   \item{A_cao}{Accessibility to coffee and cacao collection centers (minutes ~ scaled 0-1)}
13 |   \item{A_fru}{Accessibility to fruits collection centers (minutes ~ scaled 0-1)}
14 |   \item{A_mlk}{Accessibility to milk collection centers (minutes ~ scaled 0-1)}
15 |   \item{A_plm}{Accessibility to palm oil extraction facilities (minutes ~ scaled 0-1)}
16 |   \item{I_min}{Distance to mining blocks assigned between 2000 and 2010 (meters)}
17 |   \item{I_ngt}{Stable nightlights trend 2000-2010 (slope)}
18 |   \item{I_oil}{Distance to oil wells perforated between 2000 and 2010 (meters)}
19 |   \item{B_alt}{Altitude (meters above sea level)}
20 |   \item{B_rfl}{Annual rainfall (mm)}
21 |   \item{B_fer}{Soil fertility (percentage organic matter)}
22 |   \item{C_bsl}{Bare soil (percentage frequency)}
23 |   \item{C_fra}{Fractal dimension index (unitless)}
24 |   \item{C_pas}{Pasture (percentage frequency)}
25 |   \item{C_sze}{Mean patch size (ha)}
26 |   \item{D_adt}{Adult population (26-45 yrs)}
27 |   \item{D_old}{Older adult population (45-72 yrs)}
28 |   \item{D_ygr}{Young population (15-25 yrs)}
29 |   \item{E_hgr}{Higher education (>13 yrs)}
30 |   \item{E_ilt}{Illiterate}
31 |   \item{E_pri}{Primary education (1-6 yrs)}
32 |   \item{E_sec}{Secondary education (7-12 yrs)}
33 |   \item{G_chf}{Chief female household}
34 |   \item{G_chm}{Chief male household}
35 |   \item{G_pof}{Female population}
36 |   \item{G_pom}{Male population}
37 |   \item{H_lar}{Large families (>5 children)}
38 |   \item{H_med}{Medium families (3-5 children)}
39 |   \item{H_sma}{Small families (1-2 children)}
40 |   \item{L_kcw}{Speak Kichwa}
41 |   \item{L_oth}{Speak other languages}
42 |   \item{L_spa}{Speak Spanish}
43 |   \item{L_wao}{Speak Huao Tededo}
44 |   \item{W_agr}{Agricultural workers}
45 |   \item{W_ind}{Industrial workers}
46 |   \item{W_ser}{Service workers}
47 |   ...
48 | }
49 | }
50 | \source{
51 | \url{www.sigtierras.gob.ec/; www.sni.gob.ec}
52 | }
53 | \usage{
54 | deforestation
55 | }
56 | \description{
57 | This dataset contains the annual deforestation rate acoording FAO (FAO, 2003) from 2000-2010 for 2418 grid cells (each one of 400 ha.). Additionally, 35 variables from landscape, commodities, socioeconomic and sociocultural features are included. Sources are: Instituto Nacional de Estadisticas y Censos (INEC 2001, 2010), Sistema Nacional de Informacion (SNI, 2017), Sistema Nacional de Informacion y Gestion de Tierras Rurales e Infraestructura Tecnologica (SIGTIERRAS, 2015), National Oceanic and Atmospheric Administration (NOAA, 2019) and deforestation maps from Santos et al. 2018.
58 | }
59 | \keyword{datasets}
60 | 


--------------------------------------------------------------------------------
/man/plotJPEG.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotJPEG.R
 3 | \name{plotJPEG}
 4 | \alias{plotJPEG}
 5 | \title{Plot JPEG file using base graphics}
 6 | \usage{
 7 | plotJPEG(input_jpeg, add = FALSE)
 8 | }
 9 | \arguments{
10 | \item{input_jpeg}{string. Input filename of JPG picture.}
11 | }
12 | \description{
13 | Internal function to plot a JPEG file in R
14 | }
15 | 


--------------------------------------------------------------------------------