├── data ├── datalist └── Income.rdata ├── NAMESPACE ├── MD5 ├── R ├── random.test.data.R ├── predict.grf.R ├── rf.mtry.optim.R ├── grf.bw.R └── grf.R ├── DESCRIPTION └── man ├── random.test.data.Rd ├── rf.mtry.optim.Rd ├── Income.Rd ├── predict.grf.Rd ├── grf.Rd └── grf.bw.Rd /data/datalist: -------------------------------------------------------------------------------- 1 | Income: Income 2 | -------------------------------------------------------------------------------- /data/Income.rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cran/SpatialML/HEAD/data/Income.rdata -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | importFrom("ranger", ranger) 3 | importFrom("caret", postResample, trainControl, train) 4 | importFrom("stats", "dist", "predict", "sd", "setNames", "terms", "rpois", "runif") 5 | import(randomForest) 6 | S3method(predict, grf) 7 | -------------------------------------------------------------------------------- /MD5: -------------------------------------------------------------------------------- 1 | b18754e97fc41eb296f6dc335b002573 *DESCRIPTION 2 | 45ddcc712503c9f957a34fa02862608d *NAMESPACE 3 | 9f0f9340ec41f53a7379bc11a699e497 *R/grf.R 4 | 337dde360c5fcbf5230120b58cbeeb1f *R/grf.bw.R 5 | ca3698ff5ff91242ea6d10820660c19e *R/predict.grf.R 6 | f7127bd35d6fb7a886c8ba5fdc31942f *R/random.test.data.R 7 | 9dbbe6cbfacc4a010eeadc5a9e1bc751 *R/rf.mtry.optim.R 8 | 58731a784e7721a6410fcf9a6b9ffac9 *data/Income.rdata 9 | 3fe4fb93b20dc0f99db4a04b7fda15e6 *data/datalist 10 | 4b5bb52a520626f61e926c7ff3ce2a0c *man/Income.Rd 11 | 91c3cdf41b32ba1850e615f1a10a8ed3 *man/grf.Rd 12 | 116a28a2882c1b4aca99ce918763c6da *man/grf.bw.Rd 13 | 530ff44351b2a228fae9c4ba511c93c7 *man/predict.grf.Rd 14 | 06938780dacd8432957b6a4d4e0f65d5 *man/random.test.data.Rd 15 | 2b8cd5f02542b339d82b0609344f6f13 *man/rf.mtry.optim.Rd 16 | -------------------------------------------------------------------------------- /R/random.test.data.R: -------------------------------------------------------------------------------- 1 | random.test.data <- function(nrows=10, ncols=10, vars.no=3, dep.var.dis="normal", xycoords=TRUE){ 2 | 3 | obs.no <- nrows * ncols 4 | 5 | if (dep.var.dis=="normal") { 6 | dep <- runif(obs.no) 7 | } 8 | if (dep.var.dis == "poisson"){ 9 | dep <- rpois(obs.no, lambda = 7) 10 | } 11 | 12 | if (xycoords == TRUE){ 13 | X <- rep(1:nrows, each=ncols) 14 | Y <- rep(1:ncols, nrows) 15 | } 16 | 17 | vars<-matrix(data=NA, nrow=obs.no, ncol=vars.no-1) 18 | 19 | for (i in 1:vars.no-1) { 20 | vars[,i] <- runif(obs.no) 21 | } 22 | 23 | if (xycoords == TRUE){ 24 | random.df <- data.frame(dep=dep, vars, X=X, Y=Y) 25 | } else { 26 | random.df <- data.frame(dep=dep, vars) 27 | } 28 | return(random.df) 29 | } 30 | -------------------------------------------------------------------------------- /R/predict.grf.R: -------------------------------------------------------------------------------- 1 | #object = local Forests 2 | predict.grf <- function(object, new.data, x.var.name, y.var.name, local.w=1, global.w=0, ...) { 3 | 4 | Obs <- nrow(new.data) 5 | 6 | predictions <- vector(mode="numeric", length=Obs) 7 | 8 | for(i in 1:Obs){ 9 | 10 | x <- new.data[i, which(names(new.data)==x.var.name)] 11 | y <- new.data[i, which(names(new.data)==y.var.name)] 12 | 13 | locations <- object$Locations 14 | 15 | D <- sqrt((x-locations[,1])^2 + (y-locations[,2])^2) 16 | 17 | local.model.ID <- which.min(D) 18 | 19 | g.predict <- predict(object[[1]], new.data[i,], ...) 20 | g.prediction <- g.predict$predictions 21 | l.predict <- predict(object$Forests[[local.model.ID]], new.data[i,]) 22 | l.prediction <- l.predict$predictions 23 | 24 | predictions[i] <- global.w * g.prediction[1] + local.w * l.prediction[1] 25 | } 26 | return(predictions) 27 | } 28 | -------------------------------------------------------------------------------- /R/rf.mtry.optim.R: -------------------------------------------------------------------------------- 1 | rf.mtry.optim <- function(formula, dataset, min.mtry=NULL, max.mtry=NULL, mtry.step=1, cv.method="repeatedcv", cv.folds=10, ...) { 2 | 3 | f <- formula(formula) 4 | 5 | RNames <- attr(terms(f), "term.labels") 6 | 7 | ModelVarNo <- length(RNames) 8 | 9 | if (is.null(min.mtry)) {min.mtry <- 1} 10 | if (is.null(max.mtry)) {max.mtry <- ModelVarNo} 11 | 12 | if (cv.method == "repeatedcv") { 13 | control <- trainControl(cv.method, repeats=5, number=cv.folds, search="grid", ...) 14 | } else if (cv.method == "cv") { 15 | control <- trainControl(number=cv.folds, cv.method, search="grid", ...) 16 | } else { 17 | control <- trainControl(number=cv.folds, cv.method, search="grid", ...) 18 | } 19 | 20 | set.seed(123) 21 | 22 | tunegrid <- expand.grid(.mtry=seq(from=min.mtry, to=max.mtry, by=mtry.step)) 23 | 24 | rf_gridsearch <- train(formula, data= dataset, method="rf", tuneGrid=tunegrid, trControl=control) 25 | 26 | print(rf_gridsearch) 27 | 28 | plot(rf_gridsearch) 29 | 30 | return(rf_gridsearch) 31 | } 32 | 33 | 34 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: SpatialML 2 | Version: 0.1.7 3 | Date: 2024-04-02 4 | Type: Package 5 | Title: Spatial Machine Learning 6 | Authors@R: c( 7 | person("Stamatis", "Kalogirou", email = "stamatis.science@gmail.com", role = c("aut", "cre")), 8 | person("Stefanos", "Georganos", email = "stefanos.georganos@kau.se", role = c("aut", "ctb"))) 9 | Depends: R (>= 4.3.0), ranger (>= 0.15.1), caret (>= 6.0), randomForest 10 | (>= 4.7) 11 | Description: Implements a spatial extension of the random forest algorithm 12 | (Georganos et al. (2019) ). 13 | Allows for a geographically weighted random forest regression 14 | including a function to find the optical bandwidth. (Georganos 15 | and Kalogirou (2022) ). 16 | License: GPL (>= 2) 17 | Encoding: UTF-8 18 | LazyData: true 19 | URL: https://stamatisgeoai.eu/ 20 | NeedsCompilation: no 21 | Packaged: 2024-04-01 23:20:42 UTC; Stamatis 22 | Author: Stamatis Kalogirou [aut, cre], 23 | Stefanos Georganos [aut, ctb] 24 | Maintainer: Stamatis Kalogirou 25 | Repository: CRAN 26 | Date/Publication: 2024-04-02 00:00:02 UTC 27 | -------------------------------------------------------------------------------- /man/random.test.data.Rd: -------------------------------------------------------------------------------- 1 | \name{random.test.data} 2 | \alias{random.test.data} 3 | \title{Radmom data generator} 4 | \description{ 5 | Generates datasets with random data for modelling including a dependent variable, independent variables and X,Y coordinates. 6 | } 7 | \usage{ 8 | random.test.data(nrows = 10, ncols = 10, vars.no = 3, dep.var.dis = "normal", 9 | xycoords = TRUE) 10 | } 11 | \arguments{ 12 | \item{nrows}{an integer referring to the number of rows for a regular grid} 13 | \item{ncols}{an integer referring to the number of columns for a regular grid} 14 | \item{vars.no}{an integer referring to the number of independent variables} 15 | \item{dep.var.dis}{a character referring to the distribution of the dependent variable. Options are "normal" (default) and "poisson"} 16 | \item{xycoords}{a logical value indicating whether X,Y coordinates will be created (default) or not.} 17 | } 18 | \details{The creation of a random dataset was necessary here to provide examples to some functions. However, random datasets may be used in simulation studies.} 19 | 20 | \value{a dataframe} 21 | 22 | \author{Stamatis Kalogirou } 23 | 24 | \examples{ 25 | RDF <- random.test.data(12,12,3) 26 | } 27 | \keyword{random data} 28 | -------------------------------------------------------------------------------- /R/grf.bw.R: -------------------------------------------------------------------------------- 1 | grf.bw <- function(formula, dataset, kernel="adaptive", coords, bw.min = NULL, bw.max = NULL, step = 1, trees=500, mtry=NULL, importance="impurity", nthreads = 1, forests = FALSE, geo.weighted = TRUE, ...) { 2 | 3 | 4 | if (!is.data.frame(dataset)) { 5 | stop("Error: input dataset must be a data frame") 6 | } 7 | 8 | 9 | Obs <- nrow(dataset) 10 | 11 | f <- formula(formula) 12 | RNames <- attr(terms(f), "term.labels") 13 | ModelVarNo <- length(RNames) 14 | 15 | DepVarName <- row.names(attr(terms(f), "factors"))[1] 16 | Y.DF <- dataset[DepVarName] 17 | Y <- Y.DF[[1]] 18 | 19 | 20 | 21 | if (is.null(bw.min)) {bw.min <- max(round(Obs*0.05,0), ModelVarNo+2, 20)} 22 | if (is.null(bw.max)) {bw.max <- max(round(Obs*0.95,0), ModelVarNo+2)} 23 | if (is.null(mtry)) {mtry= max(floor(ModelVarNo/3), 2)} 24 | 25 | #store goodness of fit statistics 26 | eval.bw.grf <- data.frame(Bandwidth=integer(), 27 | Local=double(), 28 | Mixed=double(), 29 | Low.Local=double(), 30 | stringsAsFactors=FALSE) 31 | set.seed(1234) 32 | count <- 1 33 | for(abw in seq(from= bw.min, to=bw.max, by=step)){ 34 | 35 | eval.bw.grf[count,1] <- abw 36 | 37 | grf16.a <- eval(substitute(grf(formula, dframe=dataset, bw=abw, kernel, coords, ntree=trees, mtry = mtry, importance=importance, nthreads=nthreads, forests = FALSE, geo.weighted = geo.weighted, print.results=FALSE, ...))) 38 | 39 | eval.bw.grf[count,2] <- grf16.a$LocalModelSummary$l.r.OOB 40 | 41 | message("Bandwidth: ", abw) 42 | message("R2 of Local Model: ", eval.bw.grf[count,2]) 43 | 44 | metrics_bw_grf <- postResample(pred = (grf16.a$LGofFit$LM_yfitOOB + grf16.a$Global.Model$predictions)/2, obs = Y) 45 | eval.bw.grf[count,3] <- metrics_bw_grf[2] 46 | 47 | metrics_bw_grf <- postResample(pred = (grf16.a$LGofFit$LM_yfitOOB*0.25) +(grf16.a$Global.Model$predictions*0.75), obs = Y) 48 | eval.bw.grf[count,4] <- metrics_bw_grf[2] 49 | 50 | count <- count + 1 51 | } 52 | 53 | best.bw <- eval.bw.grf$Bandwidth[which(eval.bw.grf$Local == max(eval.bw.grf$Local))] 54 | 55 | message("Best Bandwidth (Based on the Local Model): ", best.bw) 56 | 57 | return(list(tested.bandwidths = eval.bw.grf, Best.BW = best.bw)) 58 | } 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /man/rf.mtry.optim.Rd: -------------------------------------------------------------------------------- 1 | \name{rf.mtry.optim} 2 | \alias{rf.mtry.optim} 3 | \title{Optimal mtry} 4 | 5 | \description{This function calculates the optimal mtry for a given Random Forest (RF) model in a specified range of values. The optimal mtry value can then be used in the grf model.} 6 | 7 | \usage{rf.mtry.optim(formula, dataset, min.mtry=NULL, max.mtry=NULL, mtry.step, 8 | cv.method="repeatedcv", cv.folds=10, ...)} 9 | 10 | \arguments{ 11 | \item{formula}{the model to be fitted using the function \code{train} of the R package \code{caret}.} 12 | 13 | \item{dataset}{a numeric data frame of at least two suitable variables (one dependent and one independent)} 14 | 15 | \item{min.mtry}{the minimum mtry value for its optimisation (function \code{expand.grid})} 16 | 17 | \item{max.mtry}{the maximum mtry value for its optimisation (function \code{expand.grid})} 18 | 19 | \item{mtry.step}{the step in the sequence of mtry values for its optimisation (function \code{expand.grid})} 20 | 21 | \item{cv.method}{the resampling method in the function \code{trainControl} of the R package \code{caret}. Default option is "repeatedcv" and alternative option is "cv".} 22 | 23 | \item{cv.folds}{the number of folds (argument "number" in the function \code{trainControl}). Default value is 10)} 24 | 25 | \item{...}{additional arguments affecting the function \code{trainControl})} 26 | 27 | } 28 | 29 | \details{Based on the \code{train} function of the \code{caret} package, this function sets up a grid of tuning parameters for a number of random forest routines, fits each model and calculates a resampling based performance measure to choose the best mtry value.} 30 | 31 | \value{A list is returned of class train as in the function \code{train} in the \code{caret} package.} 32 | 33 | \references{Kuhn, M. (2008). Building Predictive Models in R Using the caret Package. Journal of Statistical Software, 28(5), 1 - 26. doi: 34 | 35 | Georganos, S. and Kalogirou, S. (2022) A Forest of Forests: A Spatially Weighted and Computationally Efficient Formulation of Geographical Random Forests. ISPRS, International Journal of Geo-Information, 2022, 11, 471. } 36 | 37 | \author{Stamatis Kalogirou , Stefanos Georganos } 38 | 39 | \note{This function is under development.} 40 | 41 | \examples{ 42 | 43 | \donttest{ 44 | data(Income) 45 | Coords <- Income[ ,1:2] 46 | results <- rf.mtry.optim(Income01 ~ UnemrT01 + PrSect01, Income) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /man/Income.Rd: -------------------------------------------------------------------------------- 1 | \name{Income} 2 | \alias{Income} 3 | \docType{data} 4 | \title{Mean household income at lcoal authorities in Greece in 2011} 5 | \description{Municipality centroids and socioeconomic variables aggregated to the new local authority geography in Greece (Programme Kallikratis).} 6 | 7 | \usage{data(Income)} 8 | \format{ 9 | A data frame with 325 observations on the following 5 variables. 10 | \describe{ 11 | \item{\code{X}}{a numeric vector of x coordinates} 12 | \item{\code{Y}}{a numeric vector of y coordinates} 13 | \item{\code{UnemrT01}}{a numeric vector of total unemployment rate in 2001 (Census)} 14 | \item{\code{PrSect01}}{a numeric vector of the proportion of economically active working in the primary financial sector (mainly agriculture; fishery; and forestry in 2001 (Census))} 15 | \item{\code{Foreig01}}{a numeric vector of proportion of people who do not have the Greek citizenship in 2001 (Census)} 16 | \item{\code{Income01}}{a numeric vector of mean recorded household income (in Euros) earned in 2001 and declared in 2002 tax forms} 17 | 18 | } 19 | } 20 | 21 | \details{ 22 | The X,Y coordinates refer to the geometric centroids of the new 325 Municipalities in Greece (Programme Kallikratis) in 2011.} 23 | \source{ 24 | The original shapefile of the corresponding polygons is available from the Hellenic Statistical Authority (EL.STAT.) at \url{http://www.statistics.gr/el/digital-cartographical-data}. The population, employment, citizenship and employment sector data is available from the Hellenic Statistical Authority (EL.STAT.) at \url{http://www.statistics.gr/en/home} but were aggregated to the new municipalities by the author. The income data are available from the General Secretariat of Information Systems in Greece at \url{http://www.gsis.gr/} at the postcode level of geography and were aggregated to the new municipalities by the author. 25 | } 26 | \references{ 27 | Kalogirou, S., and Hatzichristos, T. (2007). A spatial modelling framework for income estimation. Spatial Economic Analysis, 2(3), 297-316. \url{https://www.tandfonline.com/doi/full/10.1080/17421770701576921} 28 | 29 | Kalogirou, S. (2010). Spatial inequalities in income and post-graduate educational attainment in Greece. Journal of Maps, 6(1), 393-400.\url{https://www.tandfonline.com/doi/abs/10.4113/jom.2010.1095} 30 | 31 | Kalogirou, S. (2013) Testing geographically weighted multicollinearity diagnostics, GISRUK 2013, Department of Geography and Planning, School of Environmental Sciences, University of Liverpool, Liverpool, UK, 3-5 April 2013. 32 | 33 | } 34 | \examples{ 35 | data(Income) 36 | boxplot(Income$Income01) 37 | hist(Income$PrSect01) 38 | } 39 | \keyword{datasets} 40 | \keyword{Greek Municipalities} 41 | \keyword{Income} 42 | -------------------------------------------------------------------------------- /man/predict.grf.Rd: -------------------------------------------------------------------------------- 1 | \name{predict.grf} 2 | \alias{predict.grf} 3 | \title{Predict Method for Geographical Random Forest} 4 | \description{Prediction of test data using the geographical random forest.} 5 | 6 | \usage{\method{predict}{grf}(object, new.data, x.var.name, y.var.name, local.w=1, global.w=0,...) 7 | } 8 | \arguments{ 9 | \item{object}{an object that created by the function grf that includes all local forests.} 10 | \item{new.data}{a data frame containing new data.} 11 | \item{x.var.name}{the name of the variable with X coordinates.} 12 | \item{y.var.name}{the name of the variable with Y coordinates.} 13 | \item{local.w}{weight of the local model predictor allowing semi-local predictions. Default value is 1.} 14 | \item{global.w}{weight of the global model predictor allowing semi-local predictions. Default value is 0.} 15 | \item{...}{for other arguments passed to the generic predict functions. For example you may pass here the number of threats}} 16 | \details{A Geographical Random Forest prediction on unknown data. The nearest local random forest model in coordinate space is used to predict in each unknown y-variable location. 17 | } 18 | 19 | \value{vector of predicted values} 20 | 21 | \references{Stefanos Georganos, Tais Grippa, Assane Niang Gadiaga, Catherine Linard, Moritz Lennert, Sabine Vanhuysse, Nicholus Odhiambo Mboga, Eléonore Wolff & Stamatis Kalogirou (2019) Geographical Random Forests: A Spatial Extension of the Random Forest Algorithm to Address Spatial Heterogeneity in Remote Sensing and Population Modelling, Geocarto International, DOI: 10.1080/10106049.2019.1595177} 22 | 23 | \author{Stamatis Kalogirou , Stefanos Georganos } 24 | 25 | \note{This function is under development. There should be improvements in future versions of the package \code{SpatialML}. Any suggestion is welcome!} 26 | 27 | \seealso{ 28 | \code{\link{grf}} 29 | } 30 | 31 | \examples{ 32 | \dontrun{ 33 | RDF <- random.test.data(10,10,3) 34 | Coords<-RDF[ ,4:5] 35 | grf <- grf(dep ~ X1 + X2, dframe=RDF, bw=10, 36 | kernel="adaptive", coords=Coords) 37 | 38 | RDF.Test <- random.test.data(2,2,3) 39 | 40 | predict.grf(grf, RDF.Test, x.var.name="X", y.var.name="Y", local.w=1, global.w=0) 41 | } 42 | \donttest{ 43 | #Load the sample data 44 | data(Income) 45 | 46 | #Create the vector of XY coordinates 47 | Coords<-Income[,1:2] 48 | 49 | #Fit local model 50 | grf <- grf(Income01 ~ UnemrT01 + PrSect01, dframe=Income, bw=60, 51 | kernel="adaptive", coords=Coords) 52 | 53 | #Create New Random Data - XY coordinates inside the sample data map extend 54 | x<-runif(20, min = 142498, max = 1001578) 55 | y<-runif(20, min = 3855768, max = 4606754) 56 | u<-runif(20, min = 5, max = 50) 57 | p<-runif(20, min = 0, max = 100) 58 | f<-runif(20, min = 2, max = 30) 59 | df2<-data.frame(X=x, Y= y, UnemrT01=u, PrSect01=p, Foreig01=f) 60 | 61 | #Make predictions using the local model 62 | predict.grf(grf, df2, x.var.name="X", y.var.name="Y", local.w=1, global.w=0) 63 | } 64 | } 65 | 66 | \keyword{local random forest} 67 | -------------------------------------------------------------------------------- /man/grf.Rd: -------------------------------------------------------------------------------- 1 | \name{grf} 2 | \alias{grf} 3 | \title{Geographically Weighted Random Forest Model} 4 | \description{Fit a local version of the Random Forest algorithm, accounting for spatial non-stationarity.} 5 | \usage{grf(formula, dframe, bw, kernel, coords, ntree=500, mtry=NULL, 6 | importance="impurity", nthreads = NULL, forests = TRUE, 7 | geo.weighted = TRUE, print.results=TRUE, ...)} 8 | \arguments{ 9 | 10 | \item{formula}{a formula specifying the local model to be fitted, using the syntax of the \code{\link{ranger}} package's \code{\link{ranger}} function.} 11 | 12 | \item{dframe}{a numeric data frame with at least two suitable variables (one dependent and one independent).} 13 | 14 | \item{bw}{a positive number representing either the number of nearest neighbors (for "adaptive kernel") or bandwidth in meters (for "fixed kernel").} 15 | 16 | \item{kernel}{the type of kernel to use in the regression: "adaptive" or "fixed".} 17 | 18 | \item{coords}{a numeric matrix or data frame containing X and Y coordinates of observations.} 19 | 20 | \item{ntree}{an integer referring to the number of trees to grow for each local random forest.} 21 | 22 | \item{mtry}{the number of variables randomly sampled as candidates at each split. Default is \code{p/3}, where \code{p} is the number of variables in the formula.} 23 | 24 | \item{importance}{feature importance measure for the dependent variables used as input in the random forest. Default is "impurity", which refers to the Gini index for classification and the variance of the responses for regression.} 25 | 26 | \item{nthreads}{number of threads for parallel processing. Default is the number of available CPUs. The argument passes to both \code{\link{ranger}} and \code{\link{predict}} functions.} 27 | 28 | \item{forests}{a option to save and export (TRUE) or not (FALSE) all local forests.} 29 | 30 | \item{geo.weighted}{if TRUE, calculate Geographically Weighted Random Forest using case weights. If FALSE, calculate local random forests without weighting each observation.} 31 | 32 | \item{print.results}{a option to print the summary of the analysis (TRUE) or not (FALSE).} 33 | 34 | \item{...}{additional arguments passed to the \code{\link{ranger}} function.} 35 | 36 | } 37 | \details{ 38 | Geographically Weighted Random Forest (GRF) is a spatial analysis method using a local version of the famous Machine Learning algorithm. It allows for the investigation of the existence of spatial non-stationarity, in the relationship between a dependent and a set of independent variables. The latter is possible by fitting a sub-model for each observation in space, taking into account the neighbouring observations. This technique adopts the idea of the Geographically Weighted Regression, Kalogirou (2003). The main difference between a tradition (linear) GWR and GRF is that we can model non-stationarity coupled with a flexible non-linear model which is very hard to overfit due to its bootstrapping nature, thus relaxing the assumptions of traditional Gaussian statistics. Essentially, it was designed to be a bridge between machine learning and geographical models, combining inferential and explanatory power. Additionally, it is suited for datasets with numerous predictors, due to the robust nature of the random forest algorithm in high dimensionality. 39 | 40 | 41 | Geographically Weighted Random Forest (GRF) is a spatial analysis method that fits a local version of the Random Forest algorithm for investigating spatial non-stationarity, in the relationship between a dependent and a set of independent variables. The latter is possible by fitting a sub-model for each observation in space, taking into account the neighbouring observations. This technique adopts the idea of the Geographically Weighted Regression, Kalogirou (2003). It models non-stationarity with a flexible non-linear approach, bridging the gap between machine learning and geographical models. The main difference between a tradition (linear) GWR and GRF is that we can model non-stationarity coupled with a flexible non-linear model which is very hard to overfit due to its bootstrapping nature, thus relaxing the assumptions of traditional Gaussian statistics.GRF is suitable for datasets with numerous predictors due to the robustness of the random forest algorithm in high dimensionality.} 42 | 43 | 44 | \value{ 45 | \item{Global.Model}{A ranger object of the global random forest model.} 46 | 47 | \item{Locations}{a numeric matrix or data frame with X and Y coordinates of observations.} 48 | 49 | \item{Local.Variable.Importance}{anumeric data frame with local feature importance for each predictor in each local random forest model.} 50 | 51 | \item{LGofFit}{a numeric data frame with residuals and local goodness of fit statistics.} 52 | 53 | \item{Forests}{all local forests.} 54 | 55 | \item{lModelSummary}{Local Model Summary and goodness of fit statistics.} 56 | } 57 | 58 | 59 | \references{Stefanos Georganos, Tais Grippa, Assane Niang Gadiaga, Catherine Linard, Moritz Lennert, Sabine Vanhuysse, Nicholus Odhiambo Mboga, Eléonore Wolff & Stamatis Kalogirou (2019) Geographical Random Forests: A Spatial Extension of the Random Forest Algorithm to Address Spatial Heterogeneity in Remote Sensing and Population Modelling, Geocarto International, DOI: 10.1080/10106049.2019.1595177 60 | 61 | Georganos, S. and Kalogirou, S. (2022) A Forest of Forests: A Spatially Weighted and Computationally Efficient Formulation of Geographical Random Forests. ISPRS, International Journal of Geo-Information, 2022, 11, 471. } 62 | 63 | \author{Stamatis Kalogirou , Stefanos Georganos } 64 | 65 | \note{ 66 | This function is under development, and improvements are expected in future versions of the package \code{SpatialML}. Any suggestions are welcome! 67 | } 68 | 69 | \section{Warning}{Large datasets may take long to calibrate. A high number of observations may result in a voluminous forests output.} 70 | 71 | \seealso{ 72 | \code{\link{predict.grf}} 73 | } 74 | 75 | \examples{ 76 | \dontrun{ 77 | RDF <- random.test.data(10,10,3) 78 | Coords<-RDF[ ,4:5] 79 | grf <- grf(dep ~ X1 + X2, dframe=RDF, bw=10, 80 | kernel="adaptive", coords=Coords) 81 | } 82 | \donttest{ 83 | data(Income) 84 | Coords<-Income[ ,1:2] 85 | grf <- grf(Income01 ~ UnemrT01 + PrSect01, dframe=Income, bw=60, 86 | kernel="adaptive", coords=Coords) 87 | } 88 | } 89 | 90 | \keyword{spatial random forest} 91 | \keyword{predictive analytics} 92 | -------------------------------------------------------------------------------- /man/grf.bw.Rd: -------------------------------------------------------------------------------- 1 | \name{grf.bw} 2 | \alias{grf.bw} 3 | \title{Geographically Weighted Random Forest optimal bandwidth selection} 4 | \description{This function finds the optimal bandwidth for the Geographically Weighted Random Forest algorithm using an exhaustive approach.} 5 | \usage{grf.bw(formula, dataset, kernel="adaptive", coords, bw.min = NULL, 6 | bw.max = NULL, step = 1, trees=500, mtry=NULL, importance="impurity", 7 | nthreads = 1, forests = FALSE, geo.weighted = TRUE, ...)} 8 | \arguments{ 9 | 10 | \item{formula}{the local model to be fitted using the same syntax used in the \code{ranger} function of the R package \code{\link{ranger}}. This is a string that is passed to the sub-models' \code{ranger} function. For more details look at the class \code{\link{formula}}.} 11 | 12 | \item{dataset}{a numeric data frame of at least two suitable variables (one dependent and one independent)} 13 | 14 | \item{kernel}{the kernel to be used in the regression. Options are "adaptive" (default) or "fixed".} 15 | 16 | \item{coords}{a numeric matrix or data frame of two columns giving the X,Y coordinates of the observations} 17 | 18 | \item{bw.min}{an integer referring to the minimum bandwidth that evaluation starts.} 19 | 20 | \item{bw.max}{an integer referring to the maximum bandwidth that evaluation ends.} 21 | 22 | \item{step}{an integer referring to the step for each iteration of the evaluation between the min and the max bandwidth. Default value is 1.} 23 | 24 | \item{trees}{an integer referring to the number of trees to grow for each of the local random forests.} 25 | 26 | \item{mtry}{the number of variables randomly sampled as candidates at each split. Note that the default values is p/3, where p is number of variables in the formula} 27 | 28 | \item{importance}{feature importance of the dependent variables used as input at the random forest. Default value is "impurity" which refers to the Gini index for classification and the variance of the responses for regression.} 29 | 30 | \item{nthreads}{Number of threads. Default is number of CPUs available. The argument passes to both ranger and predict functions.} 31 | 32 | \item{forests}{a option to save and export (TRUE) or not (FALSE) all the local forests. Default value is FALSE.} 33 | 34 | \item{geo.weighted}{if TRUE the algorithm calculates Geographically Weighted Random Forest using the case.weights option of the package ranger. If FALSE it will calculate local random forests without weighting each observation in the local data set.} 35 | 36 | \item{...}{further arguments passed to the grf and ranger functions} 37 | 38 | } 39 | \details{ 40 | Geographically Weighted Random Forest (GRF) is a spatial analysis method using a local version of the famous Machine Learning algorithm. It allows for the investigation of the existence of spatial non-stationarity, in the relationship between a dependent and a set of independent variables. The latter is possible by fitting a sub-model for each observation in space, taking into account the neighbouring observations. This technique adopts the idea of the Geographically Weighted Regression, Kalogirou (2003). The main difference between a tradition (linear) GWR and GRF is that we can model non-stationarity coupled with a flexible non-linear model which is very hard to over-fit due to its bootstrapping nature, thus relaxing the assumptions of traditional Gaussian statistics. Essentially, it was designed to be a bridge between machine learning and geographical models, combining inferential and explanatory power. Additionally, it is suited for datasets with numerous predictors, due to the robust nature of the random forest algorithm in high dimensionality. 41 | 42 | This function is a first attempt to find the optimal bandwidth for the grf. It uses an exhaustive approach, i.e. it tests sequential nearest neighbour bandwidths within a range and with a user defined step, and returns a list of goodness of fit statistics. It chooses the best bandwidth based on the maximum R2 value of the local model. Future versions of this function will include heuristic methods to find the optimal bandwidth using algorithms such as optim.} 43 | 44 | 45 | \value{ 46 | \item{tested.bandwidths}{A table with the tested bandwidths and the corresponding R2 of three model configurations: Local that refers to predictions based on the local (grf) model only; Mixed that refers to predictions that equally combine local (grf) and global (rf) model predictors; and Low.Local that refers to a prediction based on the combination of the local model predictors with a weight of 0.25 and the global model predictors with a weight of 0.75).} 47 | 48 | \item{best.bw}{Best bandwidth based on the local model predictions.} 49 | } 50 | 51 | \references{Stefanos Georganos, Tais Grippa, Assane Niang Gadiaga, Catherine Linard, Moritz Lennert, Sabine Vanhuysse, Nicholus Odhiambo Mboga, Eléonore Wolff and Stamatis Kalogirou (2019) Geographical Random Forests: A Spatial Extension of the Random Forest Algorithm to Address Spatial Heterogeneity in Remote Sensing and Population Modelling, Geocarto International, DOI: 10.1080/10106049.2019.1595177 52 | 53 | Georganos, S. and Kalogirou, S. (2022) A Forest of Forests: A Spatially Weighted and Computationally Efficient Formulation of Geographical Random Forests. ISPRS, International Journal of Geo-Information, 2022, 11, 471. } 54 | 55 | \author{Stamatis Kalogirou , Stefanos Georganos } 56 | 57 | \note{ 58 | This function is under development. There should be improvements in future versions of the package \code{SpatialML}. Any suggestion is welcome! 59 | } 60 | 61 | \section{Warning}{Large datasets may take long time to evaluate the optimal bandwidth.} 62 | 63 | \seealso{ 64 | \code{\link{grf}} 65 | } 66 | 67 | \examples{ 68 | \dontrun{ 69 | RDF <- random.test.data(8,8,3) 70 | Coords<-RDF[ ,4:5] 71 | bw.test <- grf.bw(dep ~ X1 + X2, RDF, kernel="adaptive", 72 | coords=Coords, bw.min = 20, bw.max = 23, step = 1, 73 | forests = FALSE, weighted = TRUE) 74 | } 75 | \donttest{ 76 | data(Income) 77 | Coords<-Income[ ,1:2] 78 | 79 | bwe <-grf.bw(Income01 ~ UnemrT01 + PrSect01, Income, kernel="adaptive", 80 | coords=Coords, bw.min = 30, bw.max = 80, step = 1, 81 | forests = FALSE, weighted = TRUE) 82 | 83 | grf <- grf(Income01 ~ UnemrT01 + PrSect01, dframe=Income, bw=bwe$Best.BW, 84 | kernel="adaptive", coords=Coords) 85 | } 86 | } 87 | 88 | \keyword{spatial random forest} 89 | \keyword{predictive analytics} 90 | -------------------------------------------------------------------------------- /R/grf.R: -------------------------------------------------------------------------------- 1 | # This function fits a geographical random forest model. 2 | # Inputs: 3 | # - formula: an object of class "formula" or one that can be coerced to that class 4 | # - dframe: a data frame containing the variables in the model 5 | # - bw: bandwidth, used for kernel density estimation 6 | # - kernel: type of kernel to use ('adaptive' or 'fixed') 7 | # - coords: coordinates for the geographical data 8 | # - ntree: number of trees to grow in the forest 9 | # - mtry: number of variables randomly sampled as candidates at each split 10 | # - importance: type of importance measure ('impurity' or 'permutation') 11 | # - nthreads: number of threads for parallel processing 12 | # - forests: boolean indicating whether to save the local forests 13 | # - geo.weighted: boolean indicating whether to use geographical weighting 14 | # - print.results: boolean indicating whether to print the results 15 | # - ...: additional arguments passed to the ranger function 16 | 17 | grf <- function(formula, dframe, bw, kernel, coords, ntree=500, mtry=NULL, importance="impurity", nthreads = NULL, forests = TRUE, geo.weighted = TRUE, print.results=TRUE, ...) 18 | { 19 | 20 | # Start timing the function execution 21 | start.time <- Sys.time() 22 | 23 | # Convert formula text to a formula object 24 | f <- formula(formula) 25 | 26 | # Extract variable names from the formula 27 | RNames <- attr(terms(f), "term.labels") 28 | 29 | # Get the name of the dependent variable 30 | DepVarName <- row.names(attr(terms(f), "factors"))[1] 31 | 32 | # Create a data frame for the dependent variable 33 | Y.DF <- dframe[DepVarName] 34 | 35 | # Convert the dependent variable data frame to a vector 36 | Y <- Y.DF[[1]] 37 | 38 | # Determine the number of independent variables and add 1 for degrees of freedom 39 | ModelVarNo <- length(RNames) 40 | K = ModelVarNo + 1 41 | 42 | # Set the number of trees in the model 43 | ntrees <- ntree 44 | 45 | # Count the number of observations in the data 46 | Obs <- nrow(dframe) 47 | 48 | # Define mtry if it is not provided [max(floor(Number of Variables/3), 1)] 49 | if (is.null(mtry)) {mtry= max(floor(ModelVarNo/3), 1)} 50 | 51 | # Print initial information if required 52 | if(print.results) { 53 | message("\nNumber of Observations: ", Obs) 54 | message("Number of Independent Variables: ", ModelVarNo) 55 | } 56 | 57 | # Configure the kernel type and its parameters 58 | if(kernel == 'adaptive') 59 | { 60 | Ne <- bw 61 | if(print.results) {message("Kernel: Adaptive\nNeightbours: ", Ne)} 62 | } 63 | else 64 | { 65 | if(kernel == 'fixed') 66 | { 67 | if(print.results) {message("Kernel: Fixed\nBandwidth: ", bw)} 68 | } 69 | } 70 | 71 | # Fit the global random forest model using the ranger package 72 | Gl.Model <- eval(substitute(ranger(formula, data = dframe, num.trees=ntree, mtry= mtry, importance=importance, num.threads = nthreads, ...))) 73 | 74 | # Get predictions from the global model 75 | Predict <- predict(Gl.Model, dframe, num.threads = nthreads) 76 | 77 | yhat <- Predict$predictions 78 | 79 | # Print global model summary if required 80 | if(print.results) { 81 | message("\n--------------- Global ML Model Summary ---------------\n") 82 | print(Gl.Model) 83 | 84 | message("\nImportance:\n") 85 | print(Gl.Model$variable.importance) 86 | 87 | #calculate pseudoR2 88 | g.RSS <- sum((Y-yhat)^2) 89 | g.mean.y <- mean(Y) 90 | g.TSS<-sum((Y-g.mean.y)^2) 91 | 92 | g.r<-1-(g.RSS/g.TSS) 93 | 94 | g.AIC <- 2*K + Obs*log(g.RSS/Obs) 95 | 96 | g.AICc <- g.AIC + ((2*K*(K +1)) / (Obs - K - 1)) 97 | 98 | message("\nMean Square Error (Not OOB): ", round(g.RSS/Obs,3)) 99 | message("R-squared (Not OOB) %: ", round(100 * g.r,3)) 100 | message("AIC (Not OOB): ", round(g.AIC,3)) 101 | message("AICc (Not OOB): ", round(g.AICc,3)) 102 | } 103 | 104 | # Calculate distances between observations based on coordinates 105 | DistanceT <- dist(coords) 106 | Dij <- as.matrix(DistanceT) 107 | 108 | # Initialize storage for local forests if required 109 | if (forests == TRUE) {LM_Forests <- as.list(rep(NA, length(ntrees)))} 110 | 111 | LM_LEst <- as.data.frame(setNames(replicate(ModelVarNo, numeric(0), simplify = F), RNames[1:ModelVarNo])) 112 | 113 | LM_GofFit <- data.frame(y=numeric(0), LM_yfitOOB=numeric(0), LM_ResOOB=numeric(0), LM_yfitPred=numeric(0), 114 | LM_ResPred=numeric(0), LM_MSE=numeric(0), LM_Rsq100=numeric(0), LPerm=numeric(0)) 115 | 116 | for(m in 1:Obs){ 117 | 118 | #Get the data 119 | DNeighbour <- Dij[,m] 120 | DataSet <- data.frame(dframe, DNeighbour = DNeighbour) 121 | 122 | #Sort by distance 123 | DataSetSorted <- DataSet[order(DataSet$DNeighbour),] 124 | 125 | if(kernel == 'adaptive') 126 | { 127 | #Keep Nearest Neighbours 128 | SubSet <- DataSetSorted[1:Ne,] 129 | Kernel_H <- max(SubSet$DNeighbour) 130 | } 131 | else 132 | { 133 | if(kernel == 'fixed') 134 | { 135 | SubSet <- subset(DataSetSorted, DNeighbour <= bw) 136 | Kernel_H <- bw 137 | } 138 | } 139 | 140 | #Bi-square weights 141 | Wts <- (1-(SubSet$DNeighbour/Kernel_H)^2)^2 142 | 143 | #Calculate WLM 144 | if (geo.weighted == TRUE) { 145 | Lcl.Model <- eval(substitute(ranger(formula, data = SubSet, num.trees=ntree, mtry= mtry, importance=importance, case.weights=Wts, num.threads = nthreads, ...))) 146 | 147 | local.predicted.y <- Lcl.Model$predictions[[1]] 148 | counter <- 1 149 | while (is.nan(local.predicted.y)) { 150 | Lcl.Model<-eval(substitute(ranger(formula, data = SubSet, num.trees=ntree, mtry= mtry, importance=importance, case.weights=Wts, num.threads = nthreads, ...))) 151 | local.predicted.y <- Lcl.Model$predictions[[1]] 152 | counter <- counter + 1 153 | } 154 | } else 155 | { 156 | Lcl.Model<-eval(substitute(ranger(formula, data = SubSet, num.trees=ntree, mtry= mtry, importance=importance, num.threads = nthreads, ...))) 157 | counter <- 1 158 | } 159 | 160 | 161 | 162 | if (forests == TRUE) {LM_Forests[[m]] <- Lcl.Model} 163 | 164 | #Store in table 165 | #Importance 166 | for (j in 1:ModelVarNo) { 167 | LM_LEst[m,j] <- Lcl.Model$variable.importance[j] 168 | } 169 | 170 | #Observed y 171 | LM_GofFit[m,1] <- Y[m] 172 | LM_GofFit[m,2] <- Lcl.Model$predictions[[1]] 173 | LM_GofFit[m,3] <- LM_GofFit[m,1] - LM_GofFit[m,2] 174 | l.predict <- predict(Lcl.Model, dframe[m,], num.threads = nthreads) 175 | LM_GofFit[m,4] <- l.predict$predictions 176 | LM_GofFit[m,5] <- LM_GofFit[m,1] - LM_GofFit[m,4] 177 | LM_GofFit[m,6] <- Lcl.Model$prediction.error 178 | LM_GofFit[m,7] <- Lcl.Model$r.squared 179 | LM_GofFit[m,8] <- counter 180 | } 181 | 182 | # Compile outputs from the function 183 | if (forests == TRUE) {grf.out <- list(Global.Model=Gl.Model, Locations = coords, Local.Variable.Importance = LM_LEst, LGofFit=LM_GofFit, Forests=LM_Forests)} 184 | else {grf.out <- list(Global.Model=Gl.Model, Locations = coords, Local.Variable.Importance = LM_LEst, LGofFit=LM_GofFit)} 185 | 186 | if(print.results) { 187 | 188 | message("\n--------------- Local Model Summary ---------------\n") 189 | 190 | message("\nResiduals OOB:\n") 191 | print(summary(grf.out$LGofFit$LM_ResOOB)) 192 | 193 | message("\nResiduals Predicted (Not OOB):\n") 194 | 195 | print(summary(grf.out$LGofFit$LM_ResPred)) 196 | 197 | } 198 | lvi <- data.frame(Min = apply(grf.out$Local.Variable.Importance, 2, min), Max = apply(grf.out$Local.Variable.Importance, 2, max), 199 | Mean = apply(grf.out$Local.Variable.Importance, 2, mean), StD = apply(grf.out$Local.Variable.Importance, 2, sd)) 200 | 201 | 202 | l.RSS.OOB <- sum(grf.out$LGofFit$LM_ResOOB^2) 203 | l.RSS.Pred<-sum(grf.out$LGofFit$LM_ResPred^2) 204 | 205 | mean.y<-mean(grf.out$LGofFit$y) 206 | TSS<-sum((grf.out$LGofFit$y-mean.y)^2) 207 | 208 | l.r.OOB<-1-(l.RSS.OOB/TSS) 209 | g.AIC.OOB <- 2*K + Obs*log(l.RSS.OOB/Obs) 210 | g.AICc.OOB <- g.AIC.OOB + ((2*K*(K + 1)) / (Obs - K - 1)) 211 | 212 | 213 | 214 | l.r.Pred<-1-(l.RSS.Pred/TSS) 215 | g.AIC.Pred <- 2*K + Obs*log(l.RSS.Pred/Obs) 216 | g.AICc.Pred <- g.AIC.Pred + ((2*K*(K +1)) / (Obs - K - 1)) 217 | 218 | if(print.results) { 219 | 220 | message("\nLocal Variable Importance:\n") 221 | print(lvi) 222 | message("\nMean squared error (OOB): ", round(l.RSS.OOB/Obs,3)) 223 | message("R-squared (OOB) %: ", round(100* l.r.OOB,3)) 224 | message("AIC (OOB): ", round(g.AIC.OOB,3)) 225 | message("AICc (OOB): ", round(g.AICc.OOB,3)) 226 | message("Mean squared error Predicted (Not OOB): ", round(l.RSS.Pred/Obs,3)) 227 | message("R-squared Predicted (Not OOB) %: ", round(100* l.r.Pred,3)) 228 | message("AIC Predicted (Not OOB): ", round(g.AIC.Pred,3)) 229 | message("AICc Predicted (Not OOB): ", round(g.AICc.Pred,3)) 230 | } 231 | 232 | lModelSummary = list() 233 | lModelSummary$l.VariableImportance <- lvi 234 | lModelSummary$l.MSE.OOB <- l.RSS.OOB/Obs 235 | lModelSummary$l.r.OOB <- l.r.OOB 236 | lModelSummary$l.MSE.Pred <- l.RSS.Pred/Obs 237 | lModelSummary$l.r.Pred <- l.r.Pred 238 | 239 | grf.out$LocalModelSummary <- lModelSummary 240 | 241 | # Calculate and print the time taken to run the function 242 | end.time <- Sys.time() 243 | time.taken <- end.time - start.time 244 | 245 | if(print.results) {message("\nCalculation time (in seconds): ", round(time.taken,4))} 246 | 247 | # Return the output list 248 | return(grf.out) 249 | } 250 | --------------------------------------------------------------------------------