├── README.md ├── SCR-function.R ├── data-generation.R ├── example-Baltimore.R ├── simdata1.RData ├── simdata2.RData └── simulation.R /README.md: -------------------------------------------------------------------------------- 1 | # SCR: Spatially Clustered Regression 2 | 3 | This repository provides R code implementing spatially clustered regression for spatial data analysis, as proposed by the following paper. 4 | 5 | [Sugasawa, S. and Murakami, D. (2021). Spatially Clustered Regression. *Spatial Statistics* 44, 100525.](https://doi.org/10.1016/j.spasta.2021.100525) 6 | (arXiv version: https://arxiv.org/abs/2011.01493) 7 | 8 | The repository includes the following files. 9 | 10 | * SCR-function.R : Script implementing the proposed method 11 | * simulation.R : Script applying the proposed method to two simulated datasets 12 | * simdata1.RData: Simulated data 1 13 | * simdata2.RData: Simulated data 2 14 | * data-generation.R: Script for generating the two simulated datasets 15 | * example-Boltimore.R: Script for applying SCR to Baltimore dataset (available from `spdep` package) 16 | -------------------------------------------------------------------------------- /SCR-function.R: -------------------------------------------------------------------------------- 1 | ###-----------------------------------------------------### 2 | ### Functions for spatially clustered regression (SCR) ### 3 | ###-----------------------------------------------------### 4 | ## This code implements the following two functions for SCR/SFCR 5 | # 'SCR': SCR/SFCR with fixed G 6 | # 'SCR.select': tuning parameter (G) selection via BIC-type criteria 7 | 8 | ## packages 9 | library(SparseM) 10 | library(MASS) 11 | 12 | 13 | ### Spatially clustered regression (with LASSO) ### 14 | ## Input 15 | # Y: n-dimensional response vector 16 | # X: (n,p)-matrix of covariates (p: number of covariates) 17 | # W: (n,n)-matrix of spatial weight 18 | # Sp: (n,2)-matrix of location information 19 | # G: number of groups 20 | # Phi: tuning parameter for spatial similarity 21 | # offset: n-dimensional vector of offset term (applicable only to "poisson" and "NB") 22 | # fuzzy: if True, SFCR is applied 23 | # maxitr: maximum number of iterations 24 | # family: distribution family ("gaussian", "poisson" or "NB) 25 | 26 | ## Output 27 | # Beta: (G,p)-matrix of group-wise regression coefficients 28 | # Sig: G-dimensional vector of group-wise standard deviations (only for "gaussian" and "NB") 29 | # group: n-dimensional vector of group assignment 30 | # sBeta: (n,p)-matrix of location-wise regression coefficients 31 | # sSig: n-dimensional vector of location-wise standard deviations (only for "gaussian") 32 | # s: n-dimensional vector of location-wise standard deviations (only for "gaussian") 33 | # ML: maximum log-likelihood 34 | # itr: number of iterations 35 | 36 | ## Remark 37 | # matrix X should not include an intercept term 38 | # initial grouping is determined by K-means of spatial locations 39 | 40 | 41 | ## Main function 42 | SCR <- function(Y, X, W, Sp, G=5, Phi=1, offset=NULL, fuzzy=F, maxitr=100, delta=1, family="gaussian"){ 43 | ## Preparations 44 | ep <- 10^(-5) # convergence criterion 45 | X <- as.matrix(X) 46 | n <- dim(X)[1] # number of samples 47 | p <- dim(X)[2]+1 # number of regression coefficients 48 | XX <- as.matrix( cbind(1,X) ) 49 | W <- as(W, "sparseMatrix") 50 | if(is.null(offset)){ offset <- rep(0, n) } 51 | nmax <- function(x){ max(na.omit(x)) } # new max function 52 | 53 | ## Initial values 54 | M <- 20 # the number of initial values of k-means 55 | WSS <- c() 56 | CL <- list() 57 | for(k in 1:M){ 58 | CL[[k]] <- kmeans(Sp, G) 59 | WSS[k] <- CL[[k]]$tot.withinss 60 | } 61 | Ind <- CL[[which.min(WSS)]]$cluster 62 | Pen <- rep(0, G) 63 | Beta <- matrix(0, p, G) 64 | dimnames(Beta)[[2]] <- paste0("G=",1:G) 65 | Sig <- rep(1, G) # not needed under non-Gaussian case 66 | Nu <- rep(1, G) 67 | 68 | ## iterative algorithm 69 | val <- 0 70 | mval <- 0 71 | for(k in 1:maxitr){ 72 | cval <- val 73 | 74 | ## penalty term 75 | Ind.mat <- matrix(0, n, G) 76 | for(g in 1:G){ 77 | Ind.mat[Ind==g, g] <- 1 78 | } 79 | Ind.mat <- as(Ind.mat, "sparseMatrix") 80 | Pen <- W%*%Ind.mat # penalty term 81 | 82 | ## model parameters (clustered case) 83 | if(fuzzy==F){ 84 | for(g in 1:G){ 85 | if(length(Ind[Ind==g])>p+1){ 86 | # gaussian 87 | if(family=="gaussian"){ 88 | fit <- lm(Y[Ind==g]~X[Ind==g,]) 89 | Beta[,g] <- as.vector( coef(fit) ) 90 | resid <- Y-as.vector(XX%*%Beta[,g]) 91 | Sig[g] <- sqrt(mean(resid[Ind==g]^2)) 92 | Sig[g] <- max(Sig[g], 0.1) 93 | } 94 | # poisson 95 | if(family=="poisson"){ 96 | x <- X[Ind==g,] 97 | y <- Y[Ind==g] 98 | off <- offset[Ind==g] 99 | fit <- glm(y~x, offset=off, family="poisson") 100 | Beta[,g] <- as.vector( coef(fit) ) 101 | } 102 | # NB 103 | if(family=="NB"){ 104 | x <- X[Ind==g,] 105 | y <- Y[Ind==g] 106 | off <- offset[Ind==g] 107 | fit <- glm.nb(y~x+offset(off)) 108 | Beta[,g] <- as.vector( coef( fit ) ) 109 | Nu[g] <- fit$theta 110 | } 111 | } 112 | } 113 | } 114 | 115 | ## model parameters (fuzzy case) 116 | if(fuzzy==T){ 117 | # Gaussian 118 | if(family=="gaussian"){ 119 | Mu <- XX%*%Beta # (n,G)-matrix 120 | ESig <- t(matrix(rep(Sig,n), G, n)) # (n,G)-matrix 121 | log.dens <- log(dnorm(Y,Mu,ESig)) + Phi*Pen 122 | mval <- apply(log.dens, 1, max) 123 | log.denom <- mval + log(apply(exp(log.dens-mval), 1, sum)) 124 | PP <- exp(log.dens-log.denom) # weight 125 | for(g in 1:G){ 126 | if(sum(PP[,g])>0.1){ 127 | fit <- lm(Y~X, weights=PP[,g]) 128 | Beta[,g] <- as.vector( coef(fit) ) 129 | resid <- Y-as.vector(XX%*%Beta[,g]) 130 | Sig[g] <- sqrt( sum(PP[,g]*resid^2)/sum(PP[,g]) ) 131 | Sig[g] <- max(Sig[g], 0.1) 132 | } 133 | } 134 | } 135 | 136 | # Poisson 137 | if(family=="poisson"){ 138 | Mu <- exp(offset + XX%*%Beta) # (n,G)-matrix 139 | log.dens <- log(dpois(Y, Mu)) + Phi*Pen 140 | mval <- apply(log.dens, 1, max) 141 | log.denom <- mval + log(apply(exp(log.dens-mval), 1, sum)) 142 | PP <- exp(log.dens-log.denom) # weight 143 | for(g in 1:G){ 144 | if(sum(PP[,g])>0.1){ 145 | fit <- glm(Y~X, offset=offset, weights=PP[,g], family="poisson") 146 | Beta[,g] <- as.vector( coef(fit) ) 147 | } 148 | } 149 | } 150 | # NB 151 | if(family=="NB"){ 152 | Mu <- exp(offset + XX%*%Beta) # (n,G)-matrix 153 | log.dens <- dnbinom(Y, size=Nu, prob=Nu/(Nu+Mu), log=T) + Phi*Pen 154 | mval <- apply(log.dens, 1, max) 155 | log.denom <- mval + log(apply(exp(log.dens-mval), 1, sum)) 156 | PP <- exp(log.dens-log.denom) # weight 157 | for(g in 1:G){ 158 | if(sum(PP[,g])>0.1){ 159 | fit <- glm.nb(Y~X+offset(offset), weights=PP[,g]) 160 | Beta[,g] <- as.vector( coef(fit) ) 161 | Nu[g] <- fit$theta 162 | } 163 | } 164 | } 165 | } 166 | 167 | ## Grouping (clustered case) 168 | if(fuzzy==F){ 169 | if(family=="gaussian"){ 170 | Mu <- XX%*%Beta # (n,G)-matrix 171 | ESig <- t(matrix(rep(Sig,n), G, n)) # (n,G)-matrix 172 | Q <- dnorm(Y, Mu, ESig, log=T) + Phi*Pen # penalized likelihood 173 | } 174 | if(family=="poisson"){ 175 | Mu <- exp(offset + XX%*%Beta) 176 | Q <- dpois(Y, Mu, log=T) + Phi*Pen # penalized likelihood 177 | } 178 | if(family=="NB"){ 179 | Mu <- exp(offset + XX%*%Beta) 180 | Q <- dnbinom(Y, size=Nu, prob=Nu/(Nu+Mu), log=T) + Phi*Pen # penalized likelihood 181 | } 182 | Ind <- apply(Q, 1, which.max) 183 | } 184 | 185 | ## Grouping (fuzzy case) 186 | if(fuzzy==T){ 187 | if(family=="gaussian"){ 188 | Mu <- XX%*%Beta # (n,G)-matrix 189 | ESig <- t(matrix(rep(Sig,n), G, n)) # (n,G)-matrix 190 | Q <- delta*(dnorm(Y, Mu, ESig, log=T) + Phi*Pen) # penalized likelihood 191 | mval <- apply(Q, 1, max) 192 | log.denom <- mval + log(apply(exp(Q-mval), 1, sum)) 193 | PP <- exp(Q-log.denom) 194 | } 195 | if(family=="poisson"){ 196 | Mu <- exp(offset + XX%*%Beta) # (n,G)-matrix 197 | Q <- delta*(dpois(Y, Mu, log=T) + Phi*Pen) # penalized likelihood 198 | mval <- apply(Q, 1, max) 199 | log.denom <- mval + log(apply(exp(Q-mval), 1, sum)) 200 | PP <- exp(Q-log.denom) 201 | } 202 | if(family=="NB"){ 203 | Mu <- exp(offset + XX%*%Beta) # (n,G)-matrix 204 | Q <- delta*(dnbinom(Y, size=Nu, prob=Nu/(Nu+Mu), log=T) + Phi*Pen) # penalized likelihood 205 | mval <- apply(Q, 1, max) 206 | log.denom <- mval + log(apply(exp(Q-mval), 1, sum)) 207 | PP <- exp(Q-log.denom) 208 | } 209 | Ind <- apply(PP, 1, which.max) 210 | } 211 | 212 | ## Value of objective function 213 | val <- sum( apply(Q, 1, nmax) ) 214 | dd <- abs(cval-val)/abs(val) 215 | mval <- max(mval, cval) 216 | if( dd