├── Application-to-real-dataset
    ├── BGWR.py
    ├── BandwidthSelection.py
    └── EpiData.csv
├── README.md
└── Simulation
    ├── BGWR.py
    ├── BandwidthSelection.py
    ├── simuData.py
    └── simulateDate.csv


/Application-to-real-dataset/BGWR.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Dec 19 14:08:10 2020
  4 | 
  5 | @author: Yang Liu
  6 | """
  7 | 
  8 | 
  9 | import numpy as np
 10 | import os
 11 | import pandas as pd
 12 | from scipy.special import loggamma
 13 | import scipy.stats
 14 | from datetime import datetime
 15 | from multiprocessing import Pool
 16 | 
 17 | file_id = int(os.getenv('SLURM_ARRAY_TASK_ID'))
 18 | #print([task_id,type(task_id)],flush=True)
 19 | 
 20 | task_id = 4
 21 | rep_id = file_id
 22 | is_para = True    #Using parallel computing？
 23 | num_core = 10     #number of cores
 24 | fitting_ratio = 0    #the proportion of samples used for testing at the location of interest (i.e., splitting samples at location of interest into fitting set and testing set).
 25 | is_eucliDis = False      #Using Euclidian distance?
 26 | is_block = True        #block sampling?
 27 | 
 28 | 
 29 | # set multiple cores
 30 | #pool = ThreadPool(4)
 31 | 
 32 | #geographical kernel bandwidth
 33 | h = [100,500,1000,2000,3000,5000,7000,10000,20000][task_id]
 34 | 
 35 | 
 36 | #Geographically weighted kernel (exponential kernel)
 37 | def G_kernel(d,h):
 38 |     return(np.exp(-d**2/h**2))
 39 |     
 40 | #euclidean distance
 41 | def eucliDis(A,B):
 42 |     A = np.array(A)
 43 |     B = np.array(B)
 44 |     return np.sqrt(sum((A-B)**2))
 45 | 
 46 | #spherical distance (measured in KM)
 47 | def Haversine(A,B):
 48 |     """
 49 |     This uses the ‘haversine’ formula to calculate the great-circle distance between two points – that is, 
 50 |     the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points 
 51 |     (ignoring any hills they fly over, of course!).
 52 |     Haversine
 53 |     formula:    a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
 54 |     c = 2 ⋅ atan2( √a, √(1−a) )
 55 |     d = R ⋅ c
 56 |     where   φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km);
 57 |     note that angles need to be in radians to pass to trig functions!
 58 |     """
 59 |     lat1,lon1,lat2,lon2 = A[0],A[1],B[0],B[1]
 60 |     
 61 |     R = 6378.0088
 62 |     lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2])
 63 | 
 64 |     dlat = lat2 - lat1
 65 |     dlon = lon2 - lon1
 66 |     a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2
 67 |     c = 2 * np.arctan2(a**0.5, (1-a)**0.5)
 68 |     d = R * c
 69 |     return round(d,4)
 70 | 
 71 | #log-likelihood of negative binomial distribution
 72 | def negBion(outcome,offset,covariate,phi,theta):
 73 |     mean = np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi)))
 74 |     result = loggamma(outcome+1/theta)-loggamma(1/theta)-loggamma(outcome+1)-(1/theta)*np.log(1+theta*mean)+outcome*np.log(theta*mean/(1+theta*mean))
 75 |     return result
 76 | 
 77 | #log-prior for phi (uniform) 
 78 | def prior_phi(phi):
 79 |     if max(abs(np.array(phi)))>1000:
 80 |         return np.log(0)
 81 |     else:
 82 |         return np.log(1/2000)*len(phi)
 83 |     
 84 | #log-prior for theta (uniform 0,1000)
 85 | def prior_theta(theta):
 86 |     if theta>1000 or theta<=0:
 87 |         return np.log(0)
 88 |     else:
 89 |         return np.log(1/1000)
 90 |     
 91 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi)
 92 | #EURO
 93 | pro_st = np.array([[ 1,0,0],
 94 |        [0,  0.1, 0],
 95 |        [0 , 0,  0.01]])
 96 |  
 97 | #SouthEast    
 98 | pro_st2 = np.array([[ 9.93206712e-01, -3.78288045e-02, -1.04242828e-05],
 99 |        [-3.78288045e-02,  6.41566397e-03, -3.39223226e-04],
100 |        [-1.04242828e-05, -3.39223226e-04,  3.57276820e-04]])
101 | 
102 | 
103 | #two step adatpive proposal sd:
104 | #aggressive proposal sd for phi to approximate true value before burn_in
105 | pro_early = [np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)]
106 | #mild proposal sd for phi to achieve a good mixture after burn_in
107 | pro_later1 = [np.dot(pro_st,pro_st),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)]
108 | pro_later2 = [np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)]
109 | 
110 |  
111 | #aggressive proposal sd for theta to approximate true value before burn_in 
112 | pro_theta_early = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25]
113 | #mild proposal sd for theta to achieve a good mixture after burn_in
114 | pro_theta_later = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25]
115 | 
116 | #speccialized with latitude
117 | #proposal sampling function for phi (multivariate normal)
118 | def r_phi(phi):
119 |     phi = np.array(phi)
120 |     phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id]).rvs(1)
121 |     return phi_n
122 | #proposal density function for phi
123 | def d_phi(phi_n,phi):
124 |     return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id]).pdf(phi_n))
125 | 
126 | def r_phi_new(phi,lat):
127 |     if(lat>20):
128 |         phi = np.array(phi)
129 |         phi_n = scipy.stats.multivariate_normal(phi,pro_later1[task_id]).rvs(1)
130 |         return phi_n
131 |     else:
132 |         phi = np.array(phi)
133 |         phi_n = scipy.stats.multivariate_normal(phi,pro_later2[task_id]).rvs(1)
134 |         return phi_n
135 | #proposal density function for phi
136 | def d_phi_new(phi_n,phi,lat):
137 |     if(lat>20):
138 |         return np.log(scipy.stats.multivariate_normal(phi,pro_later1[task_id]).pdf(phi_n))
139 |     else:
140 |         return np.log(scipy.stats.multivariate_normal(phi,pro_later2[task_id]).pdf(phi_n))
141 | 
142 | #proposal sampling function for theta (truncated normal)
143 | def r_theta(theta):
144 |     lower, upper, sd = 0, 1000, pro_theta_early[task_id-1]
145 |     X = scipy.stats.truncnorm(
146 |           (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd)
147 |     return float(X.rvs(size=1))
148 | #proposal density function for theta
149 | def d_theta(theta_n,theta):
150 |     theta_n = np.array(theta_n)
151 |     theta = np.array(theta)
152 |     lower, upper, sd = 0, 1000, pro_theta_early[task_id-1]
153 |     X = scipy.stats.truncnorm(
154 |           (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd)
155 |     return sum(np.log(X.pdf(theta_n)))
156 | 
157 | 
158 | #import data from file
159 | data = pd.read_csv('EpiData.csv',encoding='utf-8',header=0)
160 | #extract coordinates of locations
161 | location = data[['x','y']].drop_duplicates(subset=['x','y'])
162 | #number of locations
163 | num_location = location.shape[0]
164 | #add main key to location table
165 | location['index']=range(num_location)
166 | 
167 | #randomly select fitting subdata for each location of interest (cross validation)
168 | index_sel = []
169 | 
170 | for k in range(num_location):
171 |     loc_foc = location.values[k][0:2]
172 |     index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index
173 |     num_sel = int(len(index_loc)*fitting_ratio)
174 |     index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False)))
175 | 
176 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight
177 | def kernel_weight(data_slice,loc_int,h):
178 |     loc1=data_slice[0:2]
179 |     if(is_eucliDis == True):
180 |         dis = eucliDis(loc1,loc_int)
181 |     else:
182 |         dis = Haversine(loc1,loc_int)
183 |     kern = G_kernel(dis,h)
184 |     return(kern)
185 | 
186 | #geographical weighting kernel matrix
187 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample.
188 | weight = []
189 | theta_rep_num = np.zeros(shape=[num_location,num_location])
190 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)])
191 | for i in range(num_location):
192 |     loc_int_inner = location.values[i][0:2]
193 |     slice_weight = lambda x: kernel_weight(x,loc_int_inner,h)
194 |     weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values))))
195 |     for j in range(num_location):
196 |         theta_rep_num[i][j] = np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum()
197 |     theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))])
198 | 
199 | theta_slice_ind = theta_slice_ind.astype(int)        
200 | weight = np.array(weight)        
201 | weight = weight + weight.T - np.eye(num_location)
202 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)]
203 | 
204 | # joint log-density of negative binomial likelihood and prior given a certain location of interest.   Vectorization for fast calculation
205 | def joint_like(data,loc_int,phi,theta):
206 |     loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
207 |     theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
208 |     outcome = np.array(data.drop(index_sel[loc_ind])['outcome'])
209 |     offset = np.array(data.drop(index_sel[loc_ind])['offset'])
210 |     covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:])
211 |     mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
212 |     result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))))
213 |     result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
214 |     return(result)
215 | 
216 | def theta_like(subdata,loc_ind,phi,theta):
217 |     theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
218 |     outcome = np.array(subdata['outcome'])
219 |     offset = np.array(subdata['offset'])
220 |     covariate = np.array(subdata.iloc[:, 4:])
221 |     mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
222 |     theta_like_value = geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))
223 |     result = np.array([sum(theta_like_value[theta_slice_ind[loc_ind][k]:theta_slice_ind[loc_ind][k+1]]) for k in range(num_location)])
224 |     result += np.array(list(map(prior_theta, theta)))
225 |     return(result)
226 |     
227 |     
228 | 
229 | #old code (discarded)
230 | #def weight_like(data_slice,loc_int,phi,theta,h):
231 | #    loc1=data_slice[0:2]
232 | #    theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index'])
233 | #    dis = eucliDis(loc1,loc_int)
234 | #    kern = G_kernel(dis,h)
235 | #    return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind]))
236 | 
237 | 
238 | #def joint_like(data,loc_int,phi,theta,h):
239 | #    slice_like = lambda x: weight_like(x,loc_int,phi,theta,h)
240 | #    #result = sum( data.apply(slice_like,axis=1) )
241 | #    result = sum(map(slice_like,data.values))
242 | #    result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
243 | #    return(result)
244 |   
245 | #init=[[1,1],[1]*num_location]
246 |     
247 | 
248 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals),
249 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info".
250 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. See following function "GWR_MCMC_multloc"
251 | if is_block:
252 |     def GWR_update(model_info):
253 |         phi_old = model_info[0]
254 |         theta_old = model_info[1]
255 |         loc_int = model_info[2]
256 |         loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
257 |         subdata = data.drop(index_sel[loc_ind])
258 |         joint_old = model_info[3]
259 |         accept_num = model_info[5]
260 |         theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
261 |         phi_new = r_phi(phi_old)
262 |         theta_new = list(map(r_theta,theta_old))
263 |         joint_new_phi = joint_like(data,loc_int,phi_new,theta_old)
264 |         rate_phi = joint_new_phi + d_phi(phi_old,phi_new) - joint_old - d_phi(phi_new,phi_old) 
265 |         alfa_phi = min(1,np.exp(rate_phi))
266 |         runif = np.random.uniform(0,1,1)[0]
267 |         phi_old = phi_new if runif < alfa_phi else phi_old
268 |         accept_num = (accept_num + 1) if runif <alfa_phi else accept_num
269 |         joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
270 |         joint_old_theta = theta_like(subdata,loc_ind,phi_old,theta_old)
271 |         rate_theta = joint_new_theta + d_theta(theta_old,theta_new) - joint_old_theta - d_theta(theta_new,theta_old)
272 |         alfa_theta = np.minimum(np.ones_like(rate_theta),np.exp(rate_theta))
273 |         runif = np.random.uniform(0,1,len(alfa_theta))
274 |         theta_pro = [theta_new[q] if runif[q] < alfa_theta[q] else theta_old[q] for q in range(num_location)]
275 |         theta_old = theta_pro
276 |         sto_theta = theta_old[theta_focus]
277 |         joint_old = joint_like(data,loc_int,phi_old,theta_old)
278 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num])
279 | else:
280 |     def GWR_update(model_info):
281 |         phi_old = model_info[0]
282 |         theta_old = model_info[1]
283 |         loc_int = model_info[2]
284 |         joint_old = model_info[3]
285 |         accept_num = model_info[5]
286 |         theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
287 |         phi_new = r_phi(phi_old)
288 |         theta_new = list(map(r_theta,theta_old))
289 |         joint_new = joint_like(data,loc_int,phi_new,theta_new)
290 |         rate = joint_new + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new,theta_old)
291 |         alfa = min(1,np.exp(rate))
292 |         runif = np.random.uniform(0,1,1)[0]
293 |         phi_old = phi_new if runif < alfa else phi_old
294 |         theta_old = theta_new if runif <alfa else theta_old
295 |         joint_old = joint_new if runif <alfa else joint_old
296 |         accept_num = (accept_num + 1) if runif <alfa else accept_num
297 |         sto_theta = theta_old[theta_focus]
298 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num])
299 |     
300 |     
301 | 
302 | #initial value "init" (list of initial phi, initial theta, coordinates of location of interest, initial joint density)    
303 | init_phi = [1,1,1]    
304 | init = [[init_phi,[1]*num_location,list(x),joint_like(data,x,init_phi,[1]*num_location),1,0] for x in location[['x','y']].values] 
305 | 
306 | # redefine weighted likelihood function for a single sample, this function is used to calculate the likelihood value of samples from testing set (cross validation).
307 | def weight_like_s(data_slice,loc_int,phi,theta,h):
308 |     loc1=data_slice[0:2]
309 |     if(is_eucliDis == True):
310 |         dis = eucliDis(loc1,loc_int)
311 |     else:
312 |         dis = Haversine(loc1,loc_int)
313 |     kern = G_kernel(dis,h)
314 |     return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta))
315 |     
316 |     
317 | def GWR_update_new(model_info):
318 |     phi_old = model_info[0]
319 |     theta_old = model_info[1]
320 |     loc_int = model_info[2]
321 |     loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
322 |     subdata = data.drop(index_sel[loc_ind])
323 |     joint_old = model_info[3]
324 |     accept_num = model_info[5]
325 |     theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
326 |     phi_new = r_phi_new(phi_old,loc_int[0])
327 |     theta_new = list(map(r_theta,theta_old))
328 |     joint_new_phi = joint_like(data,loc_int,phi_new,theta_old)
329 |     rate_phi = joint_new_phi + d_phi_new(phi_old,phi_new,loc_int[0]) - joint_old - d_phi_new(phi_new,phi_old,loc_int[0]) 
330 |     alfa_phi = min(1,np.exp(rate_phi))
331 |     runif = np.random.uniform(0,1,1)[0]
332 |     phi_old = phi_new if runif < alfa_phi else phi_old
333 |     accept_num = (accept_num + 1) if runif <alfa_phi else accept_num
334 |     joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
335 |     joint_old_theta = theta_like(subdata,loc_ind,phi_old,theta_old)
336 |     rate_theta = joint_new_theta + d_theta(theta_old,theta_new) - joint_old_theta - d_theta(theta_new,theta_old)
337 |     alfa_theta = np.minimum(np.ones_like(rate_theta),np.exp(rate_theta))
338 |     runif = np.random.uniform(0,1,len(alfa_theta))
339 |     theta_pro = [theta_new[q] if runif[q] < alfa_theta[q] else theta_old[q] for q in range(num_location)]
340 |     theta_old = theta_pro
341 |     sto_theta = theta_old[theta_focus]
342 |     joint_old = joint_like(data,loc_int,phi_old,theta_old)
343 |     return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num])
344 | 
345 | 
346 | #MCMC updates of all location simultaneously (mignt be in parallel).
347 | def GWR_MCMC_multloc(init,num_iter,thin,burn_in):
348 |     sto_phi = np.zeros([(num_iter-burn_in)//thin,num_location,len(init[0][0])])     #store posterior samples of phi
349 |     sto_theta = np.zeros([(num_iter-burn_in)//thin,num_location,1])     #store posterior samples of theta
350 |     iter_param = init   
351 |     #loglik_sum = np.zeros(num_location)     #store the log-likelihood of testing set for all locations
352 |     #elpd = np.zeros(num_location)       #store the estimated elpd for all locations
353 |     for i in range(num_iter):
354 |         if( (i<=(burn_in-1)) | ((i+1) % thin !=0)):
355 |             if(is_para):
356 |                 iter_param = list(pool.map(GWR_update,iter_param))      #one step metropolis hasting update for all locations in parallel
357 |             else:
358 |                 iter_param = list(map(GWR_update,iter_param))       #one step metropolis hasting update for all locations
359 |             
360 |             #if((i+1) % thin == 0):
361 |                 #print('{0}% complete.'.format((i+1)*100/num_iter), flush=True)
362 |         else:
363 |             if(is_para):
364 |                 iter_param = list(pool.map(GWR_update_new,iter_param))      #one step metropolis hasting update for all locations in parallel
365 |             else:
366 |                 iter_param = list(map(GWR_update_new,iter_param))       #one step metropolis hasting update for all locations
367 |             accept_number = []
368 |             for j in range(num_location):       #calculate the elpd at each location by for loop
369 |                 #sub_log_lik = 0
370 |                 #loc_foc = list(location.loc[location['index']==j][['x','y']].values[0])
371 |                 #testing subdata
372 |                 #subdata=data.iloc[index_sel[j]]     #testing set at location j
373 |                 sto_phi[((i+1-burn_in)//thin) - 1][j] = iter_param[j][0]
374 |                 sto_theta[((i+1-burn_in)//thin) - 1][j] = iter_param[j][4] 
375 |                 accept_number.append(iter_param[j][5])
376 |                 #slice_like = lambda x: weight_like_s(x,loc_foc,iter_param[j][0],iter_param[j][4],h)     #define the likelihood function by using posterior phi and theta at location j of iteration i
377 |                 #sub_log_lik = sum( subdata.apply(slice_like,axis=1) )       #calculate the likelihood of testing set at location j by using posterior phi and theta at iteration i
378 |                 #loglik_sum[j] += sub_log_lik        #add likelihood of testing set at location j of iteration i to previous likelihood storation.
379 |                 #elpd[j] = loglik_sum[j]/((i+1-burn_in)//thin)       #elpd at location j = the mean of all likelihoods of testing set at location j before iteration i
380 |             #accept_rate = np.mean(np.array(accept_number)/i)        #calculate the mean acceptance rate
381 |             #elpd_mean = np.mean(elpd)       #mean of elpd across all locations
382 |             #print('{0}% complete. The ELPD is: {site}. The average acceptance rate is: {rate}'.format((i+1)*100/num_iter, site=elpd_mean, rate=accept_rate), flush=True)
383 |     result = {'phi':sto_phi,'theta':sto_theta}
384 |     return(result)
385 |     
386 |     
387 | time_one = datetime.now()
388 | if __name__ == '__main__':
389 |     pool = Pool(processes=num_core)
390 |     re=GWR_MCMC_multloc(init,13000,1,3000)
391 | time_two = datetime.now()
392 | 
393 | print(time_two-time_one)        #time used for MCMC updates
394 | 
395 | est_phi=sum(re['phi'])/re['phi'].shape[0]       #posterior estimation of phi (posterior mean)
396 | est_theta=sum(re['theta'])/re['theta'].shape[0]     ##posterior estimation of theta (posterior mean)
397 | 
398 | upper_phi=np.zeros_like(sum(re['phi']))
399 | median_phi=np.zeros_like(sum(re['phi']))
400 | lower_phi=np.zeros_like(sum(re['phi']))
401 | for i in range(num_location):
402 |     store_phi = []
403 |     for j in range(re['phi'].shape[0]):
404 |         store_phi.append(re['phi'][j][i])
405 |     store_phi = np.array(pd.DataFrame(store_phi).T)
406 |     upper_phi[i] = np.quantile(store_phi,q=0.975,axis=1)
407 |     median_phi[i] = np.quantile(store_phi,q=0.5,axis=1)
408 |     lower_phi[i] = np.quantile(store_phi,q=0.025,axis=1)
409 |     
410 | 
411 | for m in range(num_location):
412 |     trace = np.zeros(shape=[re['phi'].shape[0],re['phi'].shape[2]+re['theta'].shape[2]]) 
413 |     for n in range(re['phi'].shape[0]):
414 |         trace[n][0:re['phi'].shape[2]] = re['phi'][n][m]
415 |         trace[n][re['phi'].shape[2]:] = re['theta'][n][m]
416 |     np.savetxt('trace'+str(m)+'rep'+str(rep_id)+'.csv',trace,delimiter=',')
417 | 
418 | #print(est_phi)
419 | #np.savetxt("est_phi"+str(rep_id)+".csv", est_phi, delimiter=",")
420 | #np.savetxt("upper_phi"+str(rep_id)+".csv", upper_phi, delimiter=",")
421 | #np.savetxt("median_phi"+str(rep_id)+".csv", median_phi, delimiter=",")
422 | #np.savetxt("lower_phi"+str(rep_id)+".csv", lower_phi, delimiter=",")
423 | #print(est_theta)
424 | #np.savetxt("est_theta"+str(rep_id)+".csv", est_theta, delimiter=",")
425 | #print([h,re['ELPD']])
426 | 


--------------------------------------------------------------------------------
/Application-to-real-dataset/BandwidthSelection.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Dec 19 14:08:10 2020
  4 | 
  5 | @author: Yang Liu
  6 | """
  7 | 
  8 | 
  9 | import numpy as np
 10 | import os
 11 | import pandas as pd
 12 | from scipy.special import loggamma
 13 | import scipy.stats
 14 | from datetime import datetime
 15 | from multiprocessing import Pool
 16 | 
 17 | file_id = int(os.getenv('SLURM_ARRAY_TASK_ID'))
 18 | #print([task_id,type(task_id)],flush=True)
 19 | 
 20 | task_id = (file_id-1)//15
 21 | rep_id = file_id%15
 22 | is_para = True    #Using parallel computing？
 23 | num_core = 10     #number of cores
 24 | fitting_ratio = 0.5    #the proportion of samples used for fitting at the location of interest (i.e., splitting samples at location of interest into fitting set and testing set).
 25 | is_eucliDis = False      #Using Euclidian distance?
 26 | is_block = True        #block sampling?
 27 | 
 28 | 
 29 | 
 30 | # set multiple cores
 31 | #pool = ThreadPool(4)
 32 | 
 33 | #geographical kernel bandwidth
 34 | h = [100,500,1000,2000,3000,5000,7000,10000,20000][task_id]
 35 | 
 36 | #Geographically weighted kernel (exponential kernel)
 37 | def G_kernel(d,h):
 38 |     return(np.exp(-d**2/h**2))
 39 |     
 40 | #euclidean distance
 41 | def eucliDis(A,B):
 42 |     A = np.array(A)
 43 |     B = np.array(B)
 44 |     return np.sqrt(sum((A-B)**2))
 45 | 
 46 | #spherical distance (measured in KM)
 47 | def Haversine(A,B):
 48 |     """
 49 |     This uses the ‘haversine’ formula to calculate the great-circle distance between two points – that is, 
 50 |     the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points 
 51 |     (ignoring any hills they fly over, of course!).
 52 |     Haversine
 53 |     formula:    a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
 54 |     c = 2 ⋅ atan2( √a, √(1−a) )
 55 |     d = R ⋅ c
 56 |     where   φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km);
 57 |     note that angles need to be in radians to pass to trig functions!
 58 |     """
 59 |     lat1,lon1,lat2,lon2 = A[0],A[1],B[0],B[1]
 60 |     
 61 |     R = 6378.0088
 62 |     lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2])
 63 | 
 64 |     dlat = lat2 - lat1
 65 |     dlon = lon2 - lon1
 66 |     a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2
 67 |     c = 2 * np.arctan2(a**0.5, (1-a)**0.5)
 68 |     d = R * c
 69 |     return round(d,4)
 70 | 
 71 | #log-likelihood of negative binomial distribution
 72 | def negBion(outcome,offset,covariate,phi,theta):
 73 |     mean = np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi)))
 74 |     result = loggamma(outcome+1/theta)-loggamma(1/theta)-loggamma(outcome+1)-(1/theta)*np.log(1+theta*mean)+outcome*np.log(theta*mean/(1+theta*mean))
 75 |     return result
 76 | 
 77 | #log-prior for phi (uniform) 
 78 | def prior_phi(phi):
 79 |     if max(abs(np.array(phi)))>1000:
 80 |         return np.log(0)
 81 |     else:
 82 |         return np.log(1/2000)*len(phi)
 83 |     
 84 | #log-prior for theta (uniform 0,1000)
 85 | def prior_theta(theta):
 86 |     if theta>1000 or theta<=0:
 87 |         return np.log(0)
 88 |     else:
 89 |         return np.log(1/1000)
 90 |     
 91 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi)
 92 | #EURO
 93 | pro_st = np.array([[ 1,0,0],
 94 |        [0,  0.1, 0],
 95 |        [0 , 0,  0.01]])
 96 |  
 97 | #SouthEast    
 98 | pro_st2 = np.array([[ 9.93206712e-01, -3.78288045e-02, -1.04242828e-05],
 99 |        [-3.78288045e-02,  6.41566397e-03, -3.39223226e-04],
100 |        [-1.04242828e-05, -3.39223226e-04,  3.57276820e-04]])
101 | 
102 | 
103 | #two step adatpive proposal sd:
104 | #aggressive proposal sd for phi to approximate true value before burn_in
105 | pro_early = [np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)]
106 | #mild proposal sd for phi to achieve a good mixture after burn_in
107 | pro_later1 = [np.dot(pro_st,pro_st),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)]
108 | pro_later2 = [np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)]
109 | 
110 |  
111 | #aggressive proposal sd for theta to approximate true value before burn_in 
112 | pro_theta_early = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25]
113 | #mild proposal sd for theta to achieve a good mixture after burn_in
114 | pro_theta_later = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25]
115 | 
116 | #speccialized with latitude
117 | #proposal sampling function for phi (multivariate normal)
118 | def r_phi(phi):
119 |     phi = np.array(phi)
120 |     phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id-1]).rvs(1)
121 |     return phi_n
122 | #proposal density function for phi
123 | def d_phi(phi_n,phi):
124 |     return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id-1]).pdf(phi_n))
125 | 
126 | def r_phi_new(phi,lat):
127 |     if(lat>20):
128 |         phi = np.array(phi)
129 |         phi_n = scipy.stats.multivariate_normal(phi,pro_later1[task_id-1]).rvs(1)
130 |         return phi_n
131 |     else:
132 |         phi = np.array(phi)
133 |         phi_n = scipy.stats.multivariate_normal(phi,pro_later2[task_id-1]).rvs(1)
134 |         return phi_n
135 | #proposal density function for phi
136 | def d_phi_new(phi_n,phi,lat):
137 |     if(lat>20):
138 |         return np.log(scipy.stats.multivariate_normal(phi,pro_later1[task_id-1]).pdf(phi_n))
139 |     else:
140 |         return np.log(scipy.stats.multivariate_normal(phi,pro_later2[task_id-1]).pdf(phi_n))
141 | 
142 | #proposal sampling function for theta (truncated normal)
143 | def r_theta(theta):
144 |     lower, upper, sd = 0, 1000, pro_theta_early[task_id-1]
145 |     X = scipy.stats.truncnorm(
146 |           (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd)
147 |     return float(X.rvs(size=1))
148 | #proposal density function for theta
149 | def d_theta(theta_n,theta):
150 |     theta_n = np.array(theta_n)
151 |     theta = np.array(theta)
152 |     lower, upper, sd = 0, 1000, pro_theta_early[task_id-1]
153 |     X = scipy.stats.truncnorm(
154 |           (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd)
155 |     return sum(np.log(X.pdf(theta_n)))
156 | 
157 | 
158 | #import data from file
159 | data = pd.read_csv('EpiData(test).csv',encoding='utf-8',header=0)
160 | #extract coordinates of locations
161 | location = data[['x','y']].drop_duplicates(subset=['x','y'])
162 | #number of locations
163 | num_location = location.shape[0]
164 | #add main key to location table
165 | location['index']=range(num_location)
166 | 
167 | #randomly select fitting subdata for each location of interest (cross validation)
168 | index_sel = []
169 | 
170 | for k in range(num_location):
171 |     loc_foc = location.values[k][0:2]
172 |     index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index
173 |     num_sel = int(len(index_loc)*fitting_ratio)
174 |     index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False)))
175 | 
176 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight
177 | def kernel_weight(data_slice,loc_int,h):
178 |     loc1=data_slice[0:2]
179 |     if(is_eucliDis == True):
180 |         dis = eucliDis(loc1,loc_int)
181 |     else:
182 |         dis = Haversine(loc1,loc_int)
183 |     kern = G_kernel(dis,h)
184 |     return(kern)
185 | 
186 | #geographical weighting kernel matrix
187 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample.
188 | weight = []
189 | theta_rep_num = np.zeros(shape=[num_location,num_location])
190 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)])
191 | for i in range(num_location):
192 |     loc_int_inner = location.values[i][0:2]
193 |     slice_weight = lambda x: kernel_weight(x,loc_int_inner,h)
194 |     weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values))))
195 |     for j in range(num_location):
196 |         theta_rep_num[i][j] = np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum()
197 |     theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))])
198 | 
199 | theta_slice_ind = theta_slice_ind.astype(int)        
200 | weight = np.array(weight)        
201 | weight = weight + weight.T - np.eye(num_location)
202 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)]
203 | 
204 | # joint log-density of negative binomial likelihood and prior given a certain location of interest.   Vectorization for fast calculation
205 | def joint_like(data,loc_int,phi,theta):
206 |     loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
207 |     theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
208 |     outcome = np.array(data.drop(index_sel[loc_ind])['outcome'])
209 |     offset = np.array(data.drop(index_sel[loc_ind])['offset'])
210 |     covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:])
211 |     mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
212 |     result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))))
213 |     result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
214 |     return(result)
215 | 
216 | def theta_like(subdata,loc_ind,phi,theta):
217 |     theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
218 |     outcome = np.array(subdata['outcome'])
219 |     offset = np.array(subdata['offset'])
220 |     covariate = np.array(subdata.iloc[:, 4:])
221 |     mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
222 |     theta_like_value = geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))
223 |     result = np.array([sum(theta_like_value[theta_slice_ind[loc_ind][k]:theta_slice_ind[loc_ind][k+1]]) for k in range(num_location)])
224 |     result += np.array(list(map(prior_theta, theta)))
225 |     return(result)
226 |     
227 |     
228 | 
229 | #old code (discarded)
230 | #def weight_like(data_slice,loc_int,phi,theta,h):
231 | #    loc1=data_slice[0:2]
232 | #    theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index'])
233 | #    dis = eucliDis(loc1,loc_int)
234 | #    kern = G_kernel(dis,h)
235 | #    return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind]))
236 | 
237 | 
238 | #def joint_like(data,loc_int,phi,theta,h):
239 | #    slice_like = lambda x: weight_like(x,loc_int,phi,theta,h)
240 | #    #result = sum( data.apply(slice_like,axis=1) )
241 | #    result = sum(map(slice_like,data.values))
242 | #    result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
243 | #    return(result)
244 |   
245 | #init=[[1,1],[1]*num_location]
246 |     
247 | 
248 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals),
249 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info".
250 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. See following function "GWR_MCMC_multloc"
251 | if is_block:
252 |     def GWR_update(model_info):
253 |         phi_old = model_info[0]
254 |         theta_old = model_info[1]
255 |         loc_int = model_info[2]
256 |         loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
257 |         subdata = data.drop(index_sel[loc_ind])
258 |         joint_old = model_info[3]
259 |         accept_num = model_info[5]
260 |         theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
261 |         phi_new = r_phi(phi_old)
262 |         theta_new = list(map(r_theta,theta_old))
263 |         joint_new_phi = joint_like(data,loc_int,phi_new,theta_old)
264 |         rate_phi = joint_new_phi + d_phi(phi_old,phi_new) - joint_old - d_phi(phi_new,phi_old) 
265 |         alfa_phi = min(1,np.exp(rate_phi))
266 |         runif = np.random.uniform(0,1,1)[0]
267 |         phi_old = phi_new if runif < alfa_phi else phi_old
268 |         accept_num = (accept_num + 1) if runif <alfa_phi else accept_num
269 |         joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
270 |         joint_old_theta = theta_like(subdata,loc_ind,phi_old,theta_old)
271 |         rate_theta = joint_new_theta + d_theta(theta_old,theta_new) - joint_old_theta - d_theta(theta_new,theta_old)
272 |         alfa_theta = np.minimum(np.ones_like(rate_theta),np.exp(rate_theta))
273 |         runif = np.random.uniform(0,1,len(alfa_theta))
274 |         theta_pro = [theta_new[q] if runif[q] < alfa_theta[q] else theta_old[q] for q in range(num_location)]
275 |         theta_old = theta_pro
276 |         sto_theta = theta_old[theta_focus]
277 |         joint_old = joint_like(data,loc_int,phi_old,theta_old)
278 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num])
279 | else:
280 |     def GWR_update(model_info):
281 |         phi_old = model_info[0]
282 |         theta_old = model_info[1]
283 |         loc_int = model_info[2]
284 |         joint_old = model_info[3]
285 |         accept_num = model_info[5]
286 |         theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
287 |         phi_new = r_phi(phi_old)
288 |         theta_new = list(map(r_theta,theta_old))
289 |         joint_new = joint_like(data,loc_int,phi_new,theta_new)
290 |         rate = joint_new + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new,theta_old)
291 |         alfa = min(1,np.exp(rate))
292 |         runif = np.random.uniform(0,1,1)[0]
293 |         phi_old = phi_new if runif < alfa else phi_old
294 |         theta_old = theta_new if runif <alfa else theta_old
295 |         joint_old = joint_new if runif <alfa else joint_old
296 |         accept_num = (accept_num + 1) if runif <alfa else accept_num
297 |         sto_theta = theta_old[theta_focus]
298 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num])
299 |     
300 |     
301 | 
302 | #initial value "init" (list of initial phi, initial theta, coordinates of location of interest, initial joint density)    
303 | init_phi = [1,1,1]    
304 | init = [[init_phi,[1]*num_location,list(x),joint_like(data,x,init_phi,[1]*num_location),1,0] for x in location[['x','y']].values] 
305 | 
306 | # redefine weighted likelihood function for a single sample, this function is used to calculate the likelihood value of samples from testing set (cross validation).
307 | def weight_like_s(data_slice,loc_int,phi,theta,h):
308 |     loc1=data_slice[0:2]
309 |     if(is_eucliDis == True):
310 |         dis = eucliDis(loc1,loc_int)
311 |     else:
312 |         dis = Haversine(loc1,loc_int)
313 |     kern = G_kernel(dis,h)
314 |     return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta))
315 |     
316 |     
317 | def GWR_update_new(model_info):
318 |     phi_old = model_info[0]
319 |     theta_old = model_info[1]
320 |     loc_int = model_info[2]
321 |     loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
322 |     subdata = data.drop(index_sel[loc_ind])
323 |     joint_old = model_info[3]
324 |     accept_num = model_info[5]
325 |     theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
326 |     phi_new = r_phi_new(phi_old,loc_int[0])
327 |     theta_new = list(map(r_theta,theta_old))
328 |     joint_new_phi = joint_like(data,loc_int,phi_new,theta_old)
329 |     rate_phi = joint_new_phi + d_phi_new(phi_old,phi_new,loc_int[0]) - joint_old - d_phi_new(phi_new,phi_old,loc_int[0]) 
330 |     alfa_phi = min(1,np.exp(rate_phi))
331 |     runif = np.random.uniform(0,1,1)[0]
332 |     phi_old = phi_new if runif < alfa_phi else phi_old
333 |     accept_num = (accept_num + 1) if runif <alfa_phi else accept_num
334 |     joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
335 |     joint_old_theta = theta_like(subdata,loc_ind,phi_old,theta_old)
336 |     rate_theta = joint_new_theta + d_theta(theta_old,theta_new) - joint_old_theta - d_theta(theta_new,theta_old)
337 |     alfa_theta = np.minimum(np.ones_like(rate_theta),np.exp(rate_theta))
338 |     runif = np.random.uniform(0,1,len(alfa_theta))
339 |     theta_pro = [theta_new[q] if runif[q] < alfa_theta[q] else theta_old[q] for q in range(num_location)]
340 |     theta_old = theta_pro
341 |     sto_theta = theta_old[theta_focus]
342 |     joint_old = joint_like(data,loc_int,phi_old,theta_old)
343 |     return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num])
344 | 
345 | 
346 | #MCMC updates of all location simultaneously (mignt be in parallel).
347 | def GWR_MCMC_multloc(init,num_iter,thin,burn_in):
348 |     sto_phi = np.zeros([(num_iter-burn_in)//thin,num_location,len(init[0][0])])     #store posterior samples of phi
349 |     sto_theta = np.zeros([(num_iter-burn_in)//thin,num_location,1])     #store posterior samples of theta
350 |     iter_param = init   
351 |     loglik_sum = np.zeros(num_location)     #store the log-likelihood of testing set for all locations
352 |     elpd = np.zeros(num_location)       #store the estimated elpd for all locations
353 |     for i in range(num_iter):
354 |         if( (i<=(burn_in-1)) | ((i+1) % thin !=0)):
355 |             if(is_para):
356 |                 iter_param = list(pool.map(GWR_update,iter_param))      #one step metropolis hasting update for all locations in parallel
357 |             else:
358 |                 iter_param = list(map(GWR_update,iter_param))       #one step metropolis hasting update for all locations
359 |             
360 |             #if((i+1) % thin == 0):
361 |                 #print('{0}% complete.'.format((i+1)*100/num_iter), flush=True)
362 |         else:
363 |             if(is_para):
364 |                 iter_param = list(pool.map(GWR_update_new,iter_param))      #one step metropolis hasting update for all locations in parallel
365 |             else:
366 |                 iter_param = list(map(GWR_update_new,iter_param))       #one step metropolis hasting update for all locations
367 |             accept_number = []
368 |             for j in range(num_location):       #calculate the elpd at each location by for loop
369 |                 sub_log_lik = 0
370 |                 loc_foc = list(location.loc[location['index']==j][['x','y']].values[0])
371 |                 #testing subdata
372 |                 subdata=data.iloc[index_sel[j]]     #testing set at location j
373 |                 sto_phi[((i+1-burn_in)//thin) - 1][j] = iter_param[j][0]
374 |                 sto_theta[((i+1-burn_in)//thin) - 1][j] = iter_param[j][4] 
375 |                 accept_number.append(iter_param[j][5])
376 |                 slice_like = lambda x: weight_like_s(x,loc_foc,iter_param[j][0],iter_param[j][4],h)     #define the likelihood function by using posterior phi and theta at location j of iteration i
377 |                 sub_log_lik = sum( subdata.apply(slice_like,axis=1) )       #calculate the likelihood of testing set at location j by using posterior phi and theta at iteration i
378 |                 loglik_sum[j] += sub_log_lik        #add likelihood of testing set at location j of iteration i to previous likelihood storation.
379 |                 elpd[j] = loglik_sum[j]/((i+1-burn_in)//thin)       #elpd at location j = the mean of all likelihoods of testing set at location j before iteration i
380 |             accept_rate = np.mean(np.array(accept_number)/i)        #calculate the mean acceptance rate
381 |             elpd_mean = np.mean(elpd)       #mean of elpd across all locations
382 |             #print('{0}% complete. The ELPD is: {site}. The average acceptance rate is: {rate}'.format((i+1)*100/num_iter, site=elpd_mean, rate=accept_rate), flush=True)
383 |     result = {'phi':sto_phi,'theta':sto_theta,'ELPD':elpd_mean}
384 |     return(result)
385 |     
386 |     
387 | time_one = datetime.now()
388 | if __name__ == '__main__':
389 |     pool = Pool(processes=num_core)
390 |     re=GWR_MCMC_multloc(init,13000,1,3000)
391 | time_two = datetime.now()
392 | 
393 | #print(time_two-time_one)        #time used for MCMC updates
394 | 
395 | est_phi=sum(re['phi'])/re['phi'].shape[0]       #posterior estimation of phi (posterior mean)
396 | est_theta=sum(re['theta'])/re['theta'].shape[0]     ##posterior estimation of theta (posterior mean)
397 | 
398 | trace = np.zeros(shape=[re['phi'].shape[0],re['phi'].shape[2]+re['theta'].shape[2]])        #trace record of posterior samples (central location only)
399 | for k in range(re['phi'].shape[0]):
400 |     trace[k][0:re['phi'].shape[2]] = re['phi'][k][re['phi'][0].shape[0]//2]
401 |     trace[k][re['phi'].shape[2]:] = re['theta'][k][re['theta'][0].shape[0]//2]
402 | #np.savetxt('trace'+str(h)+'.csv',trace,delimiter=',')
403 | 
404 | #print(est_phi)
405 | #np.savetxt("est_phi"+str(h)+".csv", est_phi, delimiter=",")
406 | #print(est_theta)
407 | #np.savetxt("est_theta"+str(h)+".csv", est_theta, delimiter=",")
408 | print([h,re['ELPD']])
409 | 


--------------------------------------------------------------------------------
/Application-to-real-dataset/EpiData.csv:
--------------------------------------------------------------------------------
 1 | 15,100,142,725,1,25.1,33.4
 2 | 15,100,284,905,1,27,14.6
 3 | 15,100,137,645,1,28.3,29.3
 4 | 15,100,30,510,1,30.3,66.5
 5 | 15,100,30,496,1,30.1,136.3
 6 | 15,100,16,401,1,28.8,213.3
 7 | 15,100,116,542,1,27.9,228.4
 8 | 15,100,254,720,1,27.1,301.9
 9 | 15,100,413,982,1,27.3,265.9
10 | 15,100,329,891,1,26.4,223.2
11 | 15,100,53,400,1,25.3,74.6
12 | 15,100,18,296,1,24.5,63.3
13 | 15,100,27,342,1,23.2,32.9
14 | 15,100,32,274,1,25.7,17
15 | 15,100,12,271,1,25.6,86.4
16 | 15,100,18,261,1,27.8,124.8
17 | 15,100,13,256,1,27.9,219.8
18 | 15,100,23,365,1,27.6,243.5
19 | 15,100,155,505,1,27.3,300.3
20 | 15,100,221,494,1,27,338.4
21 | 15,100,246,514,1,26.8,289.9
22 | 15,100,42,205,1,26.4,217
23 | 15,100,35,172,1,25.8,73.5
24 | 15,100,80,378,1,23.5,48
25 | 15,100,29,227,1,24.7,68.3
26 | 15,100,64,326,1,26.4,29.5
27 | 15,100,57,368,1,28,54.9
28 | 15,100,26,226,1,29.1,102.1
29 | 15,100,18,216,1,28.4,211.8
30 | 15,100,47,604,1,27.9,154.4
31 | 15,100,106,454,1,27.4,193.6
32 | 15,100,113,384,1,27.4,201.3
33 | 15,100,190,537,1,27.1,274.6
34 | 15,100,99,341,1,27.2,140.2
35 | 15,100,93,436,1,26.8,143.2
36 | 15,100,66,250,1,25.6,50.9
37 | 15,100,113,427,1,24.5,43
38 | 15,100,77,371,1,27.1,23.4
39 | 15,100,56,358,1,28.3,36.5
40 | 15,100,18,251,1,29.7,88.9
41 | 15,100,20,376,1,29.1,143.8
42 | 15,100,23,375,1,27.8,223.8
43 | 15,100,32,302,1,27.2,266.6
44 | 15,100,67,489,1,27.4,275.2
45 | 15,100,54,406,1,26.8,312.1
46 | 15,100,71,471,1,26.3,222.7
47 | 15,100,75,327,1,26,109.9
48 | 15,100,51,299,1,22,23.9
49 | 15,100,85,385,1,22.4,8.7
50 | 15,100,171,486,1,25.5,3.6
51 | 15,100,183,518,1,28.2,33.8
52 | 15,100,76,378,1,29.3,61.6
53 | 15,100,99,350,1,29.2,158.4
54 | 15,100,54,317,1,28.4,202.6
55 | 15,100,111,396,1,27.7,217.3
56 | 15,100,60,333,1,27.4,282.3
57 | 15,100,35,245,1,27.3,207.2
58 | 15,100,19,140,1,27.1,145.9
59 | 15,100,19,84,1,26.4,108.1
60 | 15,100,20,101,1,24.2,54.7
61 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bayesian geographically weighted regression
 2 | This is the Python code to conduct Bayesian geographically weighted regression proposed in the paper "Generalized Geographically Weighted Regression Model within a Modularized Bayesian Framework". It includes code to replicate the results presented in section "Simulation" and "Application to real dataset".
 3 | 
 4 | # Authors
 5 | Yang Liu and Robert Goudie
 6 | 
 7 | MRC Biostatistics Unit, University of Cambridge
 8 | 
 9 | ## Reference
10 | 
11 | The paper can be accessed via:
12 | 
13 | Liu, Yang and Robert J.B. Goudie. “Generalized Geographically Weighted Regression Model within a Modularized Bayesian Framework.” (2021). [	arXiv:2106.00996][arXiv] 	
14 | 
15 | # Abstract of the proposed model
16 | Geographically weighted regression (GWR) models handle geographical dependence through a spatially varying coefficient model and have been widely used in applied science, but its general Bayesian extension is unclear because it involves a weighted log-likelihood which does not imply a probability distribution on data. We present a Bayesian GWR model and show that its essence is dealing with partial misspecification of the model. Current modularized Bayesian inference models accommodate partial misspecification from a single component of the model. We extend these models to handle partial misspecification in more than one component of the model, as required for our Bayesian GWR model. Information from the various spatial locations is manipulated via a geographically weighted kernel and the optimal manipulation is chosen according to a Kullback–Leibler (KL) divergence. We justify the model via an information risk minimization approach and show the consistency of the proposed estimator in terms of a geographically weighted KL divergence.
17 | 


--------------------------------------------------------------------------------
/Simulation/BGWR.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Feb 19 14:08:10 2022
  4 | 
  5 | @author: Yang Liu
  6 | """
  7 | 
  8 | 
  9 | import numpy as np
 10 | import os
 11 | import pandas as pd
 12 | from scipy.special import loggamma
 13 | import scipy.stats
 14 | from datetime import datetime
 15 | from multiprocessing import Pool
 16 | 
 17 | file_id = int(os.getenv('SLURM_ARRAY_TASK_ID'))
 18 | #print([task_id,type(task_id)],flush=True)
 19 | 
 20 | task_id = (file_id-1)//10
 21 | rep_id = file_id%10
 22 | is_para = True    #Using parallel computing？
 23 | num_core = 10     #number of cores
 24 | fitting_ratio = 0    #the proportion of samples used for testing at the location of interest (i.e., splitting samples at location of interest into fitting set and testing set).
 25 | is_eucliDis = True      #Using Euclidian distance?
 26 | is_block = True        #block sampling?
 27 | 
 28 | # set multiple cores
 29 | #pool = ThreadPool(4)
 30 | 
 31 | 
 32 | #geographical kernel bandwidth
 33 | h = [0.0001,4,1000][task_id]
 34 | 
 35 | print('h is {h}'.format(h=h), flush=True)
 36 | 
 37 | # set multiple cores
 38 | #pool = ThreadPool(4)
 39 | 
 40 | #Geographically weighted kernel (exponential kernel)
 41 | def G_kernel(d,h):
 42 |     return(np.exp(-d**2/h**2))
 43 |     
 44 | #euclidean distance
 45 | def eucliDis(A,B):
 46 |     A = np.array(A)
 47 |     B = np.array(B)
 48 |     return np.sqrt(sum((A-B)**2))
 49 | 
 50 | #spherical distance (measured in KM)
 51 | def Haversine(A,B):
 52 |     """
 53 |     This uses the ‘haversine’ formula to calculate the great-circle distance between two points – that is, 
 54 |     the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points 
 55 |     (ignoring any hills they fly over, of course!).
 56 |     Haversine
 57 |     formula:    a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
 58 |     c = 2 ⋅ atan2( √a, √(1−a) )
 59 |     d = R ⋅ c
 60 |     where   φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km);
 61 |     note that angles need to be in radians to pass to trig functions!
 62 |     """
 63 |     lat1,lon1,lat2,lon2 = A[0],A[1],B[0],B[1]
 64 |     
 65 |     R = 6378.0088
 66 |     lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2])
 67 | 
 68 |     dlat = lat2 - lat1
 69 |     dlon = lon2 - lon1
 70 |     a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2
 71 |     c = 2 * np.arctan2(a**0.5, (1-a)**0.5)
 72 |     d = R * c
 73 |     return round(d,4)
 74 | 
 75 | #log-likelihood of negative binomial distribution
 76 | def negBion(outcome,offset,covariate,phi,theta):
 77 |     mean = np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi)))
 78 |     result = loggamma(outcome+1/theta)-loggamma(1/theta)-loggamma(outcome+1)-(1/theta)*np.log(1+theta*mean)+outcome*np.log(theta*mean/(1+theta*mean))
 79 |     return result
 80 | 
 81 | #log-prior for phi (uniform) 
 82 | def prior_phi(phi):
 83 |     if max(abs(np.array(phi)))>1000:
 84 |         return np.log(0)
 85 |     else:
 86 |         return np.log(1/2000)*len(phi)
 87 |     
 88 | #log-prior for theta (uniform 0,1000)
 89 | def prior_theta(theta):
 90 |     if theta>1000 or theta<=0:
 91 |         return np.log(0)
 92 |     else:
 93 |         return np.log(1/1000)
 94 |     
 95 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi)
 96 | #pro_st = np.array([[ 0.19800107, -0.03503183, -0.0323151],
 97 | #       [-0.03503183,  0.03827607, -0.00078348],
 98 | #       [-0.0323151 , -0.00078348,  0.00733836]])
 99 | 
100 | pro_st = np.array([[ 0.199212513, -0.00319569314, -0.037421612],
101 |        [-0.00319569314, 0.0079491569, 0.000190719635],
102 |        [-0.037421612, 0.000190719635, 0.00757105334]])
103 |     
104 | #two step adatpive proposal sd:
105 | #aggressive proposal sd for phi to approximate true value before burn_in
106 | pro_early = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)]
107 | #mild proposal sd for phi to achieve a good mixture after burn_in
108 | pro_later = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10)]
109 |  
110 | #aggressive proposal sd for theta to approximate true value before burn_in 
111 | pro_theta_early = [0.05,0.03,0.03,0.03,0.015,0.015,0.005,0.004,0.004,0.004]
112 | #mild proposal sd for theta to achieve a good mixture after burn_in
113 | pro_theta_later = [0.05,0.015,0.015,0.015,0.005,0.005,0.005,0.004,0.004,0.004]
114 | 
115 | #proposal sampling function for phi (multivariate normal)
116 | def r_phi(phi):
117 |     phi = np.array(phi)
118 |     phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id]).rvs(1)
119 |     return phi_n
120 | #proposal density function for phi
121 | def d_phi(phi_n,phi):
122 |     return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id]).pdf(phi_n))
123 | 
124 | #proposal sampling function for phi (multivariate normal)
125 | def r_phi_new(phi):
126 |     phi = np.array(phi)
127 |     phi_n = scipy.stats.multivariate_normal(phi,pro_later[task_id]).rvs(1)
128 |     return phi_n
129 | #proposal density function for phi
130 | def d_phi_new(phi_n,phi):
131 |     return np.log(scipy.stats.multivariate_normal(phi,pro_later[task_id]).pdf(phi_n))
132 | 
133 | #proposal sampling function for theta (truncated normal)
134 | def r_theta(theta):
135 |     sd = pro_theta_early[task_id]
136 |     out = np.random.normal(theta, sd, len(theta))
137 |     out[out<=0] = 0.001
138 |     return out
139 | 
140 | #proposal density function for theta
141 | def d_theta(theta_n,theta):
142 |     sd =  pro_theta_early[task_id]
143 |     return sum(scipy.stats.norm.pdf(theta_n, theta, sd))
144 | 
145 | #proposal sampling function for theta (truncated normal)
146 | def r_theta_new(theta):
147 |     sd = pro_theta_later[task_id]
148 |     out = np.random.normal(theta, sd, len(theta))
149 |     out[out<=0] = 0.001
150 |     return out
151 | 
152 | #proposal density function for theta
153 | def d_theta_new(theta_n,theta):
154 |     sd =  pro_theta_later[task_id]
155 |     return sum(scipy.stats.norm.pdf(theta_n, theta, sd))
156 | 
157 | 
158 | 
159 | #import data from file
160 | data = pd.read_csv('simulateDate.csv',encoding='utf-8',header=0)
161 | #extract coordinates of locations
162 | location = data[['x','y']].drop_duplicates(subset=['x','y'])
163 | #number of locations
164 | num_location = location.shape[0]
165 | #add main key to location table
166 | location['index']=range(num_location)
167 | 
168 | #randomly select fitting subdata for each location of interest (cross validation)
169 | index_sel = []
170 | 
171 | for k in range(num_location):
172 |     loc_foc = location.values[k][0:2]
173 |     index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index
174 |     num_sel = int(len(index_loc)*fitting_ratio)
175 |     index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False)))
176 |     
177 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight
178 | def kernel_weight(data_slice,loc_int,h):
179 |     loc1=data_slice[0:2]
180 |     if(is_eucliDis == True):
181 |         dis = eucliDis(loc1,loc_int)
182 |     else:
183 |         dis = Haversine(loc1,loc_int)
184 |     kern = G_kernel(dis,h)
185 |     return(kern)
186 | 
187 | #geographical weighting kernel matrix
188 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample.
189 | weight = []
190 | theta_rep_num = np.zeros(shape=[num_location,num_location])
191 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)])
192 | 
193 | theta_slice_ind_full = [np.equal(data.iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum() for i in range(num_location)]
194 | theta_slice_ind_drop = []
195 | for i in range(num_location):
196 |     theta_slice_ind_drop.append(np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum())
197 | for i in range(num_location):
198 |     theta_rep_num[i] = theta_slice_ind_full
199 |     theta_rep_num[i][i] = theta_slice_ind_drop[i]
200 | 
201 | for i in range(num_location):
202 |     loc_int_inner = location.values[i][0:2]
203 |     slice_weight = lambda x: kernel_weight(x,loc_int_inner,h)
204 |     weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values))))
205 |     theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))])
206 | 
207 | 
208 | theta_slice_ind = theta_slice_ind.astype(int)        
209 | weight = np.array(weight)        
210 | weight = weight + weight.T - np.eye(num_location)
211 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)]
212 | 
213 | 
214 | #data index that is omitted due to very low weight:
215 | minimum_threshold = 10**(-1)
216 | drop_set = []
217 | theta_num = np.zeros(num_location)
218 | for j in range(num_location):
219 |     theta_num[j] = np.equal(data.iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum()
220 | for i in range(num_location):
221 |     set_one = index_sel[i]
222 |     full_geo_weight = np.repeat(weight[i],theta_num.astype(int)) 
223 |     set_two = np.where(full_geo_weight<minimum_threshold)[0]
224 |     drop_set.append(np.array(list(set(set_one).union(set(set_two)))))
225 | 
226 | geo_weight_cut = []
227 | for i in range(num_location):
228 |     if(len(drop_set[i])==0):
229 |         geo_weight_cut.append(np.repeat(weight[i],theta_num.astype(int)))
230 |     else:
231 |         geo_weight_cut.append(np.delete(np.repeat(weight[i],theta_num.astype(int)),drop_set[i]))
232 | 
233 | 
234 | 
235 | print("geo_weight has been calculated")
236 | 
237 | # joint log-density of negative binomial likelihood and prior given a certain location of interest.   Vectorization for fast calculation
238 | def joint_like(data,loc_ind,phi,theta,if_cut):
239 |     if(if_cut == True):
240 |         theta_expand = np.repeat(np.array(theta),theta_num[loc_ind].astype(int)) 
241 |         if(len(drop_set[loc_ind])>0):
242 |             theta_expand = np.delete(theta_expand,drop_set[loc_ind]) 
243 |         outcome = np.array(data.drop(drop_set[loc_ind])['outcome'])
244 |         offset = np.array(data.drop(drop_set[loc_ind])['offset'])
245 |         covariate = np.array(data.drop(drop_set[loc_ind]).iloc[:, 4:])
246 |         mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
247 |         result = sum(geo_weight_cut[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))))
248 |         result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
249 |         return(result)
250 |     else:
251 |         theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
252 |         outcome = np.array(data.drop(index_sel[loc_ind])['outcome'])
253 |         offset = np.array(data.drop(index_sel[loc_ind])['offset'])
254 |         covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:])
255 |         mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
256 |         result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))))
257 |         result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
258 |         return(result)
259 | 
260 | 
261 | 
262 | def theta_like(subdata,loc_ind,phi,theta):
263 |     theta_expand = np.repeat(theta,theta_rep_num[loc_ind][loc_ind].astype(int))   
264 |     outcome = np.array(subdata['outcome'])
265 |     offset = np.array(subdata['offset'])
266 |     covariate = np.array(subdata.iloc[:, 4:])
267 |     mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
268 |     result = sum(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))
269 |     result += prior_theta(theta)
270 |     return(result)
271 |   
272 |     
273 | 
274 | #old code (discarded)
275 | #def weight_like(data_slice,loc_int,phi,theta,h):
276 | #    loc1=data_slice[0:2]
277 | #    theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index'])
278 | #    dis = eucliDis(loc1,loc_int)
279 | #    kern = G_kernel(dis,h)
280 | #    return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind]))
281 | 
282 | 
283 | #def joint_like(data,loc_int,phi,theta,h):
284 | #    slice_like = lambda x: weight_like(x,loc_int,phi,theta,h)
285 | #    #result = sum( data.apply(slice_like,axis=1) )
286 | #    result = sum(map(slice_like,data.values))
287 | #    result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
288 | #    return(result)
289 |   
290 | #init=[[1,1],[1]*num_location]
291 |     
292 | 
293 | # redefine weighted likelihood function for a single sample, this function is used to calculate the likelihood value of samples from testing set (cross validation).
294 | def weight_like_s(data_slice,phi,theta):
295 |     return(negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta))
296 | 
297 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals),
298 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info".
299 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. See following function "GWR_MCMC_multloc"
300 | if is_block:
301 |     def GWR_update(model_info):
302 |         phi_old = model_info[0]
303 |         theta_old = model_info[1]
304 |         loc_int = model_info[2]
305 |         loc_ind = model_info[7]
306 |         subdata = data.loc[(data['x']==loc_int[0]) & (data['y']==loc_int[1])]
307 |         joint_old = model_info[3]
308 |         accept_num = model_info[5]
309 |         if_cut = model_info[6]
310 |         phi_new = r_phi(phi_old)
311 |         theta_new_ax = r_theta(theta_old)
312 |         joint_new_phi = joint_like(data,loc_ind,phi_new,theta_new_ax,if_cut)
313 |         rate_phi = joint_new_phi + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new_ax) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new_ax,theta_old)
314 |         alfa_phi = min(1,np.exp(rate_phi))
315 |         runif = np.random.uniform(0,1,1)[0]
316 |         update_sample = [phi_new,theta_new_ax,joint_new_phi,(accept_num + 1)] if runif < alfa_phi else [phi_old,theta_old,joint_old,accept_num]        
317 |         phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3]
318 |         theta_new = r_theta(np.array([model_info[4]]))
319 |         joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
320 |         joint_old_theta = theta_like(subdata,loc_ind,phi_old,model_info[4])
321 |         rate_theta = joint_new_theta + d_theta(model_info[4],theta_new) - joint_old_theta - d_theta(theta_new,model_info[4])
322 |         alfa_theta = min(1,np.exp(rate_theta))
323 |         runif = np.random.uniform(0,1,1)[0]
324 |         sto_theta = theta_new[0] if runif < alfa_theta else model_info[4]
325 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0])
326 |         
327 |         
328 |     def GWR_update_new(model_info):
329 |         phi_old = model_info[0]
330 |         theta_old = model_info[1]
331 |         loc_int = model_info[2]
332 |         loc_ind = model_info[7]
333 |         subdata = data.loc[(data['x']==loc_int[0]) & (data['y']==loc_int[1])]
334 |         joint_old = model_info[3]
335 |         accept_num = model_info[5]
336 |         if_cut = model_info[6]
337 |         phi_new = r_phi_new(phi_old)
338 |         theta_new_ax = r_theta_new(theta_old)
339 |         joint_new_phi = joint_like(data,loc_ind,phi_new,theta_new_ax,if_cut)
340 |         rate_phi = joint_new_phi + d_phi_new(phi_old,phi_new) + d_theta_new(theta_old,theta_new_ax) - joint_old - d_phi_new(phi_new,phi_old) - d_theta_new(theta_new_ax,theta_old)
341 |         alfa_phi = min(1,np.exp(rate_phi))
342 |         runif = np.random.uniform(0,1,1)[0]
343 |         update_sample = [phi_new,theta_new_ax,joint_new_phi,(accept_num + 1)] if runif < alfa_phi else [phi_old,theta_old,joint_old,accept_num]        
344 |         phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3]
345 |         theta_new = r_theta_new(np.array([model_info[4]]))
346 |         joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
347 |         joint_old_theta = theta_like(subdata,loc_ind,phi_old,model_info[4])
348 |         rate_theta = joint_new_theta + d_theta_new(model_info[4],theta_new) - joint_old_theta - d_theta_new(theta_new,model_info[4])
349 |         alfa_theta = min(1,np.exp(rate_theta))
350 |         runif = np.random.uniform(0,1,1)[0]
351 |         sto_theta = theta_new[0] if runif < alfa_theta else model_info[4]
352 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0])
353 |         
354 |         
355 | else:
356 |     def GWR_update(model_info):
357 |         phi_old = model_info[0]
358 |         theta_old = model_info[1]
359 |         loc_int = model_info[2]
360 |         loc_ind = model_info[7]
361 |         joint_old = model_info[3]
362 |         accept_num = model_info[5]
363 |         if_cut = model_info[6]
364 |         phi_new = r_phi(phi_old)
365 |         theta_new = r_theta(theta_old)
366 |         joint_new = joint_like(data,loc_ind,phi_new,theta_new,if_cut)
367 |         rate = joint_new + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new,theta_old)
368 |         alfa = min(1,np.exp(rate))
369 |         runif = np.random.uniform(0,1,1)[0]
370 |         update_sample = [phi_new,theta_new,joint_new,(accept_num + 1)] if runif < alfa else [phi_old,theta_old,joint_old,accept_num]        
371 |         phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3]
372 |         sto_theta = theta_old[loc_ind]
373 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0])
374 |         
375 |     def GWR_update_new(model_info):
376 |         phi_old = model_info[0]
377 |         theta_old = model_info[1]
378 |         loc_int = model_info[2]
379 |         loc_ind = model_info[7]
380 |         joint_old = model_info[3]
381 |         accept_num = model_info[5]
382 |         if_cut = model_info[6]
383 |         phi_new = r_phi_new(phi_old)
384 |         theta_new = r_theta_new(theta_old)
385 |         joint_new = joint_like(data,loc_ind,phi_new,theta_new,if_cut)
386 |         rate = joint_new + d_phi_new(phi_old,phi_new) + d_theta_new(theta_old,theta_new) - joint_old - d_phi_new(phi_new,phi_old) - d_theta_new(theta_new,theta_old)
387 |         alfa = min(1,np.exp(rate))
388 |         runif = np.random.uniform(0,1,1)[0]     
389 |         update_sample = [phi_new,theta_new,joint_new,(accept_num + 1)] if runif < alfa else [phi_old,theta_old,joint_old,accept_num]        
390 |         phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3]
391 |         sto_theta = theta_old[loc_ind]
392 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0])
393 |     
394 |     
395 |    
396 | 
397 | #initial value "init" (list of initial phi, initial theta, coordinates of location of interest, initial joint density)    
398 | #init_phi = [2,1,1]  
399 | #init_param = pd.read_csv('init_param.csv',encoding='utf-8',header=0)
400 | #init = [[[init_param.iloc[x]['phi0'],init_param.iloc[x]['phi1'],init_param.iloc[x]['phi2']],[init_param.iloc[x]['theta']]*num_location,list(location[['x','y']].values[x]),joint_like(data,x,[init_param.iloc[x]['phi0'],init_param.iloc[x]['phi1'],init_param.iloc[x]['phi2']],[init_param.iloc[x]['theta']]*num_location,True),init_param.iloc[x]['theta'],0,True,x,0] for x in range(len(location[['x','y']].values))] 
401 | init_phi = [2.8421216,0.4794747,0.2232922]
402 | init_theta = 0.5
403 | init = [[init_phi,[init_theta]*num_location,list(location[['x','y']].values[x]),joint_like(data,x,init_phi,[init_theta]*num_location,True),init_theta,0,True,x,0] for x in range(len(location[['x','y']].values))]
404 | 
405 | 
406 | #MCMC updates of all location simultaneously (mignt be in parallel).
407 | def GWR_MCMC_multloc(init,num_iter,thin,burn_in):
408 |     sto_phi = np.zeros([(num_iter-burn_in)//thin,num_location,len(init[0][0])])     #store posterior samples of phi
409 |     sto_theta = np.zeros([(num_iter-burn_in)//thin,num_location,1])     #store posterior samples of theta
410 |     iter_param = init   
411 |     #loglik_sum = np.zeros(num_location)     #store the log-likelihood of testing set for all locations
412 |     #elpd = np.zeros(num_location)       #store the estimated elpd for all locations
413 |     for i in range(num_iter):
414 |         if( (i<=(burn_in-1)) | ((i+1) % thin !=0)):
415 |             if(is_para):
416 |                 iter_param = list(pool.map(GWR_update,iter_param))      #one step metropolis hasting update for all locations in parallel
417 |             else:
418 |                 iter_param = list(map(GWR_update,iter_param))       #one step metropolis hasting update for all locations
419 |             if((i+1)%100==0):
420 |                 accept_rate = np.mean(np.array([iter_param[s][5] for s in range(num_location)])/i)
421 |                 print('The average acceptance rate is: {rate}'.format(rate=accept_rate),flush=True)
422 |                 
423 |             if((i+1) % thin == 0):
424 |                 print('{0}% complete.'.format((i+1)*100/num_iter), flush=True)
425 |         else:
426 |             if(i%20==0):
427 |                 for s in range(num_location):
428 |                     iter_param[s][6] = False
429 |                 drop_like = np.array([iter_param[q][3] for q in range(num_location)])
430 | 
431 |             if(is_para):
432 |                 iter_param = list(pool.map(GWR_update_new,iter_param))      #one step metropolis hasting update for all locations in parallel
433 |             else:
434 |                 iter_param = list(map(GWR_update_new,iter_param))       #one step metropolis hasting update for all locations
435 | 
436 |             accept_rate = np.mean(np.array([iter_param[s][5] for s in range(num_location)])/i)     #calculate the mean acceptance rate
437 |             sto_phi[((i+1-burn_in)//thin) - 1] = np.array([iter_param[s][0] for s in range(num_location)])
438 |             sto_theta[((i+1-burn_in)//thin) - 1] = np.array([[iter_param[s][4]] for s in range(num_location)])
439 |             
440 |             if(i%20==0):
441 |                 #print(sto_phi[((i+1-burn_in)//thin) -1], flush=True)
442 |                 #print('Theta at 800 is: {theta}'.format(theta=sto_theta[((i+1-burn_in)//thin) - 1][800]),flush=True)
443 |                 for s in range(num_location):
444 |                     iter_param[s][6] = True
445 |                 full_like = np.array([iter_param[q][3] for q in range(num_location)])
446 |                 print('The approximation rate is: {rate}'.format(rate=np.mean(drop_like/full_like)),flush=True)
447 |             print('{0}% complete. The average acceptance rate is: {rate}'.format((i+1)*100/num_iter, rate=accept_rate), flush=True)
448 |     result = {'phi':sto_phi,'theta':sto_theta}
449 |     return(result)
450 |     
451 |     
452 | time_one = datetime.now()
453 | if __name__ == '__main__':
454 |     pool = Pool(processes=num_core)
455 |     re=GWR_MCMC_multloc(init,4000,1,1000)
456 | time_two = datetime.now()
457 | 
458 | print(time_two-time_one)        #time used for MCMC updates
459 | 
460 | est_phi=sum(re['phi'])/re['phi'].shape[0]       #posterior estimation of phi (posterior mean)
461 | est_theta=sum(re['theta'])/re['theta'].shape[0]     ##posterior estimation of theta (posterior mean)
462 | 
463 | trace = np.zeros(shape=[re['phi'].shape[0],re['phi'].shape[2]+re['theta'].shape[2]])        #trace record of posterior samples (central location only)
464 | for k in range(re['phi'].shape[0]):
465 |     trace[k][0:re['phi'].shape[2]] = re['phi'][k][re['phi'][0].shape[0]//2+19]
466 |     trace[k][re['phi'].shape[2]:] = re['theta'][k][re['theta'][0].shape[0]//2+19]
467 | np.savetxt('trace'+str(h)+'rep'+str(rep_id)+'.csv',trace,delimiter=',')
468 | 
469 | print(est_phi)
470 | np.savetxt("est_phi"+str(h)+'rep'+str(rep_id)+".csv", est_phi, delimiter=",")
471 | print(est_theta)
472 | np.savetxt("est_theta"+str(h)+'rep'+str(rep_id)+".csv", est_theta, delimiter=",")
473 | #print([h,re['ELPD']])
474 | 


--------------------------------------------------------------------------------
/Simulation/BandwidthSelection.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Feb 19 14:08:10 2022
  4 | 
  5 | @author: Yang Liu
  6 | """
  7 | 
  8 | 
  9 | import numpy as np
 10 | import os
 11 | import pandas as pd
 12 | from scipy.special import loggamma
 13 | import scipy.stats
 14 | from datetime import datetime
 15 | from multiprocessing import Pool
 16 | 
 17 | file_id = int(os.getenv('SLURM_ARRAY_TASK_ID'))
 18 | #print([task_id,type(task_id)],flush=True)
 19 | 
 20 | task_id = (file_id-1)//5
 21 | #task_id = (file_id-1)
 22 | 
 23 | rep_id = file_id%5
 24 | is_para = True    #Using parallel computing？
 25 | num_core = 30     #number of cores
 26 | fitting_ratio = 0.5    #the proportion of samples used for fitting at the location of interest (i.e., splitting samples at location of interest into fitting set and testing set).
 27 | is_eucliDis = True      #Using Euclidian distance?
 28 | is_block = False        #block sampling?
 29 | 
 30 | 
 31 | # set multiple cores
 32 | #pool = ThreadPool(4)
 33 | 
 34 | #geographical kernel bandwidth
 35 | h = [0.0001,2,4,6,8,10,20,40,80,1000][task_id]
 36 | 
 37 | print('h is {h}'.format(h=h), flush=True)
 38 | 
 39 | #Geographically weighted kernel (exponential kernel)
 40 | def G_kernel(d,h):
 41 |     return(np.exp(-d**2/h**2))
 42 |     
 43 | #euclidean distance
 44 | def eucliDis(A,B):
 45 |     A = np.array(A)
 46 |     B = np.array(B)
 47 |     return np.sqrt(sum((A-B)**2))
 48 | 
 49 | #spherical distance (measured in KM)
 50 | def Haversine(A,B):
 51 |     """
 52 |     This uses the ‘haversine’ formula to calculate the great-circle distance between two points – that is, 
 53 |     the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points 
 54 |     (ignoring any hills they fly over, of course!).
 55 |     Haversine
 56 |     formula:    a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
 57 |     c = 2 ⋅ atan2( √a, √(1−a) )
 58 |     d = R ⋅ c
 59 |     where   φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km);
 60 |     note that angles need to be in radians to pass to trig functions!
 61 |     """
 62 |     lat1,lon1,lat2,lon2 = A[0],A[1],B[0],B[1]
 63 |     
 64 |     R = 6378.0088
 65 |     lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2])
 66 | 
 67 |     dlat = lat2 - lat1
 68 |     dlon = lon2 - lon1
 69 |     a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2
 70 |     c = 2 * np.arctan2(a**0.5, (1-a)**0.5)
 71 |     d = R * c
 72 |     return round(d,4)
 73 | 
 74 | #log-likelihood of negative binomial distribution
 75 | def negBion(outcome,offset,covariate,phi,theta):
 76 |     mean = np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi)))
 77 |     result = loggamma(outcome+1/theta)-loggamma(1/theta)-loggamma(outcome+1)-(1/theta)*np.log(1+theta*mean)+outcome*np.log(theta*mean/(1+theta*mean))
 78 |     return result
 79 | 
 80 | #log-prior for phi (uniform) 
 81 | def prior_phi(phi):
 82 |     if max(abs(np.array(phi)))>1000:
 83 |         return np.log(0)
 84 |     else:
 85 |         return np.log(1/2000)*len(phi)
 86 |     
 87 | #log-prior for theta (uniform 0,1000)
 88 | def prior_theta(theta):
 89 |     if theta>1000 or theta<=0:
 90 |         return np.log(0)
 91 |     else:
 92 |         return np.log(1/1000)
 93 |     
 94 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi)
 95 | #pro_st = np.array([[ 0.19800107, -0.03503183, -0.0323151],
 96 | #       [-0.03503183,  0.03827607, -0.00078348],
 97 | #       [-0.0323151 , -0.00078348,  0.00733836]])
 98 | 
 99 | pro_st = np.array([[ 0.199212513, -0.00319569314, -0.037421612],
100 |        [-0.00319569314, 0.0079491569, 0.000190719635],
101 |        [-0.037421612, 0.000190719635, 0.00757105334]])
102 |     
103 | #two step adatpive proposal sd:
104 | #aggressive proposal sd for phi to approximate true value before burn_in
105 | pro_early = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)]
106 | #mild proposal sd for phi to achieve a good mixture after burn_in
107 | pro_later = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10)]
108 |  
109 | #aggressive proposal sd for theta to approximate true value before burn_in 
110 | pro_theta_early = [0.05,0.03,0.03,0.03,0.015,0.015,0.005,0.004,0.004,0.004]
111 | #mild proposal sd for theta to achieve a good mixture after burn_in
112 | pro_theta_later = [0.05,0.015,0.015,0.015,0.005,0.005,0.005,0.004,0.004,0.004]
113 | 
114 | #proposal sampling function for phi (multivariate normal)
115 | def r_phi(phi):
116 |     phi = np.array(phi)
117 |     phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id]).rvs(1)
118 |     return phi_n
119 | #proposal density function for phi
120 | def d_phi(phi_n,phi):
121 |     return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id]).pdf(phi_n))
122 | 
123 | #proposal sampling function for phi (multivariate normal)
124 | def r_phi_new(phi):
125 |     phi = np.array(phi)
126 |     phi_n = scipy.stats.multivariate_normal(phi,pro_later[task_id]).rvs(1)
127 |     return phi_n
128 | #proposal density function for phi
129 | def d_phi_new(phi_n,phi):
130 |     return np.log(scipy.stats.multivariate_normal(phi,pro_later[task_id]).pdf(phi_n))
131 | 
132 | #proposal sampling function for theta (truncated normal)
133 | def r_theta(theta):
134 |     sd = pro_theta_early[task_id]
135 |     out = np.random.normal(theta, sd, len(theta))
136 |     out[out<=0] = 0.001
137 |     return out
138 | 
139 | #proposal density function for theta
140 | def d_theta(theta_n,theta):
141 |     sd =  pro_theta_early[task_id]
142 |     return sum(scipy.stats.norm.pdf(theta_n, theta, sd))
143 | 
144 | #proposal sampling function for theta (truncated normal)
145 | def r_theta_new(theta):
146 |     sd = pro_theta_later[task_id]
147 |     out = np.random.normal(theta, sd, len(theta))
148 |     out[out<=0] = 0.001
149 |     return out
150 | 
151 | #proposal density function for theta
152 | def d_theta_new(theta_n,theta):
153 |     sd =  pro_theta_later[task_id]
154 |     return sum(scipy.stats.norm.pdf(theta_n, theta, sd))
155 | 
156 | 
157 | 
158 | #import data from file
159 | data = pd.read_csv('simulateDate.csv',encoding='utf-8',header=0)
160 | #extract coordinates of locations
161 | location = data[['x','y']].drop_duplicates(subset=['x','y'])
162 | #number of locations
163 | num_location = location.shape[0]
164 | #add main key to location table
165 | location['index']=range(num_location)
166 | 
167 | #randomly select fitting subdata for each location of interest (cross validation)
168 | index_sel = []
169 | 
170 | for k in range(num_location):
171 |     loc_foc = location.values[k][0:2]
172 |     index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index
173 |     num_sel = int(len(index_loc)*fitting_ratio)
174 |     index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False)))
175 |     
176 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight
177 | def kernel_weight(data_slice,loc_int,h):
178 |     loc1=data_slice[0:2]
179 |     if(is_eucliDis == True):
180 |         dis = eucliDis(loc1,loc_int)
181 |     else:
182 |         dis = Haversine(loc1,loc_int)
183 |     kern = G_kernel(dis,h)
184 |     return(kern)
185 | 
186 | #geographical weighting kernel matrix
187 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample.
188 | weight = []
189 | theta_rep_num = np.zeros(shape=[num_location,num_location])
190 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)])
191 | 
192 | theta_slice_ind_full = [np.equal(data.iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum() for i in range(num_location)]
193 | theta_slice_ind_drop = []
194 | for i in range(num_location):
195 |     theta_slice_ind_drop.append(np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum())
196 | for i in range(num_location):
197 |     theta_rep_num[i] = theta_slice_ind_full
198 |     theta_rep_num[i][i] = theta_slice_ind_drop[i]
199 | 
200 | for i in range(num_location):
201 |     loc_int_inner = location.values[i][0:2]
202 |     slice_weight = lambda x: kernel_weight(x,loc_int_inner,h)
203 |     weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values))))
204 |     theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))])
205 | 
206 | 
207 | theta_slice_ind = theta_slice_ind.astype(int)        
208 | weight = np.array(weight)        
209 | weight = weight + weight.T - np.eye(num_location)
210 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)]
211 | 
212 | 
213 | #data index that is omitted due to very low weight:
214 | minimum_threshold = 10**(-1)
215 | drop_set = []
216 | theta_num = np.zeros(num_location)
217 | for j in range(num_location):
218 |     theta_num[j] = np.equal(data.iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum()
219 | for i in range(num_location):
220 |     set_one = index_sel[i]
221 |     full_geo_weight = np.repeat(weight[i],theta_num.astype(int)) 
222 |     set_two = np.where(full_geo_weight<minimum_threshold)[0]
223 |     drop_set.append(np.array(list(set(set_one).union(set(set_two)))))
224 | 
225 | geo_weight_cut = [np.delete(np.repeat(weight[i],theta_num.astype(int)),drop_set[i]) for i in range(num_location)]
226 | 
227 | print("geo_weight has been calculated")
228 | 
229 | # joint log-density of negative binomial likelihood and prior given a certain location of interest.   Vectorization for fast calculation
230 | def joint_like(data,loc_ind,phi,theta,if_cut):
231 |     if(if_cut == True):
232 |         theta_expand = np.repeat(np.array(theta),theta_num[loc_ind].astype(int)) 
233 |         theta_expand = np.delete(theta_expand,drop_set[loc_ind])
234 |         outcome = np.array(data.drop(drop_set[loc_ind])['outcome'])
235 |         offset = np.array(data.drop(drop_set[loc_ind])['offset'])
236 |         covariate = np.array(data.drop(drop_set[loc_ind]).iloc[:, 4:])
237 |         mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
238 |         result = sum(geo_weight_cut[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))))
239 |         result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
240 |         return(result)
241 |     else:
242 |         theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
243 |         outcome = np.array(data.drop(index_sel[loc_ind])['outcome'])
244 |         offset = np.array(data.drop(index_sel[loc_ind])['offset'])
245 |         covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:])
246 |         mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
247 |         result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))))
248 |         result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
249 |         return(result)
250 | 
251 | 
252 | 
253 | def theta_like(subdata,loc_ind,phi,theta):
254 |     theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int))   
255 |     outcome = np.array(subdata['outcome'])
256 |     offset = np.array(subdata['offset'])
257 |     covariate = np.array(subdata.iloc[:, 4:])
258 |     mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi)))))
259 |     theta_like_value = geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))
260 |     result = np.array([sum(theta_like_value[theta_slice_ind[loc_ind][k]:theta_slice_ind[loc_ind][k+1]]) for k in range(num_location)])
261 |     result += np.array(list(map(prior_theta, theta)))
262 |     return(result)
263 |     
264 |     
265 | 
266 | #old code (discarded)
267 | #def weight_like(data_slice,loc_int,phi,theta,h):
268 | #    loc1=data_slice[0:2]
269 | #    theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index'])
270 | #    dis = eucliDis(loc1,loc_int)
271 | #    kern = G_kernel(dis,h)
272 | #    return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind]))
273 | 
274 | 
275 | #def joint_like(data,loc_int,phi,theta,h):
276 | #    slice_like = lambda x: weight_like(x,loc_int,phi,theta,h)
277 | #    #result = sum( data.apply(slice_like,axis=1) )
278 | #    result = sum(map(slice_like,data.values))
279 | #    result += (prior_phi(phi) + sum(list(map(prior_theta, theta))))
280 | #    return(result)
281 |   
282 | #init=[[1,1],[1]*num_location]
283 |     
284 | 
285 | # redefine weighted likelihood function for a single sample, this function is used to calculate the likelihood value of samples from testing set (cross validation).
286 | def weight_like_s(data_slice,phi,theta):
287 |     return(negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta))
288 | 
289 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals),
290 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info".
291 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. See following function "GWR_MCMC_multloc"
292 | if is_block:
293 |     def GWR_update(model_info):
294 |         phi_old = model_info[0]
295 |         theta_old = model_info[1]
296 |         loc_int = model_info[2]
297 |         loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
298 |         subdata = data.drop(index_sel[loc_ind])
299 |         joint_old = model_info[3]
300 |         accept_num = model_info[5]
301 |         if_cut = model_info[6]
302 |         theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index'])
303 |         phi_new = r_phi(phi_old)
304 |         theta_new = r_theta(theta_old)
305 |         joint_new_phi = joint_like(data,loc_ind,phi_new,theta_old,if_cut)
306 |         rate_phi = joint_new_phi + d_phi(phi_old,phi_new) - joint_old - d_phi(phi_new,phi_old) 
307 |         alfa_phi = min(1,np.exp(rate_phi))
308 |         runif = np.random.uniform(0,1,1)[0]
309 |         phi_old = phi_new if runif < alfa_phi else phi_old
310 |         accept_num = (accept_num + 1) if runif <alfa_phi else accept_num
311 |         joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new)
312 |         joint_old_theta = theta_like(subdata,loc_ind,phi_old,theta_old)
313 |         rate_theta = joint_new_theta + d_theta(theta_old,theta_new) - joint_old_theta - d_theta(theta_new,theta_old)
314 |         alfa_theta = np.minimum(np.ones_like(rate_theta),np.exp(rate_theta))
315 |         runif = np.random.uniform(0,1,len(alfa_theta))
316 |         theta_pro = [theta_new[q] if runif[q] < alfa_theta[q] else theta_old[q] for q in range(num_location)]
317 |         theta_old = theta_pro
318 |         sto_theta = theta_old[theta_focus]
319 |         joint_old = joint_like(data,loc_ind,phi_old,theta_old,True)
320 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,0])
321 | else:
322 |     def GWR_update(model_info):
323 |         phi_old = model_info[0]
324 |         theta_old = model_info[1]
325 |         loc_int = model_info[2]
326 |         loc_ind = model_info[7]
327 |         joint_old = model_info[3]
328 |         accept_num = model_info[5]
329 |         if_cut = model_info[6]
330 |         phi_new = r_phi(phi_old)
331 |         theta_new = r_theta(theta_old)
332 |         joint_new = joint_like(data,loc_ind,phi_new,theta_new,if_cut)
333 |         rate = joint_new + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new,theta_old)
334 |         alfa = min(1,np.exp(rate))
335 |         runif = np.random.uniform(0,1,1)[0]
336 |         update_sample = [phi_new,theta_new,joint_new,(accept_num + 1)] if runif < alfa else [phi_old,theta_old,joint_old,accept_num]        
337 |         phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3]
338 |         sto_theta = theta_old[loc_ind]
339 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0])
340 |         
341 |     def GWR_update_new(model_info):
342 |         phi_old = model_info[0]
343 |         theta_old = model_info[1]
344 |         loc_int = model_info[2]
345 |         loc_ind = model_info[7]
346 |         joint_old = model_info[3]
347 |         accept_num = model_info[5]
348 |         if_cut = model_info[6]
349 |         phi_new = r_phi_new(phi_old)
350 |         theta_new = r_theta_new(theta_old)
351 |         joint_new = joint_like(data,loc_ind,phi_new,theta_new,if_cut)
352 |         rate = joint_new + d_phi_new(phi_old,phi_new) + d_theta_new(theta_old,theta_new) - joint_old - d_phi_new(phi_new,phi_old) - d_theta_new(theta_new,theta_old)
353 |         alfa = min(1,np.exp(rate))
354 |         runif = np.random.uniform(0,1,1)[0]     
355 |         update_sample = [phi_new,theta_new,joint_new,(accept_num + 1)] if runif < alfa else [phi_old,theta_old,joint_old,accept_num]        
356 |         phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3]
357 |         sto_theta = theta_old[loc_ind]
358 |         #testing subdata
359 |         subdata=data.iloc[index_sel[loc_ind]]     #testing set at location j
360 |         slice_like = lambda x: weight_like_s(x,phi_old,sto_theta)     #define the likelihood function by using posterior phi and theta at location j of iteration i
361 |         sub_log_lik = sum( subdata.apply(slice_like,axis=1) )       #calculate the likelihood of testing set at location j by using posterior phi and theta at iteration i
362 |         return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],sub_log_lik])
363 |     
364 |     
365 | 
366 | #initial value "init" (list of initial phi, initial theta, coordinates of location of interest, initial joint density)    
367 | #init_phi = [2,1,1]  
368 | #init_param = pd.read_csv('init_param.csv',encoding='utf-8',header=0)
369 | #init = [[[init_param.iloc[x]['phi0'],init_param.iloc[x]['phi1'],init_param.iloc[x]['phi2']],[init_param.iloc[x]['theta']]*num_location,list(location[['x','y']].values[x]),joint_like(data,x,[init_param.iloc[x]['phi0'],init_param.iloc[x]['phi1'],init_param.iloc[x]['phi2']],[init_param.iloc[x]['theta']]*num_location,True),init_param.iloc[x]['theta'],0,True,x,0] for x in range(len(location[['x','y']].values))] 
370 | init_phi = [2.8421216,0.4794747,0.2232922]
371 | init_theta = 0.5
372 | init = [[init_phi,[init_theta]*num_location,list(location[['x','y']].values[x]),joint_like(data,x,init_phi,[init_theta]*num_location,True),init_theta,0,True,x,0] for x in range(len(location[['x','y']].values))]
373 | 
374 | 
375 | #MCMC updates of all location simultaneously (mignt be in parallel).
376 | def GWR_MCMC_multloc(init,num_iter,thin,burn_in):
377 |     sto_phi = np.zeros([(num_iter-burn_in)//thin,num_location,len(init[0][0])])     #store posterior samples of phi
378 |     sto_theta = np.zeros([(num_iter-burn_in)//thin,num_location,1])     #store posterior samples of theta
379 |     iter_param = init   
380 |     loglik_sum = np.zeros(num_location)     #store the log-likelihood of testing set for all locations
381 |     elpd = np.zeros(num_location)       #store the estimated elpd for all locations
382 |     for i in range(num_iter):
383 |         if( (i<=(burn_in-1)) | ((i+1) % thin !=0)):
384 |             if(is_para):
385 |                 iter_param = list(pool.map(GWR_update,iter_param))      #one step metropolis hasting update for all locations in parallel
386 |             else:
387 |                 iter_param = list(map(GWR_update,iter_param))       #one step metropolis hasting update for all locations
388 |             if((i+1)%100==0):
389 |                 accept_rate = np.mean(np.array([iter_param[s][5] for s in range(num_location)])/i)
390 |                 print('The average acceptance rate is: {rate}'.format(rate=accept_rate),flush=True)
391 |                 
392 |             if((i+1) % thin == 0):
393 |                 print('{0}% complete.'.format((i+1)*100/num_iter), flush=True)
394 |         else:
395 |             if(i%20==0):
396 |                 for s in range(num_location):
397 |                     iter_param[s][6] = False
398 |                 drop_like = np.array([iter_param[q][3] for q in range(num_location)])
399 | 
400 |             if(is_para):
401 |                 iter_param = list(pool.map(GWR_update_new,iter_param))      #one step metropolis hasting update for all locations in parallel
402 |             else:
403 |                 iter_param = list(map(GWR_update_new,iter_param))       #one step metropolis hasting update for all locations
404 | 
405 |             accept_rate = np.mean(np.array([iter_param[s][5] for s in range(num_location)])/i)     #calculate the mean acceptance rate
406 |             sto_phi[((i+1-burn_in)//thin) - 1] = np.array([iter_param[s][0] for s in range(num_location)])
407 |             sto_theta[((i+1-burn_in)//thin) - 1] = np.array([[iter_param[s][4]] for s in range(num_location)])
408 |             aa = np.array([loglik_sum[s] + iter_param[s][8] for s in range(num_location)])
409 |             loglik_sum = aa
410 |             elpd = np.array([loglik_sum[s]/((i+1-burn_in)//thin) for s in range(num_location)])
411 |             elpd_mean = np.mean(elpd)       #mean of elpd across all locations
412 |             
413 |             if(i%20==0):
414 |                 print(sto_phi[((i+1-burn_in)//thin) -1], flush=True)
415 |                 print('Theta at 800 is: {theta}'.format(theta=sto_theta[((i+1-burn_in)//thin) - 1][800]),flush=True)
416 |                 for s in range(num_location):
417 |                     iter_param[s][6] = True
418 |                 full_like = np.array([iter_param[q][3] for q in range(num_location)])
419 |                 print('The approximation rate is: {rate}'.format(rate=np.mean(drop_like/full_like)),flush=True)
420 |             print('{0}% complete. The ELPD is: {site}. The average acceptance rate is: {rate}'.format((i+1)*100/num_iter, site=elpd_mean, rate=accept_rate), flush=True)
421 |     result = {'phi':sto_phi,'theta':sto_theta,'ELPD':elpd_mean}
422 |     return(result)
423 |     
424 |     
425 | time_one = datetime.now()
426 | if __name__ == '__main__':
427 |     pool = Pool(processes=num_core)
428 |     re=GWR_MCMC_multloc(init,4000,1,1000)
429 | time_two = datetime.now()
430 | 
431 | print(time_two-time_one)        #time used for MCMC updates
432 | 
433 | est_phi=sum(re['phi'])/re['phi'].shape[0]       #posterior estimation of phi (posterior mean)
434 | est_theta=sum(re['theta'])/re['theta'].shape[0]     ##posterior estimation of theta (posterior mean)
435 | 
436 | trace = np.zeros(shape=[re['phi'].shape[0],re['phi'].shape[2]+re['theta'].shape[2]])        #trace record of posterior samples (central location only)
437 | for k in range(re['phi'].shape[0]):
438 |     trace[k][0:re['phi'].shape[2]] = re['phi'][k][re['phi'][0].shape[0]//2]
439 |     trace[k][re['phi'].shape[2]:] = re['theta'][k][re['theta'][0].shape[0]//2]
440 | #np.savetxt('trace'+str(h)+'.csv',trace,delimiter=',')
441 | 
442 | print(est_phi)
443 | #np.savetxt("est_phi"+str(h)+".csv", est_phi, delimiter=",")
444 | #print(est_theta)
445 | #np.savetxt("est_theta"+str(h)+".csv", est_theta, delimiter=",")
446 | print([h,re['ELPD']])
447 | 


--------------------------------------------------------------------------------
/Simulation/simuData.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Thu Jul  2 11:28:52 2020
 4 | 
 5 | @author: Yang Liu
 6 | """
 7 | 
 8 | import numpy as np
 9 | import os
10 | import pandas as pd
11 | from scipy.special import gamma
12 | import scipy.stats
13 | 
14 | 
15 | def rv_binom(offset,covariate,phi,theta):
16 |     r=1/theta
17 |     p= r/(np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi)))+r)
18 |     return(scipy.stats.nbinom(r,p).rvs(1))
19 | 
20 | 
21 | re = pd.DataFrame({'x':[],'y':[],'outcome':[],'offset':[],'x1':[],'x2':[],'x3':[]})
22 | true_param = pd.DataFrame({'x':[],'y':[],'phi1':[],'phi2':[],'theta':[]})
23 | offset=1
24 | 
25 | for i in range(40):
26 |     for j in range(40):
27 |         x=i+1
28 |         y=j+1
29 |         phi=[3,0.1+0.01*np.sqrt((x)**2+(y)**2),0.05*(np.sin(np.pi/2 + np.pi*y/20)+np.cos(np.pi/2 + np.pi*y/20)+4)]
30 |         #phi=[3,0.15,0.22]
31 |         #theta = float(np.random.uniform(0.5,0.6,1))
32 |         theta = float(np.random.normal(0.5,0.01,1))
33 |         for n in range(100):
34 |             covariate=[1,float(np.random.uniform(0,10,1)),float(np.random.uniform(2,7,1))]
35 |             outcome = int(rv_binom(offset,covariate,phi,theta))
36 |             re = re.append({'x':x,'y':y,'outcome':outcome,'offset':offset,'x1':covariate[0],'x2':covariate[1],'x3':covariate[2]},ignore_index=True)
37 |             true_param = true_param.append({'x':x,'y':y,'phi1':phi[1],'phi2':phi[2],'theta':theta},ignore_index=True)
38 |         print([i,j])
39 | 
40 | re.to_csv('simulateDate.csv')
41 | true_param.to_csv('true_parameter.csv')
42 | 


--------------------------------------------------------------------------------