├── Application-to-real-dataset
│   ├── BGWR.py
│   ├── BandwidthSelection.py
│   └── EpiData.csv
├── README.md
└── Simulation
    ├── BGWR.py
    ├── BandwidthSelection.py
    ├── simuData.py
    └── simulateDate.csv

/Application-to-real-dataset/BGWR.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Dec 19 14:08:10 2020
4 | 
5 | @author: Yang Liu
6 | """
7 | 
8 | 
9 | import numpy as np
10 | import os
11 | import pandas as pd
12 | from scipy.special import loggamma
13 | import scipy.stats
14 | from datetime import datetime
15 | from multiprocessing import Pool
16 | 
17 | file_id = int(os.getenv('SLURM_ARRAY_TASK_ID'))
18 | #print([task_id,type(task_id)],flush=True)
19 | 
20 | task_id = 4
21 | rep_id = file_id
22 | is_para = True #Using parallel computing?
23 | num_core = 10 #number of cores
24 | fitting_ratio = 0 #the proportion of samples used for testing at the location of interest (i.e., splitting samples at location of interest into fitting set and testing set).
25 | is_eucliDis = False #Using Euclidean distance?
26 | is_block = True #block sampling?
27 | 
28 | 
29 | # set multiple cores
30 | #pool = ThreadPool(4)
31 | 
32 | #geographical kernel bandwidth
33 | h = [100,500,1000,2000,3000,5000,7000,10000,20000][task_id]
34 | 
35 | 
36 | #Geographically weighted kernel (exponential kernel)
37 | def G_kernel(d,h):
38 |     return(np.exp(-d**2/h**2))
39 | 
40 | #euclidean distance
41 | def eucliDis(A,B):
42 |     A = np.array(A)
43 |     B = np.array(B)
44 |     return np.sqrt(sum((A-B)**2))
45 | 
46 | #spherical distance (measured in KM)
47 | def Haversine(A,B):
48 |     """
49 |     This uses the ‘haversine’ formula to calculate the great-circle distance between two points – that is,
50 |     the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points
51 |     (ignoring any hills they fly over, of course!).
52 |     Haversine
53 |     formula: a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
54 |     c = 2 ⋅ atan2( √a, √(1−a) )
55 |     d = R ⋅ c
56 |     where φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km);
57 |     note that angles need to be in radians to pass to trig functions!
58 | """ 59 | lat1,lon1,lat2,lon2 = A[0],A[1],B[0],B[1] 60 | 61 | R = 6378.0088 62 | lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2]) 63 | 64 | dlat = lat2 - lat1 65 | dlon = lon2 - lon1 66 | a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2 67 | c = 2 * np.arctan2(a**0.5, (1-a)**0.5) 68 | d = R * c 69 | return round(d,4) 70 | 71 | #log-likelihood of negative binomial distribution 72 | def negBion(outcome,offset,covariate,phi,theta): 73 | mean = np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi))) 74 | result = loggamma(outcome+1/theta)-loggamma(1/theta)-loggamma(outcome+1)-(1/theta)*np.log(1+theta*mean)+outcome*np.log(theta*mean/(1+theta*mean)) 75 | return result 76 | 77 | #log-prior for phi (uniform) 78 | def prior_phi(phi): 79 | if max(abs(np.array(phi)))>1000: 80 | return np.log(0) 81 | else: 82 | return np.log(1/2000)*len(phi) 83 | 84 | #log-prior for theta (uniform 0,1000) 85 | def prior_theta(theta): 86 | if theta>1000 or theta<=0: 87 | return np.log(0) 88 | else: 89 | return np.log(1/1000) 90 | 91 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi) 92 | #EURO 93 | pro_st = np.array([[ 1,0,0], 94 | [0, 0.1, 0], 95 | [0 , 0, 0.01]]) 96 | 97 | #SouthEast 98 | pro_st2 = np.array([[ 9.93206712e-01, -3.78288045e-02, -1.04242828e-05], 99 | [-3.78288045e-02, 6.41566397e-03, -3.39223226e-04], 100 | [-1.04242828e-05, -3.39223226e-04, 3.57276820e-04]]) 101 | 102 | 103 | #two step adatpive proposal sd: 104 | #aggressive proposal sd for phi to approximate true value before burn_in 105 | pro_early = [np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)] 106 | #mild proposal sd for phi to achieve a good mixture after burn_in 107 | pro_later1 = [np.dot(pro_st,pro_st),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)] 108 | pro_later2 = [np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)] 109 | 110 | 111 | #aggressive proposal sd for theta to approximate true value before burn_in 112 | pro_theta_early = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25] 113 | #mild proposal sd for theta to achieve a good mixture after burn_in 114 | pro_theta_later = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25] 115 | 116 | #speccialized with latitude 117 | #proposal sampling function for phi (multivariate normal) 118 | def r_phi(phi): 119 | phi = np.array(phi) 120 | phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id]).rvs(1) 121 | return phi_n 122 | #proposal density function for phi 123 | def d_phi(phi_n,phi): 124 | return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id]).pdf(phi_n)) 125 | 126 | def r_phi_new(phi,lat): 127 | if(lat>20): 128 | phi = np.array(phi) 129 | phi_n = scipy.stats.multivariate_normal(phi,pro_later1[task_id]).rvs(1) 130 | return phi_n 131 | else: 132 | phi = np.array(phi) 133 | phi_n = scipy.stats.multivariate_normal(phi,pro_later2[task_id]).rvs(1) 134 | return phi_n 135 | #proposal density function for phi 136 | def d_phi_new(phi_n,phi,lat): 137 | if(lat>20): 138 | return 
np.log(scipy.stats.multivariate_normal(phi,pro_later1[task_id]).pdf(phi_n)) 139 | else: 140 | return np.log(scipy.stats.multivariate_normal(phi,pro_later2[task_id]).pdf(phi_n)) 141 | 142 | #proposal sampling function for theta (truncated normal) 143 | def r_theta(theta): 144 | lower, upper, sd = 0, 1000, pro_theta_early[task_id-1] 145 | X = scipy.stats.truncnorm( 146 | (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd) 147 | return float(X.rvs(size=1)) 148 | #proposal density function for theta 149 | def d_theta(theta_n,theta): 150 | theta_n = np.array(theta_n) 151 | theta = np.array(theta) 152 | lower, upper, sd = 0, 1000, pro_theta_early[task_id-1] 153 | X = scipy.stats.truncnorm( 154 | (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd) 155 | return sum(np.log(X.pdf(theta_n))) 156 | 157 | 158 | #import data from file 159 | data = pd.read_csv('EpiData.csv',encoding='utf-8',header=0) 160 | #extract coordinates of locations 161 | location = data[['x','y']].drop_duplicates(subset=['x','y']) 162 | #number of locations 163 | num_location = location.shape[0] 164 | #add main key to location table 165 | location['index']=range(num_location) 166 | 167 | #randomly select fitting subdata for each location of interest (cross validation) 168 | index_sel = [] 169 | 170 | for k in range(num_location): 171 | loc_foc = location.values[k][0:2] 172 | index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index 173 | num_sel = int(len(index_loc)*fitting_ratio) 174 | index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False))) 175 | 176 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight 177 | def kernel_weight(data_slice,loc_int,h): 178 | loc1=data_slice[0:2] 179 | if(is_eucliDis == True): 180 | dis = eucliDis(loc1,loc_int) 181 | else: 182 | dis = Haversine(loc1,loc_int) 183 | kern = G_kernel(dis,h) 184 | return(kern) 185 | 186 | #geographical weighting kernel matrix 187 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample. 188 | weight = [] 189 | theta_rep_num = np.zeros(shape=[num_location,num_location]) 190 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)]) 191 | for i in range(num_location): 192 | loc_int_inner = location.values[i][0:2] 193 | slice_weight = lambda x: kernel_weight(x,loc_int_inner,h) 194 | weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values)))) 195 | for j in range(num_location): 196 | theta_rep_num[i][j] = np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum() 197 | theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))]) 198 | 199 | theta_slice_ind = theta_slice_ind.astype(int) 200 | weight = np.array(weight) 201 | weight = weight + weight.T - np.eye(num_location) 202 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)] 203 | 204 | # joint log-density of negative binomial likelihood and prior given a certain location of interest. 
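In symbols, with w_i the geographical weight of data row i, mu_i = offset_i * exp(x_i'phi) its negative-binomial mean, and theta_l(i) the dispersion parameter of the location that row i belongs to, the function below returns sum_i w_i * [ logGamma(y_i + 1/theta_l(i)) - logGamma(1/theta_l(i)) - logGamma(y_i + 1) - (1/theta_l(i)) * log(1 + theta_l(i) * mu_i) + y_i * log(theta_l(i) * mu_i / (1 + theta_l(i) * mu_i)) ] plus the log-priors of phi and theta.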
Vectorization for fast calculation 205 | def joint_like(data,loc_int,phi,theta): 206 | loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index']) 207 | theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int)) 208 | outcome = np.array(data.drop(index_sel[loc_ind])['outcome']) 209 | offset = np.array(data.drop(index_sel[loc_ind])['offset']) 210 | covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:]) 211 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 212 | result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))) 213 | result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 214 | return(result) 215 | 216 | def theta_like(subdata,loc_ind,phi,theta): 217 | theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int)) 218 | outcome = np.array(subdata['outcome']) 219 | offset = np.array(subdata['offset']) 220 | covariate = np.array(subdata.iloc[:, 4:]) 221 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 222 | theta_like_value = geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))) 223 | result = np.array([sum(theta_like_value[theta_slice_ind[loc_ind][k]:theta_slice_ind[loc_ind][k+1]]) for k in range(num_location)]) 224 | result += np.array(list(map(prior_theta, theta))) 225 | return(result) 226 | 227 | 228 | 229 | #old code (discarded) 230 | #def weight_like(data_slice,loc_int,phi,theta,h): 231 | # loc1=data_slice[0:2] 232 | # theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index']) 233 | # dis = eucliDis(loc1,loc_int) 234 | # kern = G_kernel(dis,h) 235 | # return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind])) 236 | 237 | 238 | #def joint_like(data,loc_int,phi,theta,h): 239 | # slice_like = lambda x: weight_like(x,loc_int,phi,theta,h) 240 | # #result = sum( data.apply(slice_like,axis=1) ) 241 | # result = sum(map(slice_like,data.values)) 242 | # result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 243 | # return(result) 244 | 245 | #init=[[1,1],[1]*num_location] 246 | 247 | 248 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals), 249 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info". 250 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. 
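Each update draws a candidate from the proposal functions (r_phi, r_theta) and accepts it with the Metropolis-Hastings probability min{1, exp[log-posterior(candidate) + log-proposal(current | candidate) - log-posterior(current) - log-proposal(candidate | current)]}; the quantities "rate_phi" and "alfa_phi" computed below are exactly this log-ratio and acceptance probability.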
See following function "GWR_MCMC_multloc" 251 | if is_block: 252 | def GWR_update(model_info): 253 | phi_old = model_info[0] 254 | theta_old = model_info[1] 255 | loc_int = model_info[2] 256 | loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index']) 257 | subdata = data.drop(index_sel[loc_ind]) 258 | joint_old = model_info[3] 259 | accept_num = model_info[5] 260 | theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index']) 261 | phi_new = r_phi(phi_old) 262 | theta_new = list(map(r_theta,theta_old)) 263 | joint_new_phi = joint_like(data,loc_int,phi_new,theta_old) 264 | rate_phi = joint_new_phi + d_phi(phi_old,phi_new) - joint_old - d_phi(phi_new,phi_old) 265 | alfa_phi = min(1,np.exp(rate_phi)) 266 | runif = np.random.uniform(0,1,1)[0] 267 | phi_old = phi_new if runif < alfa_phi else phi_old 268 | accept_num = (accept_num + 1) if runif 1000: 80 | return np.log(0) 81 | else: 82 | return np.log(1/2000)*len(phi) 83 | 84 | #log-prior for theta (uniform 0,1000) 85 | def prior_theta(theta): 86 | if theta>1000 or theta<=0: 87 | return np.log(0) 88 | else: 89 | return np.log(1/1000) 90 | 91 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi) 92 | #EURO 93 | pro_st = np.array([[ 1,0,0], 94 | [0, 0.1, 0], 95 | [0 , 0, 0.01]]) 96 | 97 | #SouthEast 98 | pro_st2 = np.array([[ 9.93206712e-01, -3.78288045e-02, -1.04242828e-05], 99 | [-3.78288045e-02, 6.41566397e-03, -3.39223226e-04], 100 | [-1.04242828e-05, -3.39223226e-04, 3.57276820e-04]]) 101 | 102 | 103 | #two step adatpive proposal sd: 104 | #aggressive proposal sd for phi to approximate true value before burn_in 105 | pro_early = [np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)] 106 | #mild proposal sd for phi to achieve a good mixture after burn_in 107 | pro_later1 = [np.dot(pro_st,pro_st),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st/4,pro_st/4),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)] 108 | pro_later2 = [np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2),np.dot(pro_st2,pro_st2)] 109 | 110 | 111 | #aggressive proposal sd for theta to approximate true value before burn_in 112 | pro_theta_early = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25] 113 | #mild proposal sd for theta to achieve a good mixture after burn_in 114 | pro_theta_later = [0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25] 115 | 116 | #speccialized with latitude 117 | #proposal sampling function for phi (multivariate normal) 118 | def r_phi(phi): 119 | phi = np.array(phi) 120 | phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id-1]).rvs(1) 121 | return phi_n 122 | #proposal density function for phi 123 | def d_phi(phi_n,phi): 124 | return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id-1]).pdf(phi_n)) 125 | 126 | def r_phi_new(phi,lat): 127 | if(lat>20): 128 | phi = np.array(phi) 129 | phi_n = scipy.stats.multivariate_normal(phi,pro_later1[task_id-1]).rvs(1) 130 | return phi_n 131 | else: 132 | phi = np.array(phi) 133 | phi_n = scipy.stats.multivariate_normal(phi,pro_later2[task_id-1]).rvs(1) 134 | return phi_n 135 | #proposal density function for phi 136 | def 
d_phi_new(phi_n,phi,lat): 137 | if(lat>20): 138 | return np.log(scipy.stats.multivariate_normal(phi,pro_later1[task_id-1]).pdf(phi_n)) 139 | else: 140 | return np.log(scipy.stats.multivariate_normal(phi,pro_later2[task_id-1]).pdf(phi_n)) 141 | 142 | #proposal sampling function for theta (truncated normal) 143 | def r_theta(theta): 144 | lower, upper, sd = 0, 1000, pro_theta_early[task_id-1] 145 | X = scipy.stats.truncnorm( 146 | (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd) 147 | return float(X.rvs(size=1)) 148 | #proposal density function for theta 149 | def d_theta(theta_n,theta): 150 | theta_n = np.array(theta_n) 151 | theta = np.array(theta) 152 | lower, upper, sd = 0, 1000, pro_theta_early[task_id-1] 153 | X = scipy.stats.truncnorm( 154 | (lower-theta)/sd,(upper-theta)/sd,loc=theta,scale=sd) 155 | return sum(np.log(X.pdf(theta_n))) 156 | 157 | 158 | #import data from file 159 | data = pd.read_csv('EpiData(test).csv',encoding='utf-8',header=0) 160 | #extract coordinates of locations 161 | location = data[['x','y']].drop_duplicates(subset=['x','y']) 162 | #number of locations 163 | num_location = location.shape[0] 164 | #add main key to location table 165 | location['index']=range(num_location) 166 | 167 | #randomly select fitting subdata for each location of interest (cross validation) 168 | index_sel = [] 169 | 170 | for k in range(num_location): 171 | loc_foc = location.values[k][0:2] 172 | index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index 173 | num_sel = int(len(index_loc)*fitting_ratio) 174 | index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False))) 175 | 176 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight 177 | def kernel_weight(data_slice,loc_int,h): 178 | loc1=data_slice[0:2] 179 | if(is_eucliDis == True): 180 | dis = eucliDis(loc1,loc_int) 181 | else: 182 | dis = Haversine(loc1,loc_int) 183 | kern = G_kernel(dis,h) 184 | return(kern) 185 | 186 | #geographical weighting kernel matrix 187 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample. 188 | weight = [] 189 | theta_rep_num = np.zeros(shape=[num_location,num_location]) 190 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)]) 191 | for i in range(num_location): 192 | loc_int_inner = location.values[i][0:2] 193 | slice_weight = lambda x: kernel_weight(x,loc_int_inner,h) 194 | weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values)))) 195 | for j in range(num_location): 196 | theta_rep_num[i][j] = np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum() 197 | theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))]) 198 | 199 | theta_slice_ind = theta_slice_ind.astype(int) 200 | weight = np.array(weight) 201 | weight = weight + weight.T - np.eye(num_location) 202 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)] 203 | 204 | # joint log-density of negative binomial likelihood and prior given a certain location of interest. 
Vectorization for fast calculation 205 | def joint_like(data,loc_int,phi,theta): 206 | loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index']) 207 | theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int)) 208 | outcome = np.array(data.drop(index_sel[loc_ind])['outcome']) 209 | offset = np.array(data.drop(index_sel[loc_ind])['offset']) 210 | covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:]) 211 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 212 | result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))) 213 | result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 214 | return(result) 215 | 216 | def theta_like(subdata,loc_ind,phi,theta): 217 | theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int)) 218 | outcome = np.array(subdata['outcome']) 219 | offset = np.array(subdata['offset']) 220 | covariate = np.array(subdata.iloc[:, 4:]) 221 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 222 | theta_like_value = geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))) 223 | result = np.array([sum(theta_like_value[theta_slice_ind[loc_ind][k]:theta_slice_ind[loc_ind][k+1]]) for k in range(num_location)]) 224 | result += np.array(list(map(prior_theta, theta))) 225 | return(result) 226 | 227 | 228 | 229 | #old code (discarded) 230 | #def weight_like(data_slice,loc_int,phi,theta,h): 231 | # loc1=data_slice[0:2] 232 | # theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index']) 233 | # dis = eucliDis(loc1,loc_int) 234 | # kern = G_kernel(dis,h) 235 | # return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind])) 236 | 237 | 238 | #def joint_like(data,loc_int,phi,theta,h): 239 | # slice_like = lambda x: weight_like(x,loc_int,phi,theta,h) 240 | # #result = sum( data.apply(slice_like,axis=1) ) 241 | # result = sum(map(slice_like,data.values)) 242 | # result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 243 | # return(result) 244 | 245 | #init=[[1,1],[1]*num_location] 246 | 247 | 248 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals), 249 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info". 250 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. 
See following function "GWR_MCMC_multloc" 251 | if is_block: 252 | def GWR_update(model_info): 253 | phi_old = model_info[0] 254 | theta_old = model_info[1] 255 | loc_int = model_info[2] 256 | loc_ind = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index']) 257 | subdata = data.drop(index_sel[loc_ind]) 258 | joint_old = model_info[3] 259 | accept_num = model_info[5] 260 | theta_focus = int(location.loc[(location['x']==loc_int[0]) & (location['y']==loc_int[1])]['index']) 261 | phi_new = r_phi(phi_old) 262 | theta_new = list(map(r_theta,theta_old)) 263 | joint_new_phi = joint_like(data,loc_int,phi_new,theta_old) 264 | rate_phi = joint_new_phi + d_phi(phi_old,phi_new) - joint_old - d_phi(phi_new,phi_old) 265 | alfa_phi = min(1,np.exp(rate_phi)) 266 | runif = np.random.uniform(0,1,1)[0] 267 | phi_old = phi_new if runif < alfa_phi else phi_old 268 | accept_num = (accept_num + 1) if runif 1000: 84 | return np.log(0) 85 | else: 86 | return np.log(1/2000)*len(phi) 87 | 88 | #log-prior for theta (uniform 0,1000) 89 | def prior_theta(theta): 90 | if theta>1000 or theta<=0: 91 | return np.log(0) 92 | else: 93 | return np.log(1/1000) 94 | 95 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi) 96 | #pro_st = np.array([[ 0.19800107, -0.03503183, -0.0323151], 97 | # [-0.03503183, 0.03827607, -0.00078348], 98 | # [-0.0323151 , -0.00078348, 0.00733836]]) 99 | 100 | pro_st = np.array([[ 0.199212513, -0.00319569314, -0.037421612], 101 | [-0.00319569314, 0.0079491569, 0.000190719635], 102 | [-0.037421612, 0.000190719635, 0.00757105334]]) 103 | 104 | #two step adatpive proposal sd: 105 | #aggressive proposal sd for phi to approximate true value before burn_in 106 | pro_early = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)] 107 | #mild proposal sd for phi to achieve a good mixture after burn_in 108 | pro_later = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10)] 109 | 110 | #aggressive proposal sd for theta to approximate true value before burn_in 111 | pro_theta_early = [0.05,0.03,0.03,0.03,0.015,0.015,0.005,0.004,0.004,0.004] 112 | #mild proposal sd for theta to achieve a good mixture after burn_in 113 | pro_theta_later = [0.05,0.015,0.015,0.015,0.005,0.005,0.005,0.004,0.004,0.004] 114 | 115 | #proposal sampling function for phi (multivariate normal) 116 | def r_phi(phi): 117 | phi = np.array(phi) 118 | phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id]).rvs(1) 119 | return phi_n 120 | #proposal density function for phi 121 | def d_phi(phi_n,phi): 122 | return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id]).pdf(phi_n)) 123 | 124 | #proposal sampling function for phi (multivariate normal) 125 | def r_phi_new(phi): 126 | phi = np.array(phi) 127 | phi_n = scipy.stats.multivariate_normal(phi,pro_later[task_id]).rvs(1) 128 | return phi_n 129 | #proposal density function for phi 130 | def d_phi_new(phi_n,phi): 131 | return np.log(scipy.stats.multivariate_normal(phi,pro_later[task_id]).pdf(phi_n)) 132 | 133 | #proposal sampling function for theta (truncated normal) 134 | def r_theta(theta): 135 | sd = pro_theta_early[task_id] 136 | out 
= np.random.normal(theta, sd, len(theta)) 137 | out[out<=0] = 0.001 138 | return out 139 | 140 | #proposal density function for theta 141 | def d_theta(theta_n,theta): 142 | sd = pro_theta_early[task_id] 143 | return sum(scipy.stats.norm.pdf(theta_n, theta, sd)) 144 | 145 | #proposal sampling function for theta (truncated normal) 146 | def r_theta_new(theta): 147 | sd = pro_theta_later[task_id] 148 | out = np.random.normal(theta, sd, len(theta)) 149 | out[out<=0] = 0.001 150 | return out 151 | 152 | #proposal density function for theta 153 | def d_theta_new(theta_n,theta): 154 | sd = pro_theta_later[task_id] 155 | return sum(scipy.stats.norm.pdf(theta_n, theta, sd)) 156 | 157 | 158 | 159 | #import data from file 160 | data = pd.read_csv('simulateDate.csv',encoding='utf-8',header=0) 161 | #extract coordinates of locations 162 | location = data[['x','y']].drop_duplicates(subset=['x','y']) 163 | #number of locations 164 | num_location = location.shape[0] 165 | #add main key to location table 166 | location['index']=range(num_location) 167 | 168 | #randomly select fitting subdata for each location of interest (cross validation) 169 | index_sel = [] 170 | 171 | for k in range(num_location): 172 | loc_foc = location.values[k][0:2] 173 | index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index 174 | num_sel = int(len(index_loc)*fitting_ratio) 175 | index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False))) 176 | 177 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight 178 | def kernel_weight(data_slice,loc_int,h): 179 | loc1=data_slice[0:2] 180 | if(is_eucliDis == True): 181 | dis = eucliDis(loc1,loc_int) 182 | else: 183 | dis = Haversine(loc1,loc_int) 184 | kern = G_kernel(dis,h) 185 | return(kern) 186 | 187 | #geographical weighting kernel matrix 188 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample. 
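For intuition: with the exponential kernel exp(-d^2/h^2), a sample at the location of interest itself (d = 0) gets weight 1, a sample at distance d = h gets weight exp(-1) ≈ 0.37, and a sample at d = 2h gets weight exp(-4) ≈ 0.02, so the bandwidth h directly controls how quickly information from neighbouring locations is discounted. Because pairwise distances are symmetric, the loop below fills only the upper triangle of "weight"; the full matrix is then recovered as weight + weight.T - np.eye(num_location), with the identity subtracted because the diagonal entries (d = 0, weight 1) would otherwise be counted twice.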
189 | weight = [] 190 | theta_rep_num = np.zeros(shape=[num_location,num_location]) 191 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)]) 192 | 193 | theta_slice_ind_full = [np.equal(data.iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum() for i in range(num_location)] 194 | theta_slice_ind_drop = [] 195 | for i in range(num_location): 196 | theta_slice_ind_drop.append(np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum()) 197 | for i in range(num_location): 198 | theta_rep_num[i] = theta_slice_ind_full 199 | theta_rep_num[i][i] = theta_slice_ind_drop[i] 200 | 201 | for i in range(num_location): 202 | loc_int_inner = location.values[i][0:2] 203 | slice_weight = lambda x: kernel_weight(x,loc_int_inner,h) 204 | weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values)))) 205 | theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))]) 206 | 207 | 208 | theta_slice_ind = theta_slice_ind.astype(int) 209 | weight = np.array(weight) 210 | weight = weight + weight.T - np.eye(num_location) 211 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)] 212 | 213 | 214 | #data index that is omitted due to very low weight: 215 | minimum_threshold = 10**(-1) 216 | drop_set = [] 217 | theta_num = np.zeros(num_location) 218 | for j in range(num_location): 219 | theta_num[j] = np.equal(data.iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum() 220 | for i in range(num_location): 221 | set_one = index_sel[i] 222 | full_geo_weight = np.repeat(weight[i],theta_num.astype(int)) 223 | set_two = np.where(full_geo_weight0): 242 | theta_expand = np.delete(theta_expand,drop_set[loc_ind]) 243 | outcome = np.array(data.drop(drop_set[loc_ind])['outcome']) 244 | offset = np.array(data.drop(drop_set[loc_ind])['offset']) 245 | covariate = np.array(data.drop(drop_set[loc_ind]).iloc[:, 4:]) 246 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 247 | result = sum(geo_weight_cut[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))) 248 | result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 249 | return(result) 250 | else: 251 | theta_expand = np.repeat(np.array(theta),theta_rep_num[loc_ind].astype(int)) 252 | outcome = np.array(data.drop(index_sel[loc_ind])['outcome']) 253 | offset = np.array(data.drop(index_sel[loc_ind])['offset']) 254 | covariate = np.array(data.drop(index_sel[loc_ind]).iloc[:, 4:]) 255 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 256 | result = sum(geo_weight[loc_ind]*(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean)))) 257 | result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 258 | return(result) 259 | 260 | 261 | 262 | def theta_like(subdata,loc_ind,phi,theta): 263 | theta_expand = np.repeat(theta,theta_rep_num[loc_ind][loc_ind].astype(int)) 264 | outcome = np.array(subdata['outcome']) 265 | offset = np.array(subdata['offset']) 266 | covariate = np.array(subdata.iloc[:, 4:]) 267 | mean = np.exp(np.log(offset) + np.array(list(map(sum,covariate*np.array(phi))))) 268 | result = 
sum(loggamma(outcome+1/theta_expand)-loggamma(1/theta_expand)-loggamma(outcome+1)-(1/theta_expand)*np.log(1+theta_expand*mean)+outcome*np.log(theta_expand*mean/(1+theta_expand*mean))) 269 | result += prior_theta(theta) 270 | return(result) 271 | 272 | 273 | 274 | #old code (discarded) 275 | #def weight_like(data_slice,loc_int,phi,theta,h): 276 | # loc1=data_slice[0:2] 277 | # theta_ind = int(location.loc[(location['x']==loc1[0]) & (location['y']==loc1[1])]['index']) 278 | # dis = eucliDis(loc1,loc_int) 279 | # kern = G_kernel(dis,h) 280 | # return(kern*negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta[theta_ind])) 281 | 282 | 283 | #def joint_like(data,loc_int,phi,theta,h): 284 | # slice_like = lambda x: weight_like(x,loc_int,phi,theta,h) 285 | # #result = sum( data.apply(slice_like,axis=1) ) 286 | # result = sum(map(slice_like,data.values)) 287 | # result += (prior_phi(phi) + sum(list(map(prior_theta, theta)))) 288 | # return(result) 289 | 290 | #init=[[1,1],[1]*num_location] 291 | 292 | 293 | # redefine weighted likelihood function for a single sample, this function is used to calculate the likelihood value of samples from testing set (cross validation). 294 | def weight_like_s(data_slice,phi,theta): 295 | return(negBion(data_slice[2],data_slice[3],data_slice[4:],phi,theta)) 296 | 297 | #Given necessary model information "model_info" (i.e., list of value of phi, value of theta, coordinates of location of interest, joint density, value of theta at location of interest, number of accepted proposals), 298 | #function "GWR_update" updates old "model_info" by one step metropolis hasting. The output is new "model_info". 299 | #Note that, "GWR_update" only update one location. Therefore, it will be applied in parallel for all locations. See following function "GWR_MCMC_multloc" 300 | if is_block: 301 | def GWR_update(model_info): 302 | phi_old = model_info[0] 303 | theta_old = model_info[1] 304 | loc_int = model_info[2] 305 | loc_ind = model_info[7] 306 | subdata = data.loc[(data['x']==loc_int[0]) & (data['y']==loc_int[1])] 307 | joint_old = model_info[3] 308 | accept_num = model_info[5] 309 | if_cut = model_info[6] 310 | phi_new = r_phi(phi_old) 311 | theta_new_ax = r_theta(theta_old) 312 | joint_new_phi = joint_like(data,loc_ind,phi_new,theta_new_ax,if_cut) 313 | rate_phi = joint_new_phi + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new_ax) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new_ax,theta_old) 314 | alfa_phi = min(1,np.exp(rate_phi)) 315 | runif = np.random.uniform(0,1,1)[0] 316 | update_sample = [phi_new,theta_new_ax,joint_new_phi,(accept_num + 1)] if runif < alfa_phi else [phi_old,theta_old,joint_old,accept_num] 317 | phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3] 318 | theta_new = r_theta(np.array([model_info[4]])) 319 | joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new) 320 | joint_old_theta = theta_like(subdata,loc_ind,phi_old,model_info[4]) 321 | rate_theta = joint_new_theta + d_theta(model_info[4],theta_new) - joint_old_theta - d_theta(theta_new,model_info[4]) 322 | alfa_theta = min(1,np.exp(rate_theta)) 323 | runif = np.random.uniform(0,1,1)[0] 324 | sto_theta = theta_new[0] if runif < alfa_theta else model_info[4] 325 | return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0]) 326 | 327 | 328 | def GWR_update_new(model_info): 329 | phi_old = model_info[0] 330 | theta_old = model_info[1] 331 | loc_int = model_info[2] 332 | loc_ind = 
model_info[7] 333 | subdata = data.loc[(data['x']==loc_int[0]) & (data['y']==loc_int[1])] 334 | joint_old = model_info[3] 335 | accept_num = model_info[5] 336 | if_cut = model_info[6] 337 | phi_new = r_phi_new(phi_old) 338 | theta_new_ax = r_theta_new(theta_old) 339 | joint_new_phi = joint_like(data,loc_ind,phi_new,theta_new_ax,if_cut) 340 | rate_phi = joint_new_phi + d_phi_new(phi_old,phi_new) + d_theta_new(theta_old,theta_new_ax) - joint_old - d_phi_new(phi_new,phi_old) - d_theta_new(theta_new_ax,theta_old) 341 | alfa_phi = min(1,np.exp(rate_phi)) 342 | runif = np.random.uniform(0,1,1)[0] 343 | update_sample = [phi_new,theta_new_ax,joint_new_phi,(accept_num + 1)] if runif < alfa_phi else [phi_old,theta_old,joint_old,accept_num] 344 | phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3] 345 | theta_new = r_theta_new(np.array([model_info[4]])) 346 | joint_new_theta = theta_like(subdata,loc_ind,phi_old,theta_new) 347 | joint_old_theta = theta_like(subdata,loc_ind,phi_old,model_info[4]) 348 | rate_theta = joint_new_theta + d_theta_new(model_info[4],theta_new) - joint_old_theta - d_theta_new(theta_new,model_info[4]) 349 | alfa_theta = min(1,np.exp(rate_theta)) 350 | runif = np.random.uniform(0,1,1)[0] 351 | sto_theta = theta_new[0] if runif < alfa_theta else model_info[4] 352 | return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0]) 353 | 354 | 355 | else: 356 | def GWR_update(model_info): 357 | phi_old = model_info[0] 358 | theta_old = model_info[1] 359 | loc_int = model_info[2] 360 | loc_ind = model_info[7] 361 | joint_old = model_info[3] 362 | accept_num = model_info[5] 363 | if_cut = model_info[6] 364 | phi_new = r_phi(phi_old) 365 | theta_new = r_theta(theta_old) 366 | joint_new = joint_like(data,loc_ind,phi_new,theta_new,if_cut) 367 | rate = joint_new + d_phi(phi_old,phi_new) + d_theta(theta_old,theta_new) - joint_old - d_phi(phi_new,phi_old) - d_theta(theta_new,theta_old) 368 | alfa = min(1,np.exp(rate)) 369 | runif = np.random.uniform(0,1,1)[0] 370 | update_sample = [phi_new,theta_new,joint_new,(accept_num + 1)] if runif < alfa else [phi_old,theta_old,joint_old,accept_num] 371 | phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3] 372 | sto_theta = theta_old[loc_ind] 373 | return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0]) 374 | 375 | def GWR_update_new(model_info): 376 | phi_old = model_info[0] 377 | theta_old = model_info[1] 378 | loc_int = model_info[2] 379 | loc_ind = model_info[7] 380 | joint_old = model_info[3] 381 | accept_num = model_info[5] 382 | if_cut = model_info[6] 383 | phi_new = r_phi_new(phi_old) 384 | theta_new = r_theta_new(theta_old) 385 | joint_new = joint_like(data,loc_ind,phi_new,theta_new,if_cut) 386 | rate = joint_new + d_phi_new(phi_old,phi_new) + d_theta_new(theta_old,theta_new) - joint_old - d_phi_new(phi_new,phi_old) - d_theta_new(theta_new,theta_old) 387 | alfa = min(1,np.exp(rate)) 388 | runif = np.random.uniform(0,1,1)[0] 389 | update_sample = [phi_new,theta_new,joint_new,(accept_num + 1)] if runif < alfa else [phi_old,theta_old,joint_old,accept_num] 390 | phi_old, theta_old, joint_old, accept_num = update_sample[0],update_sample[1],update_sample[2],update_sample[3] 391 | sto_theta = theta_old[loc_ind] 392 | return([list(phi_old),theta_old,loc_int,joint_old,sto_theta,accept_num,if_cut,model_info[7],0]) 393 | 394 | 395 | 396 | 397 | #initial value "init" 
(list of initial phi, initial theta, coordinates of location of interest, initial joint density) 398 | #init_phi = [2,1,1] 399 | #init_param = pd.read_csv('init_param.csv',encoding='utf-8',header=0) 400 | #init = [[[init_param.iloc[x]['phi0'],init_param.iloc[x]['phi1'],init_param.iloc[x]['phi2']],[init_param.iloc[x]['theta']]*num_location,list(location[['x','y']].values[x]),joint_like(data,x,[init_param.iloc[x]['phi0'],init_param.iloc[x]['phi1'],init_param.iloc[x]['phi2']],[init_param.iloc[x]['theta']]*num_location,True),init_param.iloc[x]['theta'],0,True,x,0] for x in range(len(location[['x','y']].values))] 401 | init_phi = [2.8421216,0.4794747,0.2232922] 402 | init_theta = 0.5 403 | init = [[init_phi,[init_theta]*num_location,list(location[['x','y']].values[x]),joint_like(data,x,init_phi,[init_theta]*num_location,True),init_theta,0,True,x,0] for x in range(len(location[['x','y']].values))] 404 | 405 | 406 | #MCMC updates of all location simultaneously (mignt be in parallel). 407 | def GWR_MCMC_multloc(init,num_iter,thin,burn_in): 408 | sto_phi = np.zeros([(num_iter-burn_in)//thin,num_location,len(init[0][0])]) #store posterior samples of phi 409 | sto_theta = np.zeros([(num_iter-burn_in)//thin,num_location,1]) #store posterior samples of theta 410 | iter_param = init 411 | #loglik_sum = np.zeros(num_location) #store the log-likelihood of testing set for all locations 412 | #elpd = np.zeros(num_location) #store the estimated elpd for all locations 413 | for i in range(num_iter): 414 | if( (i<=(burn_in-1)) | ((i+1) % thin !=0)): 415 | if(is_para): 416 | iter_param = list(pool.map(GWR_update,iter_param)) #one step metropolis hasting update for all locations in parallel 417 | else: 418 | iter_param = list(map(GWR_update,iter_param)) #one step metropolis hasting update for all locations 419 | if((i+1)%100==0): 420 | accept_rate = np.mean(np.array([iter_param[s][5] for s in range(num_location)])/i) 421 | print('The average acceptance rate is: {rate}'.format(rate=accept_rate),flush=True) 422 | 423 | if((i+1) % thin == 0): 424 | print('{0}% complete.'.format((i+1)*100/num_iter), flush=True) 425 | else: 426 | if(i%20==0): 427 | for s in range(num_location): 428 | iter_param[s][6] = False 429 | drop_like = np.array([iter_param[q][3] for q in range(num_location)]) 430 | 431 | if(is_para): 432 | iter_param = list(pool.map(GWR_update_new,iter_param)) #one step metropolis hasting update for all locations in parallel 433 | else: 434 | iter_param = list(map(GWR_update_new,iter_param)) #one step metropolis hasting update for all locations 435 | 436 | accept_rate = np.mean(np.array([iter_param[s][5] for s in range(num_location)])/i) #calculate the mean acceptance rate 437 | sto_phi[((i+1-burn_in)//thin) - 1] = np.array([iter_param[s][0] for s in range(num_location)]) 438 | sto_theta[((i+1-burn_in)//thin) - 1] = np.array([[iter_param[s][4]] for s in range(num_location)]) 439 | 440 | if(i%20==0): 441 | #print(sto_phi[((i+1-burn_in)//thin) -1], flush=True) 442 | #print('Theta at 800 is: {theta}'.format(theta=sto_theta[((i+1-burn_in)//thin) - 1][800]),flush=True) 443 | for s in range(num_location): 444 | iter_param[s][6] = True 445 | full_like = np.array([iter_param[q][3] for q in range(num_location)]) 446 | print('The approximation rate is: {rate}'.format(rate=np.mean(drop_like/full_like)),flush=True) 447 | print('{0}% complete. 
The average acceptance rate is: {rate}'.format((i+1)*100/num_iter, rate=accept_rate), flush=True) 448 | result = {'phi':sto_phi,'theta':sto_theta} 449 | return(result) 450 | 451 | 452 | time_one = datetime.now() 453 | if __name__ == '__main__': 454 | pool = Pool(processes=num_core) 455 | re=GWR_MCMC_multloc(init,4000,1,1000) 456 | time_two = datetime.now() 457 | 458 | print(time_two-time_one) #time used for MCMC updates 459 | 460 | est_phi=sum(re['phi'])/re['phi'].shape[0] #posterior estimation of phi (posterior mean) 461 | est_theta=sum(re['theta'])/re['theta'].shape[0] ##posterior estimation of theta (posterior mean) 462 | 463 | trace = np.zeros(shape=[re['phi'].shape[0],re['phi'].shape[2]+re['theta'].shape[2]]) #trace record of posterior samples (central location only) 464 | for k in range(re['phi'].shape[0]): 465 | trace[k][0:re['phi'].shape[2]] = re['phi'][k][re['phi'][0].shape[0]//2+19] 466 | trace[k][re['phi'].shape[2]:] = re['theta'][k][re['theta'][0].shape[0]//2+19] 467 | np.savetxt('trace'+str(h)+'rep'+str(rep_id)+'.csv',trace,delimiter=',') 468 | 469 | print(est_phi) 470 | np.savetxt("est_phi"+str(h)+'rep'+str(rep_id)+".csv", est_phi, delimiter=",") 471 | print(est_theta) 472 | np.savetxt("est_theta"+str(h)+'rep'+str(rep_id)+".csv", est_theta, delimiter=",") 473 | #print([h,re['ELPD']]) 474 | -------------------------------------------------------------------------------- /Simulation/BandwidthSelection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Feb 19 14:08:10 2022 4 | 5 | @author: Yang Liu 6 | """ 7 | 8 | 9 | import numpy as np 10 | import os 11 | import pandas as pd 12 | from scipy.special import loggamma 13 | import scipy.stats 14 | from datetime import datetime 15 | from multiprocessing import Pool 16 | 17 | file_id = int(os.getenv('SLURM_ARRAY_TASK_ID')) 18 | #print([task_id,type(task_id)],flush=True) 19 | 20 | task_id = (file_id-1)//5 21 | #task_id = (file_id-1) 22 | 23 | rep_id = file_id%5 24 | is_para = True #Using parallel computing? 25 | num_core = 30 #number of cores 26 | fitting_ratio = 0.5 #the proportion of samples used for fitting at the location of interest (i.e., splitting samples at location of interest into fitting set and testing set). 27 | is_eucliDis = True #Using Euclidian distance? 28 | is_block = False #block sampling? 29 | 30 | 31 | # set multiple cores 32 | #pool = ThreadPool(4) 33 | 34 | #geographical kernel bandwidth 35 | h = [0.0001,2,4,6,8,10,20,40,80,1000][task_id] 36 | 37 | print('h is {h}'.format(h=h), flush=True) 38 | 39 | #Geographically weighted kernel (exponential kernel) 40 | def G_kernel(d,h): 41 | return(np.exp(-d**2/h**2)) 42 | 43 | #euclidean distance 44 | def eucliDis(A,B): 45 | A = np.array(A) 46 | B = np.array(B) 47 | return np.sqrt(sum((A-B)**2)) 48 | 49 | #spherical distance (measured in KM) 50 | def Haversine(A,B): 51 | """ 52 | This uses the ‘haversine’ formula to calculate the great-circle distance between two points – that is, 53 | the shortest distance over the earth’s surface – giving an ‘as-the-crow-flies’ distance between the points 54 | (ignoring any hills they fly over, of course!). 55 | Haversine 56 | formula: a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2) 57 | c = 2 ⋅ atan2( √a, √(1−a) ) 58 | d = R ⋅ c 59 | where φ is latitude, λ is longitude, R is earth’s radius (mean radius = 6,371km); 60 | note that angles need to be in radians to pass to trig functions! 
61 | """ 62 | lat1,lon1,lat2,lon2 = A[0],A[1],B[0],B[1] 63 | 64 | R = 6378.0088 65 | lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2]) 66 | 67 | dlat = lat2 - lat1 68 | dlon = lon2 - lon1 69 | a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2 70 | c = 2 * np.arctan2(a**0.5, (1-a)**0.5) 71 | d = R * c 72 | return round(d,4) 73 | 74 | #log-likelihood of negative binomial distribution 75 | def negBion(outcome,offset,covariate,phi,theta): 76 | mean = np.exp(np.log(offset)+sum(np.array(covariate)*np.array(phi))) 77 | result = loggamma(outcome+1/theta)-loggamma(1/theta)-loggamma(outcome+1)-(1/theta)*np.log(1+theta*mean)+outcome*np.log(theta*mean/(1+theta*mean)) 78 | return result 79 | 80 | #log-prior for phi (uniform) 81 | def prior_phi(phi): 82 | if max(abs(np.array(phi)))>1000: 83 | return np.log(0) 84 | else: 85 | return np.log(1/2000)*len(phi) 86 | 87 | #log-prior for theta (uniform 0,1000) 88 | def prior_theta(theta): 89 | if theta>1000 or theta<=0: 90 | return np.log(0) 91 | else: 92 | return np.log(1/1000) 93 | 94 | #baseline proposal sd for phi (proportional to estimated correlation matrix of phi) 95 | #pro_st = np.array([[ 0.19800107, -0.03503183, -0.0323151], 96 | # [-0.03503183, 0.03827607, -0.00078348], 97 | # [-0.0323151 , -0.00078348, 0.00733836]]) 98 | 99 | pro_st = np.array([[ 0.199212513, -0.00319569314, -0.037421612], 100 | [-0.00319569314, 0.0079491569, 0.000190719635], 101 | [-0.037421612, 0.000190719635, 0.00757105334]]) 102 | 103 | #two step adatpive proposal sd: 104 | #aggressive proposal sd for phi to approximate true value before burn_in 105 | pro_early = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st),np.dot(pro_st,pro_st)] 106 | #mild proposal sd for phi to achieve a good mixture after burn_in 107 | pro_later = [np.dot(pro_st*2,pro_st*2),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/5,pro_st/5),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10),np.dot(pro_st/10,pro_st/10)] 108 | 109 | #aggressive proposal sd for theta to approximate true value before burn_in 110 | pro_theta_early = [0.05,0.03,0.03,0.03,0.015,0.015,0.005,0.004,0.004,0.004] 111 | #mild proposal sd for theta to achieve a good mixture after burn_in 112 | pro_theta_later = [0.05,0.015,0.015,0.015,0.005,0.005,0.005,0.004,0.004,0.004] 113 | 114 | #proposal sampling function for phi (multivariate normal) 115 | def r_phi(phi): 116 | phi = np.array(phi) 117 | phi_n = scipy.stats.multivariate_normal(phi,pro_early[task_id]).rvs(1) 118 | return phi_n 119 | #proposal density function for phi 120 | def d_phi(phi_n,phi): 121 | return np.log(scipy.stats.multivariate_normal(phi,pro_early[task_id]).pdf(phi_n)) 122 | 123 | #proposal sampling function for phi (multivariate normal) 124 | def r_phi_new(phi): 125 | phi = np.array(phi) 126 | phi_n = scipy.stats.multivariate_normal(phi,pro_later[task_id]).rvs(1) 127 | return phi_n 128 | #proposal density function for phi 129 | def d_phi_new(phi_n,phi): 130 | return np.log(scipy.stats.multivariate_normal(phi,pro_later[task_id]).pdf(phi_n)) 131 | 132 | #proposal sampling function for theta (truncated normal) 133 | def r_theta(theta): 134 | sd = pro_theta_early[task_id] 135 | out = np.random.normal(theta, sd, len(theta)) 136 | out[out<=0] = 0.001 137 | return out 138 | 139 | #proposal density 
function for theta 140 | def d_theta(theta_n,theta): 141 | sd = pro_theta_early[task_id] 142 | return sum(scipy.stats.norm.pdf(theta_n, theta, sd)) 143 | 144 | #proposal sampling function for theta (truncated normal) 145 | def r_theta_new(theta): 146 | sd = pro_theta_later[task_id] 147 | out = np.random.normal(theta, sd, len(theta)) 148 | out[out<=0] = 0.001 149 | return out 150 | 151 | #proposal density function for theta 152 | def d_theta_new(theta_n,theta): 153 | sd = pro_theta_later[task_id] 154 | return sum(scipy.stats.norm.pdf(theta_n, theta, sd)) 155 | 156 | 157 | 158 | #import data from file 159 | data = pd.read_csv('simulateDate.csv',encoding='utf-8',header=0) 160 | #extract coordinates of locations 161 | location = data[['x','y']].drop_duplicates(subset=['x','y']) 162 | #number of locations 163 | num_location = location.shape[0] 164 | #add main key to location table 165 | location['index']=range(num_location) 166 | 167 | #randomly select fitting subdata for each location of interest (cross validation) 168 | index_sel = [] 169 | 170 | for k in range(num_location): 171 | loc_foc = location.values[k][0:2] 172 | index_loc = data[(data['x']==loc_foc[0]) & (data['y']==loc_foc[1])].index 173 | num_sel = int(len(index_loc)*fitting_ratio) 174 | index_sel.append(np.sort(np.random.choice(index_loc,size=num_sel,replace=False))) 175 | 176 | #given a observation (data_slice) and a location of interest (loc_int), this function calculate the geographical weight 177 | def kernel_weight(data_slice,loc_int,h): 178 | loc1=data_slice[0:2] 179 | if(is_eucliDis == True): 180 | dis = eucliDis(loc1,loc_int) 181 | else: 182 | dis = Haversine(loc1,loc_int) 183 | kern = G_kernel(dis,h) 184 | return(kern) 185 | 186 | #geographical weighting kernel matrix 187 | #In this part, we store all geographical weights in matrix (or list) "geo_weight" to avoid redundant calculation. The number of rows is equal to the number of location of interest. At each location of interest (e.g., each row), we calculate the geographical weight for each sample. 
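Two bookkeeping arrays are built alongside the weights: theta_rep_num[i][j] counts how many data rows belong to location j when location i is the location of interest (for j = i the held-out rows in index_sel[i] are excluded), and theta_slice_ind[i] stores the cumulative offsets of those per-location blocks. Repeating weight[i] according to theta_rep_num[i] then gives geo_weight[i], one geographical weight for every remaining data row (the rows are assumed to be grouped by location, in the order the locations first appear).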
188 | weight = [] 189 | theta_rep_num = np.zeros(shape=[num_location,num_location]) 190 | theta_slice_ind = np.zeros(shape=[num_location,(num_location+1)]) 191 | 192 | theta_slice_ind_full = [np.equal(data.iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum() for i in range(num_location)] 193 | theta_slice_ind_drop = [] 194 | for i in range(num_location): 195 | theta_slice_ind_drop.append(np.equal(data.drop(index_sel[i]).iloc[:,:2].values, location[['x','y']].values[i]).all(axis=1).sum()) 196 | for i in range(num_location): 197 | theta_rep_num[i] = theta_slice_ind_full 198 | theta_rep_num[i][i] = theta_slice_ind_drop[i] 199 | 200 | for i in range(num_location): 201 | loc_int_inner = location.values[i][0:2] 202 | slice_weight = lambda x: kernel_weight(x,loc_int_inner,h) 203 | weight.append(np.array([0]*i+list(map(slice_weight,data.drop_duplicates(subset=['x','y']).iloc[i:,:].values)))) 204 | theta_slice_ind[i] = np.append(0,[sum(theta_rep_num[i][:(k+1)]) for k in range(len(theta_rep_num[i]))]) 205 | 206 | 207 | theta_slice_ind = theta_slice_ind.astype(int) 208 | weight = np.array(weight) 209 | weight = weight + weight.T - np.eye(num_location) 210 | geo_weight = [np.repeat(weight[i],theta_rep_num[i].astype(int)) for i in range(num_location)] 211 | 212 | 213 | #data index that is omitted due to very low weight: 214 | minimum_threshold = 10**(-1) 215 | drop_set = [] 216 | theta_num = np.zeros(num_location) 217 | for j in range(num_location): 218 | theta_num[j] = np.equal(data.iloc[:,:2].values, location[['x','y']].values[j]).all(axis=1).sum() 219 | for i in range(num_location): 220 | set_one = index_sel[i] 221 | full_geo_weight = np.repeat(weight[i],theta_num.astype(int)) 222 | set_two = np.where(full_geo_weight