├── GGPR_ESDA_Chicago.ipynb ├── GGPR_ESDA_UK.ipynb ├── GGPR_Prediction.ipynb ├── README.md ├── Simulated_data.ipynb ├── chicago.geojson ├── geoshapley.py ├── images └── example.txt └── leave_data.geojson /GGPR_Prediction.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyMdGhJSm7/HA4lmhHDBvV8k"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["import numpy as np\n","from sklearn.gaussian_process import GaussianProcessRegressor\n","from sklearn.gaussian_process.kernels import Kernel, StationaryKernelMixin, NormalizedKernelMixin\n","import geopandas as gpd\n","from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.model_selection import train_test_split\n","from scipy.optimize import fmin_l_bfgs_b\n","import warnings\n","warnings.filterwarnings(\"ignore\")\n","\n","class SpatialSimilarityKernel(StationaryKernelMixin, NormalizedKernelMixin, Kernel):\n"," \"\"\"\n"," Spatial Similarity kernel class that extends scikit-learn's Gaussian Process kernels.\n"," Inherits from StationaryKernelMixin (for stationarity) and NormalizedKernelMixin.\n"," \"\"\"\n","\n"," def __init__(self, P_function, sigma_values, epsilon=0.0001):\n"," \"\"\"\n"," Initializes the spatial similarity kernel.\n"," :param P_function: An array of weights that will be used to combine the exponential terms (feature importance).\n"," :param sigma_values: An array of variance estimates for each feature, used in the exponential decay.\n"," :param epsilon: A small value added to diagonal elements for numerical stability.\n"," \"\"\"\n"," self.P_function = P_function\n"," self.sigma_values = sigma_values\n"," self.epsilon = epsilon\n","\n"," def _calculate_similarity(self, u, v):\n"," \"\"\"\n"," Computes element-wise exponential similarity between two matrices (u, v).\n"," Exponential decay is applied based on the squared differences scaled by sigma_values.\n","\n"," :param u: Feature matrix X (shape [n_samples, n_features])\n"," :param v: Feature matrix Y (shape [m_samples, n_features])\n"," :return: An array E_i containing spatial similarity for each feature.\n"," \"\"\"\n"," # Add extra dimensions to broadcast easily\n"," u = u[:, np.newaxis, :]\n"," v = v[np.newaxis, :, :]\n","\n"," # Compute squared differences between each pair of points\n"," sq_diff = (u - v) ** 2\n","\n"," # Calculate exponential decay for each feature using sigma_values\n"," E_i = np.exp(-((sq_diff) / (2 * (self.sigma_values))))\n"," return E_i\n","\n"," def __call__(self, X, Y=None, eval_gradient=False):\n"," \"\"\"\n"," Main function used by GaussianProcessRegressor to compute the kernel matrix.\n","\n"," :param X: Feature matrix of shape [n_samples, n_features]\n"," :param Y: (Optional) Another feature matrix. 
If None, use X for both.\n"," :param eval_gradient: (Unused) Determines if gradient is computed, not used here.\n"," :return: The kernel similarity matrix of shape [n_samples, m_samples].\n"," \"\"\"\n"," if Y is None:\n"," Y = X\n","\n"," # Get exponential similarities feature-wise\n"," E_i = self._calculate_similarity(X, Y)\n","\n"," # Weighted average across features using P_function as weights\n"," S_uv = np.average(E_i, axis=2, weights=self.P_function)\n"," return S_uv\n","\n"," def diag(self, X):\n"," \"\"\"\n"," Returns the diagonal of the kernel matrix for a given X.\n"," Includes a small epsilon for numerical stability.\n"," \"\"\"\n"," return np.diag(np.ones(X.shape[0])) + self.epsilon\n","\n"," def is_stationary(self):\n"," \"\"\"\n"," Indicates that the kernel is stationary (depends on distances, not absolute positions).\n"," \"\"\"\n"," return True\n","\n","\n","def loss_function(P_function, X_train, y_train, sigma_values, alpha=1e-0):\n"," \"\"\"\n"," Defines a loss function for optimization based on Gaussian Process Regressor predictions.\n","\n"," :param P_function: Current set of feature weights to be evaluated.\n"," :param X_train: Training features.\n"," :param y_train: Training target values.\n"," :param sigma_values: Array of variances used in the exponential similarity calculation.\n"," :param alpha: Regularization parameter for the Gaussian Process Regressor.\n"," :return: RMSE (root mean squared error) for the given set of feature weights.\n"," \"\"\"\n"," # Instantiate the spatial similarity kernel with the current P_function\n"," kernel = SpatialSimilarityKernel(P_function=P_function, sigma_values=sigma_values)\n"," # Create a GPR model with the spatial similarity kernel\n"," gpr = GaussianProcessRegressor(kernel=kernel, alpha=alpha, optimizer=None)\n"," # Fit the model on the training set\n"," gpr.fit(X_train, y_train)\n"," # Predict on the same training data\n"," y_pred = gpr.predict(X_train, return_std=False)\n"," # Compute the RMSE for this set of predictions\n"," rmse = mean_squared_error(y_train, y_pred, squared=False)\n"," return rmse\n","\n","def objective(P_function):\n"," \"\"\"\n"," Wrapper function for the optimizer (fmin_l_bfgs_b).\n"," Returns the RMSE for the current P_function, which the optimizer tries to minimize.\n"," \"\"\"\n"," return loss_function(P_function, X_train, y_train, sigma_values, alpha=1e-0)\n","\n","# Load data\n","chicago = gpd.read_file(\"/data/chicago_scv.geojson\")\n","chicago['pct_white'] = 1 - chicago['pct_nonwhi']\n","chicago['PD_log'] = np.log(chicago['population'])\n","y = np.log(chicago['TripCount'].values)\n","\n","# Select features\n","X_vars = ['pct_white','pct_bachel', 'pct_no_veh','PD_log','job_entrop','TripMiles_']\n","X = chicago[X_vars]\n","\n","# Standardize the feature matrix\n","scaler = StandardScaler()\n","X_scaled = scaler.fit_transform(X)\n","\n","# Split data into training and test sets\n","X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.9, random_state=999)\n","\n","# Compute the variance of each feature in the training set\n","sigma_values = np.array(np.var(X_train, axis=0))\n","\n","# Initialize P_function (weights) for all features\n","initial_P_function = np.ones(len(X_vars)) * 1\n","\n","# Set bounds for each weight (between 0 and 5)\n","bounds = [(0, 5)] * len(X_vars)\n","\n","# Use L-BFGS-B to optimize the objective function\n","optimized_P_function, min_loss, info = fmin_l_bfgs_b(\n"," objective,\n"," initial_P_function,\n"," bounds=bounds,\n"," 
approx_grad=True\n",")\n","\n","print(\"Optimized P_function: \", optimized_P_function)\n","\n","# Build a GaussianProcessRegressor using the optimized weights\n","kernel = SpatialSimilarityKernel(P_function=optimized_P_function, sigma_values=sigma_values)\n","gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-1, optimizer=None)\n","\n","# Fit the final model on the training data\n","gpr.fit(X_train, y_train)\n","\n","# Predict on the test set\n","y_pred = gpr.predict(X_test, return_std=False)\n","\n","# Compute metrics (MSE, MAE, MAPE, R²) on the test set\n","mse = mean_squared_error(y_test, y_pred)\n","mae = mean_absolute_error(y_test, y_pred)\n","mape = mean_absolute_percentage_error(y_test, y_pred)\n","r2 = r2_score(y_test, y_pred)\n","r2 = round(r2, 4)\n","\n","print(\"MSE: \", mse.round(4))\n","print(\"MAE: \", mae.round(4))\n","print(\"MAPE: \", mape.round(4))\n","print(\"R² Score: \", r2)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"SJJbkDe7CL0s","executionInfo":{"status":"ok","timestamp":1732718045232,"user_tz":-480,"elapsed":3896,"user":{"displayName":"Joseph Jiao","userId":"15208460091880993566"}},"outputId":"11b6fb90-3a10-4894-fba8-6d0b82683789"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Optimized P_function: [0.04688675 2.24758405 0.43979082 0.71099497 0.12188657 1.72973757]\n","MSE: 0.6014\n","MAE: 0.5837\n","MAPE: 0.0661\n","R² Score: 0.8223\n"]}]},{"cell_type":"code","source":["import numpy as np\n","from sklearn.gaussian_process import GaussianProcessRegressor\n","from sklearn.gaussian_process.kernels import Kernel, StationaryKernelMixin, NormalizedKernelMixin\n","import geopandas as gpd\n","from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.model_selection import train_test_split\n","from scipy.optimize import fmin_l_bfgs_b\n","import warnings\n","warnings.filterwarnings(\"ignore\")\n","\n","class SpatialSimilarityKernel(StationaryKernelMixin, NormalizedKernelMixin, Kernel):\n"," \"\"\"\n"," Spatial Similarity kernel class that extends scikit-learn's Gaussian Process kernels.\n"," Inherits from StationaryKernelMixin (for stationarity) and NormalizedKernelMixin.\n"," \"\"\"\n","\n"," def __init__(self, P_function, sigma_values, epsilon=0.0001):\n"," \"\"\"\n"," Initializes the spatial similarity kernel.\n"," :param P_function: An array of weights that will be used to combine the exponential terms (feature importance).\n"," :param sigma_values: An array of variance estimates for each feature, used in the exponential decay.\n"," :param epsilon: A small value added to diagonal elements for numerical stability.\n"," \"\"\"\n"," self.P_function = P_function\n"," self.sigma_values = sigma_values\n"," self.epsilon = epsilon\n","\n"," def _calculate_similarity(self, u, v):\n"," \"\"\"\n"," Computes element-wise exponential similarity between two matrices (u, v).\n"," Exponential decay is applied based on the squared differences scaled by sigma_values.\n","\n"," :param u: Feature matrix X (shape [n_samples, n_features])\n"," :param v: Feature matrix Y (shape [m_samples, n_features])\n"," :return: An array E_i containing spatial similarity for each feature.\n"," \"\"\"\n"," # Add extra dimensions to broadcast easily\n"," u = u[:, np.newaxis, :]\n"," v = v[np.newaxis, :, :]\n","\n"," # Compute squared differences between each pair of points\n"," sq_diff = (u - v) ** 2\n","\n"," # Calculate 
exponential decay for each feature using sigma_values\n"," E_i = np.exp(-((sq_diff) / (2 * (self.sigma_values))))\n"," return E_i\n","\n"," def __call__(self, X, Y=None, eval_gradient=False):\n"," \"\"\"\n"," Main function used by GaussianProcessRegressor to compute the kernel matrix.\n","\n"," :param X: Feature matrix of shape [n_samples, n_features]\n"," :param Y: (Optional) Another feature matrix. If None, use X for both.\n"," :param eval_gradient: (Unused) Determines if gradient is computed, not used here.\n"," :return: The kernel similarity matrix of shape [n_samples, m_samples].\n"," \"\"\"\n"," if Y is None:\n"," Y = X\n","\n"," # Get exponential similarities feature-wise\n"," E_i = self._calculate_similarity(X, Y)\n","\n"," # Weighted average across features using P_function as weights\n"," S_uv = np.average(E_i, axis=2, weights=self.P_function)\n"," return S_uv\n","\n"," def diag(self, X):\n"," \"\"\"\n"," Returns the diagonal of the kernel matrix for a given X.\n"," Includes a small epsilon for numerical stability.\n"," \"\"\"\n"," return np.diag(np.ones(X.shape[0])) + self.epsilon\n","\n"," def is_stationary(self):\n"," \"\"\"\n"," Indicates that the kernel is stationary (depends on distances, not absolute positions).\n"," \"\"\"\n"," return True\n","\n","\n","def loss_function(P_function, X_train, y_train, sigma_values, alpha=1e-1):\n"," \"\"\"\n"," Defines a loss function for optimization based on Gaussian Process Regressor predictions.\n","\n"," :param P_function: Current set of feature weights to be evaluated.\n"," :param X_train: Training features.\n"," :param y_train: Training target values.\n"," :param sigma_values: Array of variances used in the exponential similarity calculation.\n"," :param alpha: Regularization parameter for the Gaussian Process Regressor.\n"," :return: RMSE (root mean squared error) for the given set of feature weights.\n"," \"\"\"\n"," # Instantiate the spatial similarity kernel with the current P_function\n"," kernel = SpatialSimilarityKernel(P_function=P_function, sigma_values=sigma_values)\n","\n"," # Create a GPR model with the spatial similarity kernel\n"," gpr = GaussianProcessRegressor(kernel=kernel, alpha=alpha, optimizer=None)\n","\n"," # Fit the model on the training set\n"," gpr.fit(X_train, y_train)\n","\n"," # Predict on the same training data\n"," y_pred = gpr.predict(X_train, return_std=False)\n","\n"," # Compute the RMSE for this set of predictions\n"," rmse = mean_squared_error(y_train, y_pred, squared=False)\n"," return rmse\n","\n","def objective(P_function):\n"," \"\"\"\n"," Wrapper function for the optimizer (fmin_l_bfgs_b).\n"," Returns the RMSE for the current P_function, which the optimizer tries to minimize.\n"," \"\"\"\n"," return loss_function(P_function, X_train, y_train, sigma_values, alpha=1e-1)\n","\n","\n","# Load data\n","Berxit = gpd.read_file(\"/data/leave_data.geojson\")\n","y = Berxit['leave'].values\n","X_vars = ['to15','over65','lhosp','manu','badhealth','bornuk']\n","X = Berxit[X_vars]\n","\n","# Standardize feature matrix\n","scaler = StandardScaler()\n","X_scaled = scaler.fit_transform(X)\n","\n","# Standardize the target as well\n","y_scaler = StandardScaler()\n","y_scaled = y_scaler.fit_transform(y.reshape(-1, 1))\n","\n","# Split data into training and test sets\n","X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.9, random_state=42)\n","\n","# Compute the variance of each feature in the training set\n","sigma_values = np.array(np.var(X_train, axis=0))\n","\n","# 
Initialize P_function (weights) for all features\n","initial_P_function = np.ones(len(X_vars)) * 1\n","\n","# Set bounds for each weight (between 0 and 5)\n","bounds = [(0, 5)] * len(X_vars)\n","\n","# Use L-BFGS-B to optimize the objective function\n","optimized_P_function, min_loss, info = fmin_l_bfgs_b(\n"," objective,\n"," initial_P_function,\n"," bounds=bounds,\n"," approx_grad=True\n",")\n","\n","print(\"Optimized P_function: \", optimized_P_function)\n","\n","# Build a GaussianProcessRegressor using the optimized weights\n","kernel = SpatialSimilarityKernel(P_function=optimized_P_function, sigma_values=sigma_values)\n","gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-1, optimizer=None)\n","\n","# Fit the final model on the training data\n","gpr.fit(X_train, y_train)\n","\n","# Predict on the test set\n","y_pred = gpr.predict(X_test, return_std=False)\n","\n","# Compute metrics (MSE, MAE, MAPE, R²) on the test set\n","mse = mean_squared_error(y_test, y_pred)\n","mae = mean_absolute_error(y_test, y_pred)\n","mape = mean_absolute_percentage_error(y_test, y_pred)\n","r2 = r2_score(y_test, y_pred)\n","r2 = round(r2, 4)\n","\n","print(\"MSE: \", mse.round(4))\n","print(\"MAE: \", mae.round(4))\n","print(\"MAPE: \", mape.round(4))\n","print(\"R² Score: \", r2)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pebRrezACnT-","executionInfo":{"status":"ok","timestamp":1732718214740,"user_tz":-480,"elapsed":1200,"user":{"displayName":"Joseph Jiao","userId":"15208460091880993566"}},"outputId":"a453f58c-1f11-4f13-a0a4-2f46df83457f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Optimized P_function: [1.53809149 1.69200801 0.58865995 0.23002332 0.78079264 0.41889233]\n","MSE: 0.3735\n","MAE: 0.4637\n","MAPE: 1.7959\n","R² Score: 0.6193\n"]}]}]} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Geographical Gaussian Process Regression 2 | Geographical Gaussian Process Regression (GGPR): A Spatial Machine Learning Model Based on **Spatial Similarity**: 3 | 4 | defined as [A-Xing Zhu et al. (2018)](https://doi.org/10.1080/19475683.2018.1534890): *"The more similar geographic configurations of two points (areas), the more similar the values (processes) of the target variable at these two points (areas)".* 5 | 6 | GGPR has two functions: **spatial prediction** and **exploratory spatial data analysis (ESDA)**. For spatial prediction, GGPR uses spatial similarity as a kernel function to calibrate the GPR model and then predicts the values of unknown observations. To perform ESDA, we build on the spatial prediction function by adding a Matern kernel with spatial coordinates, as it effectively captures spatial dependence in spatial data. This approach offers two benefits. First, it allows GGPR to achieve explainability through GeoShapley (We strongly recommend using the uploaded version in our code). Second, GGPR can capture global geographic trends while effectively depicting local details and irregularities, thereby considering spatial dependence and spatial similarity to enhance GGPR’s capability in ESDA. 7 | 8 | 9 | 10 | ### Problem Statement 11 | 1.Most existing ML in spatial data can lead to overly optimistic performance and biased errors due to the conflict between spatial autocorrelation and the independent and identically distributed (i.i.d.) assumption. 
12 | 
13 | 2. Spatial ML models based on spatial dependence and spatial heterogeneity often require dense samples to satisfy the spatial stationarity assumption, making them inappropriate for small-sample prediction. 
14 | 
15 | 3. Few studies have offered explainable tools that interpret black-box ML models, capture spatial effects, and support spatial decision-making research. 
16 | 
17 | ### Abstract
18 | This study proposes a new spatial machine learning model called Geographical Gaussian Process Regression (GGPR). GGPR extends Gaussian Process Regression (GPR) by using the principle of spatial (geographic) similarity for calibration, and it is designed to conduct spatial prediction and exploratory spatial data analysis (ESDA). GGPR addresses several key challenges in spatial machine learning. First, as a probabilistic model, GGPR avoids the conflict between spatial autocorrelation and the independent and identically distributed (i.i.d.) assumption, thus enhancing the model’s objectivity and reliability in spatial prediction. Second, GGPR is suitable for small-sample prediction, which most existing models can hardly handle. Finally, integrated with GeoShapley, GGPR is an explainable model that can measure spatial effects and explain its outcomes. Evaluated on two distinct datasets, GGPR demonstrates superior predictive performance compared to other popular machine learning models across various sampling ratios, with its advantage becoming particularly pronounced at smaller sampling ratios. As an ESDA model, GGPR demonstrates enhanced accuracy, better computational efficiency, and a comparable ability to measure spatial effects against both Multiscale Geographically Weighted Regression (MGWR) and Geographical Random Forests (GRF). In short, GGPR offers spatial data scientists a new method for predicting and exploring complex geographical processes. 
19 | 
20 | ### Spatial Prediction
21 | For spatial prediction, GGPR outperformed traditional GPR family models, Random Forests (RF), and eXtreme Gradient Boosting (XGBoost) across both datasets, with its advantages being particularly evident in small-sample predictions. 
22 | 
23 | The following figure illustrates the impact of noise on the performance of GGPR across the two datasets. To ensure model robustness, we recommend setting the noise level for GGPR at or around 1e-1 (0.1). 
24 | 
25 | 
26 | 
27 | ### Exploratory Spatial Data Analysis
28 | It is important to note that when used as an ESDA model, GGPR is employed to predict its own data (in-sample prediction) to maximally learn spatial patterns and capture geographic phenomena, a practice similar to that of MGWR and GRF. To avoid overfitting, we employed five-fold cross-validation to select the best parameters for GGPR. We then refit the model on the entire dataset for in-sample prediction and used the GeoShapley Python package to explain the results, obtaining the distribution of GeoShapley values. 
29 | 
30 | As shown in the following figure, by comparing against the true spatially varying coefficients, we can observe that both MGWR and GGPR are capable of capturing spatial effects, and their coefficient estimates are very similar. Although our simulated dataset includes more noise than that of [Ziqi Li (2024)](https://github.com/Ziqi-Li/geoshapley), the two are essentially similar in nature. By comparing against the ability of the traditional machine learning models (GPR, XGBoost, RF, SVM) presented in [Ziqi Li (2024)](https://github.com/Ziqi-Li/geoshapley) to capture spatial effects, we observe that GGPR demonstrates significant advantages. This effectively highlights GGPR’s strength as a spatial machine learning model for understanding geographic processes. 
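
The sketch below illustrates this ESDA workflow end to end. It is a minimal example under stated assumptions rather than the exact notebook code: `SpatialSimilarityKernel`, `optimized_P_function`, and `sigma_values` come from the calibration step in `GGPR_Prediction.ipynb`; `CoordinateMatern` is a hypothetical helper introduced here so that the Matern kernel only sees the coordinate columns (the released notebooks may compose the kernels differently); and `X_geo` is assumed to be a DataFrame whose last two columns are the spatial coordinates, with response vector `y`.

```python
# Minimal GGPR ESDA sketch (assumptions flagged in the lead-in above).
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Kernel, Matern
from geoshapley import GeoShapleyExplainer  # geoshapley.py in this repository


class CoordinateMatern(Kernel):
    """Matern kernel evaluated on the last g (coordinate) columns only."""

    def __init__(self, length_scale=1.0, nu=1.5, g=2):
        self.length_scale = length_scale
        self.nu = nu
        self.g = g

    def __call__(self, X, Y=None, eval_gradient=False):
        base = Matern(length_scale=self.length_scale, nu=self.nu)
        Xc = np.asarray(X)[:, -self.g:]
        Yc = None if Y is None else np.asarray(Y)[:, -self.g:]
        return base(Xc, Yc)

    def diag(self, X):
        return np.ones(np.asarray(X).shape[0])

    def is_stationary(self):
        return True


# Spatial similarity kernel on the features plus a Matern kernel on the coordinates
kernel = (SpatialSimilarityKernel(P_function=optimized_P_function,
                                  sigma_values=sigma_values)
          + CoordinateMatern(g=2))
gpr = GaussianProcessRegressor(kernel=kernel, alpha=1e-1, optimizer=None)

# In-sample fit on the full dataset, as described above
gpr.fit(X_geo.values, y)

# Explain the fitted model with GeoShapley
explainer = GeoShapleyExplainer(gpr.predict, background=X_geo.values, g=2)
rslt = explainer.explain(X_geo)
rslt.summary_plot()
print(rslt.summary_statistics())
```

Keeping `optimizer=None` and `alpha=1e-1` mirrors the noise recommendation above: the feature weights are calibrated externally (via L-BFGS-B in the prediction notebook) rather than by the GPR's internal optimizer.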
31 | 
32 | 
33 | We present the explainable results for the referendum dataset below. 
34 | 
35 | 
36 | 
37 | 
38 | ### Dataset
39 | Two datasets were used to evaluate the performance of GGPR: ride-hailing service demand in Chicago [(Ziqi Li, 2022)](https://github.com/Ziqi-Li/SHAP_spatial_data_paper) and the referendum on EU membership in the UK [(Evan Odell, 2020)](https://cran.r-project.org/src/contrib/Archive/parlitools/). 
40 | 
41 | ### References
42 | To cite this paper: Zhenzhi Jiao & Ran Tao (2025). Geographical Gaussian Process Regression (GGPR): A Spatial Machine Learning Model Based on Spatial Similarity. Geographical Analysis. https://doi.org/10.1111/gean.12423 
43 | 
44 | Zhu, A. X., Lu, G., Liu, J., Qin, C. Z., & Zhou, C. (2018). Spatial prediction based on Third Law of Geography. Annals of GIS, 24(4), 225-240. 
45 | 
46 | Li, Z. (2024). GeoShapley: A game theory approach to measuring spatial effects in machine learning models. Annals of the American Association of Geographers, 1-21. 
47 | 
48 | Li, Z. (2022). Extracting spatial effects from machine learning model using local interpretation method: An example of SHAP and XGBoost. Computers, Environment and Urban Systems, 96, 101845. 
49 | 
50 | 
--------------------------------------------------------------------------------
/geoshapley.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm.auto import tqdm
4 | import scipy.special
5 | import itertools
6 | import matplotlib.pyplot as plt
7 | from math import factorial, ceil
8 | from joblib import Parallel, delayed
9 | 
10 | 
11 | class GeoShapleyExplainer:
12 |     def __init__(self, predict_f, background=None, g=2):
13 |         """
14 |         Initialize the GeoShapleyExplainer.
15 | 
16 |         predict_f: The predict function of the model to be explained.
17 |         background: The background data (numpy array) used for the explanation.
18 |         g: The number of location features in the data (default is 2). For example, if the feature set contains a pair of coordinates (lat, long), then g=2.
19 |         """
20 |         self.predict_f = predict_f
21 |         self.background = background
22 |         self.g = g
23 |         self.n, self.M = background.shape
24 | 
25 | 
26 |     def _kernel_geoshap_single(self, x, reference):
27 |         """
28 |         Calculate the GeoShapley value for a single sample against one reference point in the background data.
29 | 
30 |         x: current sample
31 |         reference: a reference point in the background data
32 |         """
33 | 
34 |         # e.g., with M = 4 total features and g = 2 location features, k = 2
35 | 
36 |         k = self.M - self.g
37 |         M = self.M
38 | 
39 |         Z = np.zeros((2**(k+1),2*k+2))
40 | 
41 |         # intercept column
42 |         Z[:,-1] = 1
43 | 
44 |         weights = np.zeros(2**(k+1))
45 | 
46 |         V = np.zeros((2**(k+1),M))
47 | 
48 |         for i in range(2**(k+1)):
49 |             V[i,:] = reference
50 | 
51 |         # Mark 1 for each member of each coalition
52 |         for i,s in enumerate(self._powerset(range(k+1))):
53 | 
54 |             s = list(s)
55 |             Z[i,s] = 1
56 |             V[i,s] = x[s]
57 | 
58 |             if k in s:  # If location is in the coalition, use the sample's location columns
59 |                 V[i, (k+1):] = x[(k+1):]
60 | 
61 |             if (len(s) > 1):  # mark location-feature interactions
62 |                 for j in s:
63 |                     if j < k:
64 |                         Z[i, k+1+j] = 1
65 | 
66 |             weights[i] = self._shapley_kernel(k+1, len(s))
67 |             #print("s:", s)
68 |             #print("Z:", Z[i,:])
69 | 
70 |         y = self.predict_f(V).reshape(-1)
71 | 
72 |         # Solve the weighted least squares for the GeoShapley components
73 |         #ZTw = np.dot(Z.T, np.diag(weights))
74 |         ZTw = Z.T * weights
75 | 
76 |         phi = np.linalg.solve(np.dot(ZTw, Z), np.dot(ZTw, y))
77 | 
78 |         return phi
79 | 
80 | 
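    # Worked illustration of _kernel_geoshap_single above (annotation, not part
    # of the original file): with k = 2 non-location features, the coalition
    # players are {x0, x1, GEO} and Z has 2*k + 2 = 6 columns:
    #   [x0, x1, GEO, x0*GEO, x1*GEO, intercept]
    # A few of the 2**(k+1) = 8 rows enumerated from the powerset:
    #   s = {}        -> [0, 0, 0, 0, 0, 1]
    #   s = {0}       -> [1, 0, 0, 0, 0, 1]
    #   s = {0, 2}    -> [1, 0, 1, 1, 0, 1]   (len(s) > 1 marks the x0*GEO column)
    #   s = {0, 1, 2} -> [1, 1, 1, 1, 1, 1]
    # Solving the weighted least squares then yields
    #   phi = [x0 primary, x1 primary, GEO, x0*GEO, x1*GEO, intercept].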
81 |     def _kernel_geoshap_all(self, x):
82 |         """
83 |         Calculate the GeoShapley value for a single sample, averaged over the background data.
84 | 
85 |         x: current sample
86 |         """
87 | 
88 |         k = self.M - self.g
89 |         n = self.n
90 | 
91 |         # phi layout: k primary feature effects +
92 |         # 1 intrinsic location (GEO) effect +
93 |         # k location-feature interaction effects +
94 |         # 1 intercept
95 | 
96 |         phi = np.zeros(k + k + 1 + 1)
97 | 
98 |         for i in range(n):
99 |             reference = self.background[i,:]
100 |             phi = phi + self._kernel_geoshap_single(x, reference)
101 | 
102 |         phi = phi/n
103 |         base_value = phi[-1]
104 |         geoshap_values = phi[:-1]
105 | 
106 |         return base_value, geoshap_values
107 | 
108 | 
109 | 
110 |     def explain(self, X_geo, n_jobs=-1):
111 |         """
112 |         Explain the data.
113 | 
114 |         X_geo: pandas dataframe to be explained
115 |         n_jobs: number of jobs for parallel computation (default is -1, using all available processors)
116 | 
117 |         return: A GeoShapleyResults object containing the results of the explanation.
118 |         """
119 | 
120 |         self.X_geo = X_geo
121 |         n,M = X_geo.shape
122 |         k = M - self.g
123 | 
124 |         geoshaps_total = np.zeros((n,(2*k + 1)))
125 | 
126 |         # Parallel computation over samples
127 |         results = Parallel(n_jobs=n_jobs)(delayed(self._kernel_geoshap_all)(X_geo.values[i, :]) for i in tqdm(range(n)))
128 | 
129 |         # Extract results
130 |         geoshaps_total = np.array([result[1] for result in results])
131 |         base_value = results[0][0]  # Assuming base_value is the same for all samples
132 | 
133 |         primary = geoshaps_total[:,:k]
134 |         geo = geoshaps_total[:,k]
135 |         geo_intera = geoshaps_total[:,(k+1):]
136 | 
137 | 
138 |         return GeoShapleyResults(self, base_value, primary, geo, geo_intera)
139 | 
140 | 
141 |     def _powerset(self, iterable):
142 |         """
143 |         Calculate all possible coalition sets
144 | 
145 |         """
146 |         s = list(iterable)
147 |         return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s)+1))
148 | 
149 | 
150 |     def _shapley_kernel(self, M, s):
151 |         """
152 |         Calculate the Shapley kernel weight
153 | 
154 |         M: number of features
155 |         s: number of features in the coalition
156 |         """
157 |         if s == 0 or s == M:
158 |             return 100000000  # large weight to effectively enforce the empty and full coalitions
159 |         return (M-1)/(scipy.special.binom(M,s)*s*(M-s))
160 | 
161 | 
162 | 
163 | 
164 | class GeoShapleyResults:
165 |     def __init__(self, explainer, base_value, primary, geo, geo_intera):
166 |         """
167 |         Initializes the GeoShapleyResults.
168 | 
169 |         explainer: The GeoShapleyExplainer that produced these results
170 |         base_value: The base value
171 |         primary: The primary global feature effects
172 |         geo: The intrinsic location effect
173 |         geo_intera: The interaction effects between location and other features
174 | 
175 |         """
176 |         self.base_value = base_value
177 |         self.primary = primary
178 |         self.geo = geo
179 |         self.geo_intera = geo_intera
180 |         self.explainer = explainer
181 |         self.predict_f = explainer.predict_f
182 |         self.X_geo = explainer.X_geo
183 |         self.g = explainer.g
184 |         self.M = explainer.M
185 |         self.background = explainer.background
186 | 
187 | 
188 |     def get_svc(self, col, coef_type="gwr", include_primary=False, coords=None):
189 |         """
190 |         Calculate the spatial (location-specific) coefficient for each feature
191 | 
192 |         col: list of column indices for which coefficients are calculated
193 |         coef_type:
194 |             "raw": raw coefficient based on the ratio of interaction effect and mean removed feature value.
195 |                    May result in extreme values.
196 |             "gwr": coefficient based on GWR smoothing. Requires mgwr package.
197 | 
198 |         include_primary: whether to include the primary effect in the spatial coefficient
199 |         coords: a numpy array of the coordinates of the data. If not provided, the last two columns of the data will be used as coordinates.
200 | 
201 |         """
202 | 
203 |         n,k = self.primary.shape
204 | 
205 |         params = np.zeros((n, k))
206 |         params[:,:] = self.geo_intera
207 | 
208 |         if include_primary:
209 |             params[:,:] = params[:,:] + self.primary
210 | 
211 |         for j in col:
212 |             if coef_type == "raw":
213 |                 params[:,j] = params[:,j] / (self.X_geo.values - self.X_geo.values.mean(axis=0))[:,j]
214 | 
215 |             if coef_type == "gwr":
216 |                 try:
217 |                     import mgwr
218 |                 except ImportError:
219 |                     print("Please install mgwr package (e.g., pip install mgwr)")
220 | 
221 |                 if coords is None:  # Assuming the last two columns are the coordinates
222 |                     coords = np.array(list(zip(self.X_geo.values[:,-2], self.X_geo.values[:,-1])))
223 | 
224 |                 y = params[:,j].reshape(-1,1)
225 |                 X = (self.X_geo.values - self.X_geo.values.mean(axis=0))[:,j].reshape(-1,1)
226 |                 gwr_selector = mgwr.sel_bw.Sel_BW(coords, y, X)
227 |                 gwr_bw = gwr_selector.search(bw_min=20)
228 |                 gwr_model = mgwr.gwr.GWR(coords, y, X, gwr_bw).fit()
229 |                 params[:,j] = gwr_model.params[:,1]
230 | 
231 |         return params[:,col]
232 | 
233 | 
234 |     def geoshap_to_shap(self):
235 |         """
236 |         Convert GeoShapley values to Shapley values.
237 |         This evenly redistributes each interaction effect between the corresponding feature-location pair.
238 | 
239 |         """
240 |         n,k = self.primary.shape
241 |         params = np.zeros((n, k+1))
242 | 
243 |         params[:,:-1] = self.primary + self.geo_intera/2
244 |         params[:,-1] = self.base_value + self.geo + np.sum(self.geo_intera/2,axis=1)
245 | 
246 |         return params
247 | 
248 | 
249 |     def summary_plot(self, include_interaction=True, dpi=200, **kwargs):
250 |         """
251 |         Generate a SHAP-style summary plot of the GeoShapley values.
252 | 
253 |         include_interaction: whether to include the interaction effect in the summary plot
254 |         dpi: figure dpi
255 |         kwargs: other arguments passed to shap.summary_plot
256 | 
257 |         """
258 | 
259 |         try:
260 |             import shap
261 |         except ImportError:
262 |             print("Please install shap package (e.g., pip install shap)")
263 | 
264 | 
265 |         names = self.X_geo.iloc[:,:-self.g].copy()
266 |         names["GEO"] = 0
267 | 
268 |         if include_interaction:
269 |             total = np.hstack((self.primary, self.geo.reshape(-1,1), self.geo_intera))
270 |             names[[name + " x GEO" for name in self.X_geo.columns[:-self.g]]] = self.X_geo.iloc[:,:-self.g].copy()
271 |         else:
272 |             total = self.geoshap_to_shap()
273 | 
274 |         plt.figure(dpi=dpi)
275 |         shap.summary_plot(total, names, show=False, **kwargs)
276 | 
277 |         fig, ax = plt.gcf(), plt.gca()
278 |         ax.set_xlabel("GeoShapley value (impact on model prediction)")
279 | 
280 | 
281 |     def partial_dependence_plots(self, gam_curve=False, max_cols=3, figsize=None, dpi=200, **kwargs):
282 |         """
283 |         Plot partial dependence plots for each feature.
284 | 
285 |         gam_curve: whether to plot the smoothed GAM curve
286 |         max_cols: maximum number of columns in the plot
287 |         figsize: figure size
288 |         dpi: figure dpi
289 |         kwargs: other arguments passed to plt.scatter
290 | 
291 |         """
292 | 
293 |         k = self.primary.shape[1]
294 | 
295 |         if gam_curve:
296 |             try:
297 |                 import pygam
298 |             except ImportError:
299 |                 print("Please install pygam package (e.g., pip install pygam)")
300 | 
301 |         num_cols = min(k, max_cols)
302 |         num_rows = ceil(k / num_cols)
303 | 
304 |         fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize)
305 |         axs = axs if num_rows > 1 else np.array([axs])
306 |         axs = axs.flatten()
307 | 
308 |         col_counter = 0
309 |         for col in range(k):
310 |             axs[col_counter].axhline(0, linestyle='--', color='black')
311 | 
312 |             if 's' not in kwargs:
313 |                 kwargs['s'] = 12
314 |             if 'color' not in kwargs:
315 |                 kwargs['color'] = "#2196F3"
316 |             if 'edgecolors' not in kwargs:
317 |                 kwargs['edgecolors'] = "white"
318 |             if 'lw' not in kwargs:
319 |                 kwargs['lw'] = 0.3
320 | 
321 |             axs[col_counter].scatter(self.X_geo.iloc[:,col], self.primary[:,col], **kwargs)
322 | 
323 |             axs[col_counter].set_ylabel("GeoShapley Value")
324 |             axs[col_counter].set_xlabel(self.X_geo.iloc[:,col].name)
325 | 
326 | 
327 |             if gam_curve:
328 |                 lam = np.logspace(2, 7, 5).reshape(-1,1)
329 |                 gam = pygam.LinearGAM(pygam.s(0), fit_intercept=False).gridsearch(self.X_geo.iloc[:,col].values.reshape(-1,1),
330 |                                                                                   self.primary[:,col].reshape(-1,1), lam=lam)
331 | 
332 |                 for i, term in enumerate(gam.terms):
333 |                     XX = gam.generate_X_grid(term=i)
334 |                     pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
335 | 
336 |                     axs[col_counter].plot(XX, pdep, color="red", lw=2)
337 | 
338 |             col_counter += 1
339 | 
340 |         for i in range(col_counter, num_rows * num_cols):
341 |             axs[i].axis('off')
342 | 
343 |         plt.tight_layout()
344 | 
345 | 
346 |     def summary_statistics(self, include_interaction=True):
347 |         """
348 |         Calculates summary statistics for the GeoShapley values.
349 |         The table is ranked based on the mean absolute value of the GeoShapley values.
350 | 
351 |         include_interaction: whether to include the interaction effect in the summary statistics
352 | 
353 |         """
354 |         cols = ["min","25%","50%","75%","max"]
355 |         summary_table = pd.DataFrame(np.percentile(self.primary, [0,25,50,75,100],axis=0).T, columns=cols)
356 |         summary_table.index = self.X_geo.columns[:-self.g]
357 |         summary_table["mean"] = np.mean(self.primary,axis=0)
358 |         summary_table["std"] = np.std(self.primary,axis=0)
359 |         summary_table["abs. mean"] = np.mean(np.abs(self.primary),axis=0)
360 | 
361 |         summary_table.loc['GEO'] = np.append(np.percentile(self.geo, [0,25,50,75,100],axis=0).T,
362 |                                              [np.mean(self.geo), np.std(self.geo),
363 |                                               np.mean(np.abs(self.geo))])
364 | 
365 | 
366 |         if include_interaction:
367 |             intera_summary_table = pd.DataFrame(np.percentile(self.geo_intera, [0,25,50,75,100],axis=0).T, columns=cols)
368 |             intera_summary_table.index = self.X_geo.columns[:-self.g] + " x GEO"
369 |             intera_summary_table["mean"] = np.mean(self.geo_intera,axis=0)
370 |             intera_summary_table["std"] = np.std(self.geo_intera,axis=0)
371 |             intera_summary_table["abs. mean"] = np.mean(np.abs(self.geo_intera),axis=0)
372 | 
373 |             summary_table = pd.concat([summary_table, intera_summary_table], ignore_index=False)
374 | 
375 |         summary_table.sort_values(by=['abs. mean'], ascending=False, inplace=True)
376 | 
377 |         return summary_table
378 | 
379 |     def check_additivity(self, atol=1e-5):
380 |         """
381 |         Check whether the separate components of GeoShapley add up to the model prediction,
382 |         i.e., prediction ≈ base_value + sum(primary) + geo + sum(geo_intera).
383 |         """
384 |         total = np.sum(self.primary,axis=1) + self.geo + np.sum(self.geo_intera,axis=1)
385 | 
386 |         print("Components add up to model prediction: ",
387 |               np.allclose(total+self.base_value, self.predict_f(self.X_geo).reshape(-1), atol=atol))
388 | 
--------------------------------------------------------------------------------
/images/example.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------