├── README.md
├── LICENSE
├── main.py
├── .gitignore
├── metrics.py
├── utils.py
├── fc_regression_baseline.py
├── training_environment.py
├── regression_model.py
├── layers.py
├── dataset_preparation.py
└── dataset.py

/README.md:
--------------------------------------------------------------------------------
1 | # Mobility-Flows-Neural-Networks
2 | 
3 | Code for the paper 'Learning Mobility Flows from Urban Features with Spatial Interaction Models and Neural Networks' presented at the 6th IEEE International Conference on Smart Computing (SMARTCOMP 2020).
4 | 
5 | Preprint available [here](https://arxiv.org/abs/2004.11924).
6 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Felix Opolka
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from regression_model import run_training 2 | from training_environment import TrainingEnvironment, NodeConvGraph, NodeConvType, JKType 3 | 4 | if __name__ == '__main__': 5 | hyperparameters = { 6 | # data set 7 | "data_base_path": "Data/London_high", 8 | "resampling": True, 9 | "n_quantiles": 5000, 10 | "num_bins": 4, 11 | "excluded_node_feature_columns": tuple(), 12 | "excluded_edge_feature_columns": tuple(), 13 | # model 14 | "hidden_dim": 16, 15 | "edge_rep_size": 8, 16 | "num_edge_rep_layers": 2, 17 | "include_node_reps": True, 18 | "node_rep_size": 8, 19 | "num_node_rep_layers": 1, 20 | "improved_gcn": True, 21 | "jk_type": JKType.NoJK, 22 | "node_conv_type": NodeConvType.GraphConvolution, 23 | "adj_flow_threshold": 0, 24 | "dna_heads": 1, 25 | "dna_groups": 1, 26 | "include_edge_flow_feat": False, 27 | "drop_prob": 0.3, 28 | "weighted_loss": False, 29 | "regression_loss": "L2", 30 | # training 31 | "cp_folder": "./checkpoints/test", 32 | "lr": 0.01, 33 | "lr_schedule": (50, 65, 80, 95), 34 | "num_epochs": 110, 35 | "batch_size": 256, 36 | } 37 | TrainingEnvironment.hyperparameter_search(hyperparameters, 3, run_training) 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_mae(predictions, labels, data): 5 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 6 | mae = mae_metric(preds_unscaled, y_unscaled) 7 | return mae 8 | 9 | 10 | def compute_mape(predictions, labels, data): 11 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 12 | mape = mape_metric(preds_unscaled, y_unscaled) 13 | return mape 14 | 15 | 16 | def compute_ssi(predictions, labels, data): 17 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 18 | return ssi_metric(preds_unscaled, y_unscaled) 19 | 20 | 21 | def compute_geh(predictions, labels, data): 22 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 23 | return geh_metric(preds_unscaled, y_unscaled) 24 | 25 | 26 | def compute_cpl(predictions, labels, data): 27 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 28 | return cpl_metric(preds_unscaled, y_unscaled) 29 | 30 | 31 | def compute_cpc(predictions, labels, data): 32 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 33 | return cpc_metric(preds_unscaled, y_unscaled) 34 | 35 | 36 | def compute_binned_metric(metric_f, predictions, labels, bins, data, num_bins): 37 | bins = np.concatenate(bins, axis=0).reshape(-1) 38 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 39 | binned_metric = _compute_binned_metric(preds_unscaled, y_unscaled, 40 | bins, num_bins, metric_f) 41 | return binned_metric 42 | 43 | 44 | def compute_macro_metric(metric_f, predictions, labels, bins, data, num_bins): 45 | binned_metric = compute_binned_metric(metric_f, predictions, labels, bins, 46 | data, num_bins) 47 | macro_metric = (np.nanmean(binned_metric) 48 | if not np.all(np.isnan(binned_metric)) 49 | else np.nan) 50 | return macro_metric 51 | 52 | 53 | def _unscale(preds, y, data): 54 | preds_unscaled = np.concatenate(preds, axis=0).reshape(-1, 1) 55 | preds_unscaled = data.label_scaler.inverse_transform(preds_unscaled) 56 | preds_unscaled = preds_unscaled.reshape(-1) 57 | y_unscaled = np.concatenate(y, axis=0).reshape(-1, 1) 58 | y_unscaled = data.label_scaler.inverse_transform(y_unscaled) 59 | y_unscaled = y_unscaled.reshape(-1) 60 | return preds_unscaled, y_unscaled 61 | 62 | 63 | def mae_metric(preds_unscaled, y_unscaled): 64 | mae = np.absolute(preds_unscaled.reshape(-1) - y_unscaled.reshape(-1)) 65 | mae = np.mean(mae) 66 | return mae 67 | 68 | 69 | def mape_metric(preds_unscaled, y_unscaled): 70 | non_zero_target_idcs = y_unscaled > 1e-5 71 | if np.sum(non_zero_target_idcs) == 0: 72 | return np.nan 73 | non_zero_targets = y_unscaled[non_zero_target_idcs] 74 | predicted = preds_unscaled[non_zero_target_idcs] 75 | mape = np.absolute(predicted - non_zero_targets) / non_zero_targets 76 | mape = 
np.mean(mape, axis=0) 77 | return mape 78 | 79 | 80 | def ssi_metric(preds_unscaled, y_unscaled): 81 | preds_unscaled = preds_unscaled[y_unscaled > 0] 82 | y_unscaled = y_unscaled[y_unscaled > 0] 83 | ssi = (np.sum(2 * np.minimum(preds_unscaled, y_unscaled) 84 | / (preds_unscaled + y_unscaled)) 85 | / len(y_unscaled)) 86 | return ssi 87 | 88 | 89 | def geh_metric(preds_unscaled, y_unscaled): 90 | geh = np.sqrt(2 * (preds_unscaled - y_unscaled)**2 91 | / (preds_unscaled + y_unscaled)) 92 | geh_percentage = len(geh[geh < 5]) / len(geh) 93 | return geh_percentage 94 | 95 | 96 | def cpl_metric(preds_unscaled, y_unscaled): 97 | cpl = (2 * np.sum(preds_unscaled * y_unscaled > 1e-8) 98 | / (np.sum(preds_unscaled > 1e-8) + np.sum(y_unscaled > 1e-8))) 99 | return cpl 100 | 101 | 102 | def cpc_metric(preds_unscaled, y_unscaled): 103 | cpc = (np.sum(2 * np.minimum(preds_unscaled, y_unscaled)) 104 | / (np.sum(preds_unscaled) + np.sum(y_unscaled))) 105 | return cpc 106 | 107 | 108 | def _compute_binned_metric(out, y, bins, num_bins, metric_f): 109 | """ 110 | Computes the given metric for each bin individually. 111 | :param out: NumPy array containing model predictions. 112 | :param y: NumPy array containing labels. 113 | :param bins: NumPy array containing the bins that each label belongs to. 114 | :param num_bins: Total number of bins. 115 | :param metric_f: Function which receives the model predictions and labels 116 | as arguments (in that order) and returns a scalar metric value. 117 | :return: NumPy array of shape [num_bins] containing the metric value for 118 | each bin. 119 | """ 120 | metric_vals = [] 121 | for bin_idx in range(num_bins): 122 | mask = bins == bin_idx 123 | if np.sum(mask) > 0: 124 | vals = metric_f(out[mask], y[mask]) 125 | metric_vals.append(vals) 126 | else: 127 | metric_vals.append(np.nan) 128 | return np.array(metric_vals) 129 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.sparse as sp 5 | import scipy.sparse as ssp 6 | 7 | 8 | def split_bucketed_data(bin_idcs): 9 | """ 10 | Splits a given set of samples into the specified number of buckets of 11 | equal size. Samples are assigned to buckets based on their label. Each 12 | bucket is split into train, validation, and test set and the overall 13 | training, validation, and test sets are the concatenation of the individual 14 | bucket subsets. 15 | This ensures that train, validation, and test set all contain the same 16 | number of samples of all sizes. 17 | :param bin_idcs: Specifies for each label the bucket it belongs to 18 | :return: Arrays specifying the indices of samples belonging to the 19 | training, validation, and test set respectively. 
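    Illustrative example (hypothetical input; the exact index assignment
    depends on the random shuffle):

        bin_idcs = torch.tensor([0] * 10 + [1] * 10)
        train_idcs, val_idcs, test_idcs = split_bucketed_data(bin_idcs)
        # each bucket of 10 samples contributes 7 indices to train_idcs,
        # 1 to val_idcs and 2 to test_idcs (a 70/10/20 split per bucket)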
20 | """ 21 | all_train_idcs = [] 22 | all_val_idcs = [] 23 | all_test_idcs = [] 24 | num_bins = torch.max(bin_idcs) + 1 25 | for idx in range(num_bins): 26 | bucket_samples, = np.where(bin_idcs == idx) 27 | np.random.shuffle(bucket_samples) 28 | split1 = int(0.7 * len(bucket_samples)) 29 | split2 = int(0.8 * len(bucket_samples)) 30 | train_idcs = bucket_samples[:split1] 31 | val_idcs = bucket_samples[split1:split2] 32 | test_idcs = bucket_samples[split2:] 33 | all_train_idcs.append(train_idcs) 34 | all_val_idcs.append(val_idcs) 35 | all_test_idcs.append(test_idcs) 36 | return (np.concatenate(all_train_idcs), np.concatenate(all_val_idcs), 37 | np.concatenate(all_test_idcs)) 38 | 39 | 40 | def bin_data(labels, num_buckets, scale="linear", base=10, bin_bounds=None): 41 | """ 42 | Splits the data into specified number of buckets of equal size. Returns for 43 | each sample the index of the the bucket it belongs to. 44 | :param labels: Unscaled labels used for computing bucket boundaries and 45 | assigning samples to buckets. 46 | :param num_buckets: 47 | :param scale: Whether to use separate the label domain into buckets on a 48 | linear or logarithmic scale. Hence the two options are either "linear" or 49 | "logarithmic". 50 | :param base: Only relevant if scale="logarithmic". Specifies the base of 51 | the logarithm. 52 | :param bin_bounds: Only relevant if scale="custom". 53 | :return: Array of the same length as labels specifying for each sample 54 | which bucket it belongs to. 55 | """ 56 | max_label = np.max(labels) 57 | if scale == "logarithmic": 58 | bin_bounds = [] 59 | base_size = max_label / (base**(num_buckets-1)) 60 | for bin_idx in range(num_buckets): 61 | bin_bounds.append(base**bin_idx * base_size) 62 | bin_bounds[-1] = bin_bounds[-1] + 1.0 63 | elif scale == "linear": 64 | bin_size = int(math.ceil(float(max_label) / float(num_buckets))) 65 | bin_bounds = [bin_size * idx for idx in range(1, num_buckets+1)] 66 | elif scale == "custom" and bin_bounds != None: 67 | if len(bin_bounds) != num_buckets: 68 | raise ValueError(f"Error: Specified number of bins {num_buckets} " 69 | f"does not match specified bin_bounds " 70 | f"(length {len(bin_bounds)})") 71 | else: 72 | raise ValueError(f"Unknown scale type {scale}") 73 | print(f"\tBin bounds: {bin_bounds}") 74 | bin_idcs = np.digitize(labels, bin_bounds) 75 | return bin_idcs 76 | 77 | 78 | def to_sparse_tensor(mat): 79 | """ 80 | Converts a SciPy sparse matrix into a torch sparse tensor. 81 | """ 82 | if isinstance(mat, ssp.csr_matrix) or isinstance(mat, ssp.csc_matrix): 83 | mat = mat.tocoo() 84 | data = mat.data 85 | indices = np.concatenate((mat.row.reshape(1, -1), mat.col.reshape(1, -1)), 86 | axis=0) 87 | sparse_mat = sp.FloatTensor(torch.LongTensor(indices), 88 | torch.FloatTensor(data), 89 | torch.Size(mat.shape)) 90 | return sparse_mat 91 | 92 | 93 | def normalize(mx): 94 | """ 95 | Row-normalize sparse matrix. Adapted from 96 | https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py. 97 | """ 98 | rowsum = np.array(mx.sum(1)) 99 | r_inv = np.power(rowsum, -1).flatten() 100 | r_inv[np.isinf(r_inv)] = 0. 
101 | r_mat_inv = ssp.diags(r_inv) 102 | mx = r_mat_inv.dot(mx) 103 | return mx 104 | 105 | 106 | def summarize_tensor(x, title=""): 107 | with torch.no_grad(): 108 | print("-"*10, title, "-"*10, sep="") 109 | shape = x.shape 110 | print(f"Shape: {shape}") 111 | 112 | nans = torch.sum(torch.isnan(x)) 113 | print(f"NaNs: {nans}") 114 | 115 | nnz = torch.sum(x < 1e-8) 116 | print(f"NNZ: {nnz}") 117 | 118 | mean = torch.mean(x) 119 | print(f"Mean: {mean}") 120 | std = torch.std(x) 121 | print(f"Std: {std}") 122 | median = torch.median(x) 123 | print(f"Median: {median}") 124 | 125 | min = torch.min(x) 126 | print(f"Min: {min}") 127 | max = torch.max(x) 128 | print(f"Max: {max}") 129 | print("-"*(20+len(title))) 130 | 131 | 132 | def summarize_feature_matrix(features): 133 | for col_idx in range(features.shape[1]): 134 | values = features[:, col_idx] 135 | mean = np.mean(values) 136 | std = np.std(values) 137 | min_val = np.min(values) 138 | max_val = np.max(values) 139 | num_values = len(np.unique(values)) 140 | is_integer = np.sum(np.ceil(values) - values) <= 1e-10 141 | print("column index:", col_idx) 142 | print(f"statistics: {mean:.3f} +/- {std:.3f}") 143 | print(f"min, max: [{min_val:.3f}, {max_val:.3f}]") 144 | print(f"num unique values: {num_values}") 145 | print(f"data type:", "integer" if is_integer else "float") 146 | print() -------------------------------------------------------------------------------- /fc_regression_baseline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | from dataset import UrbanPlanningDataset 9 | from metrics import compute_mae, compute_mape, \ 10 | compute_ssi, compute_geh, compute_cpl, \ 11 | compute_cpc, compute_binned_metric, compute_macro_metric, mae_metric, \ 12 | mape_metric, ssi_metric, geh_metric, cpl_metric, cpc_metric 13 | from training_environment import TrainingSettings as ts, PerformanceLogger, \ 14 | OutputLogger 15 | from training_environment import checkpoint_filepath 16 | from regression_model import validate_epoch, train_epoch 17 | 18 | 19 | parser = argparse.ArgumentParser(description='UP') 20 | parser.add_argument('--enable-cuda', action='store_true', 21 | help='Enable CUDA') 22 | args = parser.parse_args() 23 | args.device = None 24 | if args.enable_cuda and torch.cuda.is_available(): 25 | args.device = torch.device('cuda') 26 | else: 27 | args.device = torch.device('cpu') 28 | 29 | 30 | class FCEdgeRegressor(nn.Module): 31 | 32 | def __init__(self, num_node_features, num_edge_features, hidden_dim): 33 | super(FCEdgeRegressor, self).__init__() 34 | 35 | self.core = nn.Sequential( 36 | nn.Linear(num_edge_features + 2 * num_node_features, 37 | hidden_dim), 38 | nn.ReLU(), 39 | nn.BatchNorm1d(hidden_dim), 40 | nn.Dropout(p=ts.drop_prob), 41 | nn.Linear(hidden_dim, hidden_dim), 42 | nn.ReLU(), 43 | nn.BatchNorm1d(hidden_dim), 44 | nn.Dropout(p=ts.drop_prob), 45 | nn.Linear(hidden_dim, hidden_dim), 46 | nn.ReLU(), 47 | nn.BatchNorm1d(hidden_dim), 48 | nn.Dropout(p=ts.drop_prob), 49 | nn.Linear(hidden_dim, 1), 50 | ) 51 | 52 | def forward(self, x_nodes, x_edges_batch, edge_indices_batch, edge_indices, 53 | edge_weight=None): 54 | """ 55 | :param x_nodes: Node features of shape [N, D] 56 | :param x_edges_batch: Edge features of shape [B, K] 57 | :param edge_indices_batch: Matrix of shape [B, 2] indicating the 58 | indices of the nodes connected by each edge. 
59 | :param edge_indices: Matrix of shape [2, E] indicating for each edge 60 | in the graph the two node IDs it connects. 61 | :return: Predictions for edges with shape [B, 1] 62 | """ 63 | 64 | x_nodes_left = x_nodes[edge_indices_batch[:, 0]] 65 | x_nodes_right = x_nodes[edge_indices_batch[:, 1]] 66 | x_concat = torch.cat([x_nodes_left, x_edges_batch, x_nodes_right], dim=-1) 67 | 68 | out = self.core(x_concat) 69 | 70 | return out.squeeze(-1) 71 | 72 | 73 | def run_training(): 74 | # Set up training environment 75 | if not os.path.exists(ts.cp_folder): 76 | os.makedirs(ts.cp_folder) 77 | log_filepath = checkpoint_filepath(ts.cp_folder, "log", __file__, {}, 78 | ".pk") 79 | summary_filepath = checkpoint_filepath(ts.cp_folder, "summary", __file__, 80 | {}, ".txt") 81 | output_logger = OutputLogger(checkpoint_filepath(ts.cp_folder, "output", 82 | __file__, {}, ".txt")) 83 | sys.stdout = output_logger 84 | ts.write_summary_file(checkpoint_filepath(ts.cp_folder, "hyperparams", 85 | __file__, {}, "txt")) 86 | print(ts.settings_description()) 87 | 88 | # Load data 89 | ds = UrbanPlanningDataset(ts.data_base_path, ts.num_bins, ts.batch_size, 90 | ts.n_quantiles, ts.resampling, 91 | ts.excluded_node_feature_columns, 92 | ts.excluded_edge_feature_columns, False, 93 | ts.include_edge_flow_feat, ts.adj_flow_threshold, 94 | ts.seed) 95 | # Preprocess data 96 | ds.to(args.device) 97 | 98 | def _get_metric_funcs(prefix): 99 | preds_key = prefix+"_predictions" 100 | labels_key = prefix+"_labels" 101 | bins_key = prefix+"_bins" 102 | return { 103 | prefix+"_loss": (lambda m: np.nanmean(m[prefix+"_loss"])), 104 | prefix + "_mae": (lambda m: compute_mae(m[preds_key], m[labels_key], ds)), 105 | prefix + "_binned_mae": (lambda m: compute_binned_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 106 | prefix + "_macro_mae": (lambda m: compute_macro_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 107 | prefix + "_mape": (lambda m: compute_mape(m[preds_key], m[labels_key], ds)), 108 | prefix + "_binned_mape": (lambda m: compute_binned_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 109 | prefix + "_macro_mape": (lambda m: compute_macro_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 110 | prefix + "_ssi": (lambda m: compute_ssi(m[preds_key], m[labels_key], ds)), 111 | prefix + "_binned_ssi": (lambda m: compute_binned_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 112 | prefix + "_macro_ssi": (lambda m: compute_macro_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 113 | prefix + "_geh": (lambda m: compute_geh(m[preds_key], m[labels_key], ds)), 114 | prefix + "_binned_geh": (lambda m: compute_binned_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 115 | prefix + "_macro_geh": (lambda m: compute_macro_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 116 | prefix + "_cpl": (lambda m: compute_cpl(m[preds_key], m[labels_key], ds)), 117 | prefix + "_binned_cpl": (lambda m: compute_binned_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 118 | prefix + "_macro_cpl": (lambda m: compute_macro_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 119 | prefix + "_cpc": (lambda m: compute_cpc(m[preds_key], m[labels_key], ds)), 120 | prefix + "_binned_cpc": (lambda m: compute_binned_metric(cpc_metric, m[preds_key], 
m[labels_key], m[bins_key], ds, ts.num_bins)), 121 | prefix + "_macro_cpc": (lambda m: compute_macro_metric(cpc_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 122 | } 123 | metric_funcs = { 124 | "train_loss": (lambda m: np.nanmean(m["train_loss"])), 125 | **_get_metric_funcs("val"), 126 | **_get_metric_funcs("test"), 127 | } 128 | 129 | logger = PerformanceLogger(metric_funcs, "val_macro_mae", log_filepath, 130 | write_every=ts.write_log_every) 131 | 132 | predictor = FCEdgeRegressor(ds.num_node_feats, ds.num_edge_feats, 133 | hidden_dim=ts.hidden_dim) 134 | predictor = predictor.to(device=args.device) 135 | 136 | optimizer = torch.optim.Adam(predictor.parameters(), lr=ts.lr) 137 | lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, 138 | list(ts.lr_schedule)) 139 | loss_criterion = (nn.L1Loss() if ts.regression_loss == "L1" 140 | else nn.MSELoss()) 141 | 142 | print("Start training") 143 | for epoch in range(-1, ts.num_epochs): 144 | if epoch >= 0: 145 | train_epoch(epoch, predictor, ds, optimizer, loss_criterion, 146 | logger, lr_schedule) 147 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.val_loader, 148 | logger, test=False) 149 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.test_loader, 150 | logger, test=True) 151 | 152 | logger.complete_epoch() 153 | print(logger.epoch_summary()) 154 | if epoch % ts.write_log_every == 0: 155 | logger.write(log_filepath) 156 | logger.write(log_filepath) 157 | logger.write_summary(summary_filepath, ts.settings_description()) 158 | return logger 159 | 160 | 161 | if __name__ == '__main__': 162 | run_training() 163 | -------------------------------------------------------------------------------- /training_environment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import os 7 | import pickle as pk 8 | from collections import defaultdict 9 | import enum 10 | from pathlib import Path 11 | 12 | 13 | class NodeConvGraph(enum.Enum): 14 | Geo = 1 15 | UnweightedFlow = 2 16 | 17 | 18 | class NodeConvType(enum.Enum): 19 | GraphConvolution = 1 20 | GraphAttention = 2 21 | GraphNodeEdgeConvolution = 3 22 | DNAConvolution = 4 23 | 24 | 25 | class JKType(enum.Enum): 26 | NoJK = "" 27 | Concat = "cat" 28 | MaxPool = "max" 29 | LSTM = "lstm" 30 | 31 | 32 | class TrainingSettings: 33 | ### data set ### 34 | data_base_path = "Data/London_high" 35 | resampling = True 36 | n_quantiles = 1000 37 | num_bins = 4 38 | excluded_node_feature_columns = tuple() 39 | excluded_edge_feature_columns = tuple() 40 | 41 | ### model ### 42 | hidden_dim = 16 # Dimensionality of any intermediate layers 43 | edge_rep_size = 16 # Hidden size of the target edge representation 44 | num_edge_rep_layers = 2 # Number of linear layers for computing the target edge representation 45 | include_node_reps = True # Wheather to include node representations at all 46 | node_rep_size = 16 # Hidden size of the node feature representations 47 | num_node_rep_layers = 1 # Number of GNN layers for computing node representations 48 | improved_gcn = False # Whether to use improved GCN convolutions (i.e. 
2 on the adj-matrix diagonal) 49 | jk_type = JKType.NoJK # Whether to use JumpingKnowledge skip connections at all and if yet, which type 50 | node_conv_type = NodeConvType.GraphConvolution 51 | adj_flow_threshold = 0 # When computing node convolutions based on the flow adjancency matrix, only include edges with a flow greater or equal this threshold 52 | dna_heads = 1 # Number of attention heads to be used for DNA convolutions 53 | dna_groups = 1 # Number of channel groups to be used for DNA convolutions 54 | include_edge_flow_feat = False 55 | drop_prob = 0.5 56 | weighted_loss = False 57 | regression_loss = "L1" # other option: "L2" 58 | 59 | ### training ### 60 | cp_folder = "./checkpoints/" 61 | starting_seed = 7 62 | seed = 7 63 | lr = 0.001 64 | lr_schedule = (50, 65, 80) 65 | num_epochs = 100 66 | write_log_every = 20 67 | batch_size = 64 68 | 69 | if weighted_loss and resampling: 70 | raise ValueError("Weighted loss and resampling both set to True") 71 | 72 | @staticmethod 73 | def update_setting(**settings): 74 | for key, value in settings.items(): 75 | if not hasattr(TrainingSettings, key): 76 | raise ValueError(f"Attribute {key} not a valid hyperparameter.") 77 | setattr(TrainingSettings, key, value) 78 | if TrainingSettings.weighted_loss and TrainingSettings.resampling: 79 | raise ValueError("Weighted loss and resampling both set to True") 80 | 81 | @staticmethod 82 | def settigns_dict(): 83 | return {attr: getattr(TrainingSettings, attr) 84 | for attr in dir(TrainingSettings) 85 | if (not attr.startswith("__") 86 | and not callable(getattr(TrainingSettings, 87 | attr)))} 88 | 89 | @staticmethod 90 | def settings_description(): 91 | settings_dict = TrainingSettings.settigns_dict() 92 | return parameter_description_from_dict(settings_dict) 93 | 94 | @staticmethod 95 | def write_summary_file(filepath): 96 | filepath = (Path(filepath) if not isinstance(filepath, Path) 97 | else filepath) 98 | settings_dict = TrainingSettings.settigns_dict() 99 | with filepath.with_suffix(".pk").open("wb") as fd: 100 | pk.dump(settings_dict, fd) 101 | settings_description = TrainingSettings.settings_description() 102 | with filepath.open("w") as fd: 103 | fd.write(settings_description) 104 | 105 | 106 | class TrainingEnvironment: 107 | 108 | @staticmethod 109 | def _process_settings(hyperparam_settings): 110 | """ 111 | Takes a dictionary of lists/scalars and turns it into a list of 112 | dictionaries of scalars. 113 | """ 114 | max_length = max([len(l) 115 | for l in hyperparam_settings.values() 116 | if type(l) is list]+[1]) 117 | hyperparam_settings = {k: (l if type(l) is list else [l]*max_length) 118 | for k, l in hyperparam_settings.items()} 119 | # Go from dictionary of lists to list of dictionaries 120 | dicts = [dict(zip(hyperparam_settings, x)) 121 | for x in zip(*hyperparam_settings.values())] 122 | return dicts 123 | 124 | @staticmethod 125 | def hyperparameter_search(training_settings, runs, start_experiment_f): 126 | """ 127 | :param training_settings: Dictionary containing hyperparameters and 128 | othr training settings. Keys may only be attributes of the 129 | TrainingSettings class. Values may be scalars or lists. Using lists 130 | allows to specify the search space. All lists must have the same 131 | length. 132 | :param runs: Number of runs to perform for each training setting. 133 | :param start_experiment_f: Method for running an experiment. Must 134 | return a single measure of performance. 
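        Illustrative example (hypothetical values; main.py calls this with
        its full hyperparameter dictionary and run_training from
        regression_model):

            settings = {"lr": [0.01, 0.001], "hidden_dim": 16}
            TrainingEnvironment.hyperparameter_search(settings, 3, run_training)
            # expands into two settings, {"lr": 0.01, "hidden_dim": 16} and
            # {"lr": 0.001, "hidden_dim": 16}, each trained for 3 runs with a
            # different seed per run.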
135 | :return: 136 | """ 137 | dicts = TrainingEnvironment._process_settings(training_settings) 138 | performances = [defaultdict(list) for _ in range(len(dicts))] 139 | # Make runs outer-loop to ensure initial results for all settings are 140 | # obtained asap 141 | for run_idx in range(runs): 142 | for settings_idx, settings_dict in enumerate(dicts): 143 | TrainingSettings.update_setting(**settings_dict) 144 | TrainingSettings.seed = (TrainingSettings.starting_seed 145 | * 3**run_idx) 146 | logger = start_experiment_f() 147 | min_epoch = np.argmin(logger[logger.minimizer]) 148 | performances[settings_idx]["min_epoch"].append(min_epoch) 149 | for key, values in logger.logs_dict.items(): 150 | performances[settings_idx][key].append(values[min_epoch]) 151 | # Write results (so far) to file 152 | TrainingEnvironment.write_summary(dicts, performances) 153 | 154 | @staticmethod 155 | def write_summary(settings_dicts, performances): 156 | """ 157 | :param settings_dicts: 158 | :param performances: 159 | :return: 160 | """ 161 | summary = "" 162 | for settings, performances in zip(settings_dicts, performances): 163 | description = parameter_description_from_dict(settings) 164 | summary += description + "\n\n" 165 | for key in sorted(performances.keys()): 166 | mean = np.mean(performances[key], axis=0) 167 | std = np.std(performances[key], axis=0) 168 | if isinstance(mean, np.ndarray): 169 | summary += f"{key}: {mean} +/- {std}\n" 170 | else: 171 | summary += f"{key}: {mean:.5f} +/- {std:.5f}\n" 172 | if not os.path.exists(TrainingSettings.cp_folder): 173 | os.makedirs(TrainingSettings.cp_folder) 174 | with open(os.path.join(TrainingSettings.cp_folder, "summary.txt"), "w") as fd: 175 | fd.write(summary) 176 | 177 | 178 | class PerformanceLogger: 179 | def __init__(self, metric_funcs, minimizer, log_filepath, write_every=20): 180 | """ 181 | :param metric_funcs: Dictionary of metric functions. A metric function 182 | accepts exactly one argument which is a dictionary of the metrics from 183 | a single epoch. It returns the corresponding metric. 184 | :param minimizer: Key of the variable that determines the final value 185 | stored in the summary. Usually the validation loss. 186 | """ 187 | self._metric_funcs = metric_funcs 188 | self._current_epoch_metrics = defaultdict(list) 189 | self.logs_dict = defaultdict(list) 190 | self.minimizer = minimizer 191 | self.log_filepath = log_filepath 192 | self.write_every = write_every 193 | self._write_countdown = write_every 194 | self._start_time = time.time() 195 | self._current_epoch = 0 196 | 197 | if "duration" in metric_funcs: 198 | raise ValueError("Key \"duration\" is a reserved key for internal" 199 | "use of PerformanceLogger.") 200 | 201 | def __getitem__(self, key): 202 | return self.logs_dict[key] 203 | 204 | def __setitem__(self, key, value): 205 | self.logs_dict[key] = value 206 | 207 | def add_values(self, metric_dict): 208 | for key, metric_batch in metric_dict.items(): 209 | self._current_epoch_metrics[key].append(metric_batch) 210 | # Write log if necessary 211 | self._write_countdown -= 1 212 | if self._write_countdown == 0: 213 | self.write(self.log_filepath) 214 | self._write_countdown = self.write_every 215 | 216 | def complete_epoch(self): 217 | """ 218 | Marks the epoch as finished and computes the epoch's metrics. 
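        Each metric function registered in the constructor receives the
        dictionary of per-batch values accumulated via add_values() during
        the epoch, and its return value is appended to logs_dict under the
        same key. The per-batch buffer is then cleared for the next epoch.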
219 | """ 220 | for key, metric_func in self._metric_funcs.items(): 221 | metric = metric_func(self._current_epoch_metrics) 222 | self.logs_dict[key].append(metric) 223 | # add duration as additional metric 224 | duration = time.time() - self._start_time 225 | self.logs_dict["duration"].append(duration) 226 | self._start_time = time.time() 227 | 228 | self._current_epoch_metrics = defaultdict(list) 229 | self._current_epoch += 1 230 | 231 | def epoch_summary(self): 232 | summary = f"{self._current_epoch}:" 233 | for key, vals in self.logs_dict.items(): 234 | if key == "duration": 235 | continue 236 | if isinstance(vals[-1], np.ndarray): 237 | summary += f"\t{key}: {vals[-1]}\n" 238 | else: 239 | summary += f"\t{key}: {vals[-1]:.5f}\n" 240 | duration = self.logs_dict["duration"][-1] 241 | summary += f"\t[{duration:.2f}s]" 242 | return summary 243 | 244 | def min(self, key): 245 | if key not in self.logs_dict or len(self.logs_dict[key]) == 0: 246 | return 10e8 # Find nicer way 247 | return min(self.logs_dict[key]) 248 | 249 | def write(self, filepath): 250 | with open(filepath, "wb") as fd: 251 | pk.dump(self.logs_dict, fd) 252 | 253 | def write_summary(self, filepath, settings_description=""): 254 | """ 255 | Writes a text file containing a summary of the run. 256 | :param filepath: Filepath of the summary. 257 | """ 258 | min_idx = np.argmin(self.logs_dict[self.minimizer]) 259 | summary = settings_description + "\n\n" 260 | for key, value in self.logs_dict.items(): 261 | min_val = value[min_idx] 262 | summary += f"{key}: {min_val}\n" 263 | with open(filepath, "w") as fd: 264 | fd.write(summary) 265 | 266 | 267 | class OutputLogger: 268 | """ 269 | Overrides normal stdout, i.e. `sys.stdout = OutputLogger(...)`. After that, 270 | print writes all output to stdout *and* the specified log-file. 271 | """ 272 | def __init__(self, log_filepath): 273 | self.terminal = sys.stdout 274 | self.log = open(log_filepath, "a") 275 | 276 | def write(self, message): 277 | self.terminal.write(message) 278 | self.log.write(message) 279 | 280 | def flush(self): 281 | self.terminal.flush() 282 | self.log.flush() 283 | 284 | def close(self): 285 | self.log.flush() 286 | self.log.close() 287 | 288 | 289 | def parameter_description_from_dict(dict): 290 | return dict.__str__()[1:-1].replace("\'", "").replace(": ", "=") 291 | 292 | 293 | def checkpoint_filepath(directory, basename, script_file, parameters, 294 | file_ending, add_timestamp=True): 295 | """ 296 | Automatically creates a checkpoint filepath with the given attributes. 297 | :param directory: Directory in which the file should be stored. 298 | :param basename: Base name of the file, which will be appended by some 299 | extra parameters (see below). 300 | :param script_file: Name of the script as returned by __file__. 301 | :param parameters: Dictionary of additional parameters. 302 | :param file_ending: File ending of the file. 303 | :return: Filepath as a string. 304 | """ 305 | script_name = os.path.basename(script_file) 306 | script_name = script_name[:script_name.find(".")] 307 | 308 | param_descr = parameter_description_from_dict(parameters) 309 | 310 | if file_ending[0] != ".": 311 | file_ending = "." 
+ file_ending 312 | 313 | if add_timestamp: 314 | time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 315 | filename = (script_name + "," + param_descr + "_" 316 | + time_stamp + "_" + basename + file_ending) 317 | else: 318 | filename = script_name + "," + param_descr + "_" + basename + file_ending 319 | 320 | filepath = os.path.join(directory, filename) 321 | return filepath 322 | 323 | 324 | if __name__ == '__main__': 325 | TrainingSettings.update_setting(**{"lr": 0.1}) 326 | -------------------------------------------------------------------------------- /regression_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import argparse 4 | import os 5 | import warnings 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from collections import defaultdict 10 | import pickle as pk 11 | 12 | from torch.nn import Parameter 13 | 14 | from layers import DNANodeRepModule, ConvNodeRepModule 15 | from metrics import compute_mae, compute_mape, compute_ssi, compute_geh, \ 16 | compute_cpl, compute_cpc, compute_binned_metric, compute_macro_metric, \ 17 | mae_metric, cpc_metric, cpl_metric, geh_metric, ssi_metric, mape_metric 18 | from dataset import UrbanPlanningDataset 19 | from training_environment import TrainingSettings as ts, PerformanceLogger, NodeConvType, \ 20 | JKType 21 | from training_environment import checkpoint_filepath, OutputLogger 22 | 23 | from torch_geometric.nn import JumpingKnowledge 24 | 25 | 26 | parser = argparse.ArgumentParser(description='UP') 27 | parser.add_argument('--enable-cuda', action='store_true', 28 | help='Enable CUDA') 29 | args = parser.parse_args() 30 | args.device = None 31 | if args.enable_cuda and torch.cuda.is_available(): 32 | args.device = torch.device('cuda') 33 | else: 34 | args.device = torch.device('cpu') 35 | 36 | 37 | class EdgeRegressor(nn.Module): 38 | 39 | def __init__(self, num_node_features, num_edge_features, node_rep_size, 40 | hidden_dim): 41 | super(EdgeRegressor, self).__init__() 42 | 43 | # Linear layer to transform target edge features 44 | self.fc_edges = nn.Sequential( 45 | nn.Linear(num_edge_features + 2 * num_node_features, hidden_dim), 46 | nn.ReLU(), 47 | nn.BatchNorm1d(hidden_dim), 48 | nn.Dropout(p=ts.drop_prob), 49 | nn.Linear(hidden_dim, hidden_dim), 50 | ) 51 | concat_hidden_dim = hidden_dim 52 | 53 | if ts.include_node_reps: 54 | if ts.node_conv_type == NodeConvType.GraphConvolution: 55 | self.node_rep_module = ConvNodeRepModule(num_node_features, 56 | node_rep_size, 57 | ts.num_node_rep_layers, 58 | ts.improved_gcn, 59 | ts.drop_prob) 60 | elif ts.node_conv_type == NodeConvType.DNAConvolution: 61 | self.node_rep_module = DNANodeRepModule(num_node_features, 62 | node_rep_size, 63 | ts.num_node_rep_layers, 64 | ts.dna_heads, 65 | ts.dna_groups, 66 | ts.drop_prob) 67 | concat_hidden_dim += 2 * node_rep_size 68 | 69 | if ts.jk_type is not JKType.NoJK: 70 | self.jk = JumpingKnowledge(ts.jk_type.value, channels=8, 71 | num_layers=ts.num_node_rep_layers) 72 | lin_size = node_rep_size 73 | if ts.jk_type is JKType.Concat: 74 | lin_size = ts.num_node_rep_layers*node_rep_size 75 | self.jk_lin = nn.Linear(lin_size, node_rep_size) 76 | 77 | self.node_weight = Parameter(torch.from_numpy(np.array(1.0, dtype=np.float32))) 78 | self.edge_weight = Parameter(torch.from_numpy(np.array(1.0, dtype=np.float32))) 79 | 80 | self.regression_head = nn.Sequential( 81 | nn.ReLU(), 82 | nn.BatchNorm1d(hidden_dim), 83 | nn.Dropout(p=ts.drop_prob), 84 | 
nn.Linear(hidden_dim, hidden_dim), 85 | nn.ReLU(), 86 | nn.BatchNorm1d(hidden_dim), 87 | nn.Dropout(p=ts.drop_prob), 88 | nn.Linear(hidden_dim, 1) 89 | ) 90 | 91 | def forward(self, x_nodes, x_edges_batch, edge_indices_batch, edge_indices, 92 | edge_weight=None): 93 | """ 94 | :param x_nodes: Node features of shape [N, D] 95 | :param x_edges_batch: Edge features of shape [B, K] 96 | :param edge_indices_batch: Matrix of shape [B, 2] indicating the 97 | indices of the nodes connected by each edge. 98 | :param edge_indices: Matrix of shape [2, E] indicating for each edge 99 | in the graph the two node IDs it connects. 100 | :param edge_weight: Vector of shape [E] containing the edge weight for 101 | each edge in the graph. 102 | :return: Predictions for edges with shape [B, 1] 103 | """ 104 | 105 | # Compute hidden representation of target edge 106 | x_nodes_left = x_nodes[edge_indices_batch[:, 0]] 107 | x_nodes_right = x_nodes[edge_indices_batch[:, 1]] 108 | x_concat = torch.cat([x_nodes_left, x_edges_batch, x_nodes_right], dim=-1) 109 | h_edges = self.fc_edges(x_concat) 110 | h_total = self.node_weight * h_edges 111 | 112 | # Compute hidden representations of nodes 113 | if ts.include_node_reps: 114 | intermediate_node_reps = self.node_rep_module(x_nodes, 115 | edge_indices.t(), 116 | edge_weight) 117 | if ts.jk_type is JKType.NoJK: 118 | h_nodes = intermediate_node_reps[-1] 119 | else: 120 | h_nodes = self.jk(intermediate_node_reps) 121 | h_nodes = self.jk_lin(h_nodes) 122 | # Get hidden representations of nodes incident to target edges 123 | h_nodes_left = h_nodes[edge_indices_batch[:, 0]] 124 | h_nodes_right = h_nodes[edge_indices_batch[:, 1]] 125 | h_total += self.edge_weight * h_nodes_left 126 | h_total += self.edge_weight * h_nodes_right 127 | 128 | regression_output = self.regression_head(h_total) 129 | 130 | return regression_output.squeeze(-1) 131 | 132 | 133 | def train_epoch(epoch, predictor, data, optimizer, loss_criterion, logger, 134 | lr_schedule): 135 | predictor.train() 136 | 137 | for (edge_idcs_batch, x_edges_batch, edge_labels_batch, 138 | _) in data.train_loader: 139 | edge_idcs_batch = edge_idcs_batch.to(device=args.device) 140 | x_edges_batch = x_edges_batch.to(device=args.device) 141 | edge_labels_batch = edge_labels_batch.to(device=args.device) 142 | 143 | optimizer.zero_grad() 144 | reg_out = predictor(data.node_feats, x_edges_batch, edge_idcs_batch, 145 | data.flow_topology.edge_indices, 146 | edge_weight=data.flow_topology.edge_weights) 147 | loss = loss_criterion(reg_out, edge_labels_batch) 148 | loss.backward() 149 | optimizer.step() 150 | logger.add_values({"train_loss": loss.item()}) 151 | lr_schedule.step() 152 | 153 | 154 | def validate_epoch(epoch, predictor, data, loss_criterion, data_loader, logger, 155 | test): 156 | predictor.eval() 157 | prefix = "test" if test else "val" 158 | 159 | for (edge_idcs_batch, x_edges_batch, edge_labels_batch, edge_buckets_batch) in data_loader: 160 | edge_idcs_batch = edge_idcs_batch.to(device=args.device) 161 | x_edges_batch = x_edges_batch.to(device=args.device) 162 | edge_labels_batch = edge_labels_batch.to(device=args.device) 163 | 164 | reg_out = predictor(data.node_feats, x_edges_batch, edge_idcs_batch, 165 | data.flow_topology.edge_indices, 166 | edge_weight=data.flow_topology.edge_weights) 167 | loss = loss_criterion(reg_out, edge_labels_batch) 168 | logger.add_values({ 169 | prefix + "_loss": loss.item(), 170 | prefix + "_predictions": reg_out.detach().cpu().numpy(), 171 | prefix + "_labels": 
edge_labels_batch.detach().cpu().numpy(), 172 | prefix + "_bins": edge_buckets_batch.detach().cpu().numpy() 173 | }) 174 | if test: 175 | with open("preds_labels.pk", "wb") as fd: 176 | preds = data.label_scaler.inverse_transform(np.concatenate(logger._current_epoch_metrics["test_predictions"], axis=-1).reshape(-1, 1)) 177 | labels = data.label_scaler.inverse_transform(np.concatenate(logger._current_epoch_metrics["test_labels"], axis=-1).reshape(-1, 1)) 178 | pk.dump((preds, labels, logger._current_epoch_metrics["test_node_idcs"]), fd) 179 | 180 | 181 | def run_training(): 182 | # Set up training environment 183 | if not os.path.exists(ts.cp_folder): 184 | os.makedirs(ts.cp_folder) 185 | log_filepath = checkpoint_filepath(ts.cp_folder, "log", __file__, {}, 186 | ".pk") 187 | summary_filepath = checkpoint_filepath(ts.cp_folder, "summary", __file__, 188 | {}, ".txt") 189 | output_logger = OutputLogger(checkpoint_filepath(ts.cp_folder, "output", 190 | __file__, {}, ".txt")) 191 | sys.stdout = output_logger 192 | ts.write_summary_file(checkpoint_filepath(ts.cp_folder, "hyperparams", 193 | __file__, {}, "txt")) 194 | print(ts.settings_description()) 195 | 196 | # Load data 197 | ds = UrbanPlanningDataset(ts.data_base_path, ts.num_bins, ts.batch_size, 198 | ts.n_quantiles, ts.resampling, 199 | ts.excluded_node_feature_columns, 200 | ts.excluded_edge_feature_columns, False, 201 | ts.include_edge_flow_feat, ts.adj_flow_threshold, 202 | ts.seed) 203 | # Preprocess data 204 | ds.to(args.device) 205 | 206 | def _get_metric_funcs(prefix): 207 | preds_key = prefix+"_predictions" 208 | labels_key = prefix+"_labels" 209 | bins_key = prefix+"_bins" 210 | return { 211 | prefix+"_loss": (lambda m: np.nanmean(m[prefix+"_loss"])), 212 | prefix + "_mae": (lambda m: compute_mae(m[preds_key], m[labels_key], ds)), 213 | prefix + "_binned_mae": (lambda m: compute_binned_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 214 | prefix + "_macro_mae": (lambda m: compute_macro_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 215 | prefix + "_mape": (lambda m: compute_mape(m[preds_key], m[labels_key], ds)), 216 | prefix + "_binned_mape": (lambda m: compute_binned_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 217 | prefix + "_macro_mape": (lambda m: compute_macro_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 218 | prefix + "_ssi": (lambda m: compute_ssi(m[preds_key], m[labels_key], ds)), 219 | prefix + "_binned_ssi": (lambda m: compute_binned_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 220 | prefix + "_macro_ssi": (lambda m: compute_macro_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 221 | prefix + "_geh": (lambda m: compute_geh(m[preds_key], m[labels_key], ds)), 222 | prefix + "_binned_geh": (lambda m: compute_binned_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 223 | prefix + "_macro_geh": (lambda m: compute_macro_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 224 | prefix + "_cpl": (lambda m: compute_cpl(m[preds_key], m[labels_key], ds)), 225 | prefix + "_binned_cpl": (lambda m: compute_binned_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 226 | prefix + "_macro_cpl": (lambda m: compute_macro_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 227 | prefix + "_cpc": (lambda m: 
compute_cpc(m[preds_key], m[labels_key], ds)), 228 | prefix + "_binned_cpc": (lambda m: compute_binned_metric(cpc_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 229 | prefix + "_macro_cpc": (lambda m: compute_macro_metric(cpc_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 230 | } 231 | metric_funcs = { 232 | "train_loss": (lambda m: np.nanmean(m["train_loss"])), 233 | **_get_metric_funcs("val"), 234 | **_get_metric_funcs("test"), 235 | } 236 | logger = PerformanceLogger(metric_funcs, "val_macro_mae", log_filepath, 237 | write_every=ts.write_log_every) 238 | 239 | predictor = EdgeRegressor(ds.num_node_feats, ds.num_edge_feats, 240 | hidden_dim=ts.hidden_dim, 241 | node_rep_size=ts.node_rep_size) 242 | predictor = predictor.to(device=args.device) 243 | 244 | optimizer = torch.optim.Adam(predictor.parameters(), lr=ts.lr) 245 | lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, 246 | list(ts.lr_schedule)) 247 | loss_criterion = (nn.L1Loss() if ts.regression_loss == "L1" 248 | else nn.MSELoss()) 249 | 250 | print("Start training") 251 | for epoch in range(-1, ts.num_epochs): 252 | if epoch >= 0: 253 | train_epoch(epoch, predictor, ds, optimizer, loss_criterion, 254 | logger, lr_schedule) 255 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.val_loader, 256 | logger, test=False) 257 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.test_loader, 258 | logger, test=True) 259 | 260 | logger.complete_epoch() 261 | print(logger.epoch_summary()) 262 | if epoch % ts.write_log_every == 0: 263 | logger.write(log_filepath) 264 | logger.write(log_filepath) 265 | logger.write_summary(summary_filepath, ts.settings_description()) 266 | return logger 267 | 268 | 269 | if __name__ == '__main__': 270 | run_training() 271 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.sparse as sp 6 | from torch.nn.parameter import Parameter 7 | from torch.nn.modules.module import Module 8 | from torch_geometric.nn import GCNConv, DNAConv 9 | 10 | 11 | def tensor_normalize(matrix): 12 | row_sum = sp.sum(matrix, dim=1).to_dense() 13 | # if torch.any(row_sum == 0.0): 14 | # raise ValueError("Matrix contains rows with sum 0.") 15 | r_inv = row_sum.pow(-1).flatten() 16 | r_inv[torch.isinf(r_inv)] = 0.0 17 | norm_matrix = torch.matmul(torch.diag(r_inv), matrix.to_dense()) 18 | return norm_matrix 19 | 20 | 21 | class ConvNodeRepModule(Module): 22 | def __init__(self, in_dim, hidden_dim, num_layers, improved_gcn, 23 | drop_prob): 24 | super(ConvNodeRepModule, self).__init__() 25 | self.conv_layers = [] 26 | for idx in range(num_layers): 27 | cur_in_dim = in_dim if idx == 0 else hidden_dim 28 | cur_layer = NormalizedRegularizedGCNLayer(cur_in_dim, hidden_dim, 29 | improved_gcn, drop_prob) 30 | self.conv_layers.append(cur_layer) 31 | self.conv_layers = nn.ModuleList(self.conv_layers) 32 | 33 | def forward(self, node_features, edge_indices, edge_weight=None): 34 | h = node_features 35 | intermediate_reps = [] 36 | for layer in self.conv_layers: 37 | h = layer(h, edge_indices, edge_weight) 38 | intermediate_reps.append(h) 39 | return intermediate_reps 40 | 41 | 42 | class NormalizedRegularizedGCNLayer(Module): 43 | def __init__(self, in_dim, out_dim, improved_gcn, drop_prob): 44 | super(NormalizedRegularizedGCNLayer, self).__init__() 
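        # One graph-convolution block: a GCNConv followed by ReLU, batch
        # normalisation and dropout (applied in this order in forward()).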
45 | self.gcn = GCNConv(in_dim, out_dim, improved_gcn) 46 | self.bn = nn.BatchNorm1d(out_dim) 47 | self.drop = nn.Dropout(drop_prob) 48 | 49 | def forward(self, node_features, edge_indices, edge_weight=None): 50 | h = self.gcn(node_features, edge_indices, edge_weight) 51 | h = F.relu(h) 52 | h = self.bn(h) 53 | h = self.drop(h) 54 | return h 55 | 56 | 57 | class DNANodeRepModule(Module): 58 | """ 59 | Applies a given number of DNA convolutions on the given data. Returns a 60 | list of all the intermediate representations after each layer. 61 | """ 62 | 63 | def __init__(self, in_dim, hidden_dim, num_layers, dna_heads, dna_groups, 64 | drop_prob): 65 | super(DNANodeRepModule, self).__init__() 66 | self.hidden_dim = hidden_dim 67 | self.pre_lin = nn.Linear(in_dim, hidden_dim) 68 | self.pre_drop = nn.Dropout(drop_prob) 69 | self.conv_layers = [] 70 | for _ in range(num_layers): 71 | cur_layer = NormalizedRegularizedDNALayer(hidden_dim, dna_heads, 72 | dna_groups, drop_prob) 73 | self.conv_layers.append(cur_layer) 74 | self.conv_layers = nn.ModuleList(self.conv_layers) 75 | 76 | def forward(self, input, edge_indices): 77 | h = F.relu(self.pre_lin(input)) 78 | h = self.pre_drop(h) 79 | h = h.view(-1, 1, self.hidden_dim) 80 | intermediate_reps = [] 81 | for conv in self.conv_layers: 82 | h_new = conv(h, edge_indices) 83 | intermediate_reps.append(h_new) 84 | h_new = h_new.view(-1, 1, self.hidden_dim) 85 | h = torch.cat([h, h_new], dim=1) 86 | return intermediate_reps 87 | 88 | 89 | class NormalizedRegularizedDNALayer(Module): 90 | def __init__(self, channels, heads, groups, dropout): 91 | super(NormalizedRegularizedDNALayer, self).__init__() 92 | self.dna = DNAConv(channels, heads, groups, dropout) 93 | self.bn = nn.BatchNorm1d(channels) 94 | 95 | def forward(self, all_node_features, edge_indices): 96 | h = self.dna(all_node_features, edge_indices) 97 | h = F.relu(h) 98 | h = self.bn(h) 99 | return h 100 | 101 | 102 | class GraphConvolution(Module): 103 | """ 104 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 105 | Adapted from https://github.com/tkipf/pygcn/blob/master/pygcn/layers.py 106 | """ 107 | 108 | def __init__(self, in_features, out_features, adj_matrix, edge_feat_matrix, 109 | attention_scores=None, bias=True): 110 | """ 111 | :param in_features: 112 | :param out_features: 113 | :param bias: 114 | :param attention_scores: Sparse tensor containing for each pair of 115 | nodes the attention score between the nodes. Shape [N, N]. 116 | :param adj_matrix: 117 | """ 118 | super(GraphConvolution, self).__init__() 119 | self.in_features = in_features 120 | self.out_features = out_features 121 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 122 | if bias: 123 | self.bias = Parameter(torch.FloatTensor(out_features)) 124 | else: 125 | self.register_parameter('bias', None) 126 | self.adj_matrix = tensor_normalize(adj_matrix) 127 | # Pre-compute attention adjacency matrix 128 | if attention_scores is not None: 129 | with torch.no_grad(): 130 | self.adj_matrix = tensor_normalize(adj_matrix * attention_scores) 131 | self.reset_parameters() 132 | 133 | def reset_parameters(self): 134 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 135 | self.weight.data.uniform_(-stdv, stdv) 136 | if self.bias is not None: 137 | self.bias.data.uniform_(-stdv, stdv) 138 | 139 | def forward(self, input): 140 | """ 141 | :param input: Node features of shape [N, D] 142 | """ 143 | support = torch.mm(input, self.weight) # Shape [N, K] 144 | output = torch.matmul(self.adj_matrix, support) 145 | if self.bias is not None: 146 | return output + self.bias 147 | else: 148 | return output 149 | 150 | def __repr__(self): 151 | return self.__class__.__name__ + ' (' \ 152 | + str(self.in_features) + ' -> ' \ 153 | + str(self.out_features) + ')' 154 | 155 | 156 | class GraphNodeEdgeConvolution(Module): 157 | 158 | def __init__(self, node_input_size, edge_input_size, output_size, 159 | adj_matrix, edge_feat_matrix, bias=True): 160 | """ 161 | :param node_input_size: 162 | :param edge_input_size: 163 | :param output_size: 164 | :param adj_matrix: 165 | :param edge_feat_matrix: [N, N, K] 166 | :param bias: 167 | """ 168 | super(GraphNodeEdgeConvolution, self).__init__() 169 | self.input_size = node_input_size + edge_input_size 170 | self.output_size = output_size 171 | self.weight = Parameter(torch.FloatTensor(self.input_size, output_size)) 172 | if bias: 173 | self.bias = Parameter(torch.FloatTensor(output_size)) 174 | else: 175 | self.register_parameter('bias', None) 176 | self.adj_matrix = tensor_normalize(adj_matrix) 177 | self.edge_feat_matrix = edge_feat_matrix 178 | self.reset_parameters() 179 | 180 | def reset_parameters(self): 181 | stdv = 1. / math.sqrt(self.weight.size(1)) 182 | self.weight.data.uniform_(-stdv, stdv) 183 | if self.bias is not None: 184 | self.bias.data.uniform_(-stdv, stdv) 185 | 186 | def forward(self, node_features): 187 | num_nodes = node_features.shape[0] 188 | node_feats = node_features.view(1, num_nodes, -1).expand(num_nodes, -1, -1) 189 | combined_feats = torch.cat([self.edge_feat_matrix, node_feats], dim=-1) 190 | support = torch.matmul(combined_feats, self.weight) # shape [N, N, D] 191 | output = torch.matmul(support.transpose(2, 0), self.adj_matrix) # shape [D, N, N] 192 | output = torch.diagonal(output, dim1=1, dim2=2) 193 | output = output.transpose(1, 0) 194 | return output 195 | 196 | 197 | class EdgeConvolution(nn.Module): 198 | def __init__(self, in_features, out_features, inc_matrix): 199 | """ 200 | :param in_features: 201 | :param out_features: 202 | :param inc_matrix: Sparse incidence matrix of the graph of shape 203 | [N, E]. 204 | """ 205 | super(EdgeConvolution, self).__init__() 206 | self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, 207 | out_features)) 208 | self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features)) 209 | self.inc_matrix = inc_matrix 210 | self.reset_parameters() 211 | 212 | def reset_parameters(self): 213 | stdv = 1. / math.sqrt(self.weight.size(1)) 214 | self.weight.data.uniform_(-stdv, stdv) 215 | if self.bias is not None: 216 | self.bias.data.uniform_(-stdv, stdv) 217 | 218 | def forward(self, edge_nodes, edge_feats): 219 | """ 220 | :param edge_nodes: Matrix indicating the nodes which each edge in the 221 | batch connects. Shape [B, N]. 222 | :param edge_feats: Features of *all* edges in the graph. Shape [E, D]. 223 | :return: Hidden representation of shape [B, K]. 224 | """ 225 | # Get edges incident to the left and right nodes of each edge in the 226 | # batch. Result has shape [B, E]. 
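        # (The product below is inc_matrix.T [E, N] x edge_nodes.T [N, B];
        # its (e, b) entry counts the endpoints that graph edge e shares with
        # batch edge b, and the final transpose yields the [B, E] weights.)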
227 | batch_edge_idcs = sp.mm(self.inc_matrix.transpose(1, 0), 228 | edge_nodes.transpose(1, 0)).transpose(1, 0) 229 | # Normalise matrix row-wise such that edge features are averaged, not 230 | # summed. 231 | row_sum = torch.sum(batch_edge_idcs, dim=1) 232 | inv = 1.0 / row_sum 233 | inv[torch.isinf(inv)] = 0.0 234 | batch_edge_idcs = batch_edge_idcs * inv.view(-1, 1) 235 | 236 | # Compute hidden representations from edge_features 237 | h_edges = torch.mm(edge_feats, self.weight) + self.bias # [E, K] 238 | 239 | # Obtain features of each of these edges 240 | h = torch.spmm(batch_edge_idcs, h_edges) # [B, K] 241 | 242 | return h 243 | 244 | 245 | class DeepGraphConvolution(Module): 246 | """ 247 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 248 | Adapted from https://github.com/tkipf/pygcn/blob/master/pygcn/layers.py 249 | """ 250 | 251 | def __init__(self, in_features, hidden_features, out_features, adj_matrix, 252 | attention_scores=None, bias=True, num_layers=1): 253 | """ 254 | :param in_features: 255 | :param out_features: 256 | :param bias: 257 | :param attention_scores: Sparse tensor containing for each pair of 258 | nodes the attention score between the nodes. Shape [N, N]. 259 | :param adj_matrix: 260 | """ 261 | super(DeepGraphConvolution, self).__init__() 262 | self.in_features = in_features 263 | self.out_features = out_features 264 | self.lin_layers = [] 265 | self.bns = [] 266 | for idx in range(num_layers): 267 | in_size = in_features if idx == 0 else hidden_features 268 | out_size = out_features if idx == num_layers-1 else hidden_features 269 | self.lin_layers.append(nn.Linear(in_size, out_size, bias=bias)) 270 | self.bns.append(nn.BatchNorm1d(out_size)) 271 | self.lin_layers = nn.ModuleList(self.lin_layers) 272 | self.bns = nn.ModuleList(self.bns) 273 | # Pre-compute adjacency matrix with weighting if necessary 274 | self.adj_matrix = adj_matrix 275 | if attention_scores is not None: 276 | self.adj_matrix = tensor_normalize(adj_matrix * attention_scores) 277 | 278 | def forward(self, input): 279 | support = input 280 | for idx in range(len(self.lin_layers)): 281 | support = self.lin_layers[idx](support) 282 | if idx < len(self.lin_layers)-1: 283 | support = F.relu(support) 284 | support = self.bns[idx](support) 285 | output = torch.matmul(self.adj_matrix, support) 286 | return output 287 | 288 | def __repr__(self): 289 | return self.__class__.__name__ + ' (' \ 290 | + str(self.in_features) + ' -> ' \ 291 | + str(self.out_features) + ')' 292 | 293 | 294 | class DeepEdgeConvolution(nn.Module): 295 | def __init__(self, in_features, hidden_features, out_features, inc_matrix, 296 | bias=True, num_layers=1): 297 | """ 298 | :param in_features: 299 | :param out_features: 300 | :param inc_matrix: Sparse incidence matrix of the graph of shape 301 | [N, E]. 302 | """ 303 | super(DeepEdgeConvolution, self).__init__() 304 | self.lin_layers = [] 305 | self.bns = [] 306 | for idx in range(num_layers): 307 | in_size = in_features if idx == 0 else hidden_features 308 | out_size = out_features if idx == num_layers-1 else hidden_features 309 | self.lin_layers.append(nn.Linear(in_size, out_size, bias=bias)) 310 | self.bns.append(nn.BatchNorm1d(out_size)) 311 | self.lin_layers = nn.ModuleList(self.lin_layers) 312 | self.bns = nn.ModuleList(self.bns) 313 | self.inc_matrix = inc_matrix 314 | 315 | def forward(self, edge_nodes, edge_feats): 316 | """ 317 | :param edge_nodes: Matrix indicating the nodes which each edge in the 318 | batch connects. Shape [B, N]. 
319 | :param edge_feats: Features of *all* edges in the graph. Shape [E, D]. 320 | :return: Hidden representation of shape [B, K]. 321 | """ 322 | # Get edges incident to the left and right nodes of each edge in the 323 | # batch. Result has shape [B, E]. 324 | batch_edge_idcs = sp.mm(self.inc_matrix.transpose(1, 0), 325 | edge_nodes.transpose(1, 0)).transpose(1, 0) 326 | # Normalise matrix row-wise such that edge features are averaged, not 327 | # summed. 328 | row_sum = torch.sum(batch_edge_idcs, dim=1) 329 | inv = 1.0 / row_sum 330 | inv[torch.isinf(inv)] = 0.0 331 | batch_edge_idcs = batch_edge_idcs * inv.view(-1, 1) 332 | 333 | # Compute hidden representations from edge_features 334 | h_edges = edge_feats 335 | for idx in range(len(self.lin_layers)): 336 | h_edges = self.lin_layers[idx](h_edges) 337 | if idx < len(self.lin_layers)-1: 338 | h_edges = F.relu(h_edges) 339 | h_edges = self.bns[idx](h_edges) 340 | 341 | # Obtain features of each of these edges 342 | h = torch.spmm(batch_edge_idcs, h_edges) # [B, K] 343 | 344 | return h 345 | 346 | 347 | class PatchToPatchEdgeConvolution(nn.Module): 348 | def __init__(self, in_features, out_features): 349 | super(PatchToPatchEdgeConvolution, self).__init__() 350 | self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, 351 | out_features)) 352 | self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features)) 353 | self.reset_parameters() 354 | 355 | def reset_parameters(self): 356 | stdv = 1. / math.sqrt(self.weight.size(1)) 357 | self.weight.data.uniform_(-stdv, stdv) 358 | if self.bias is not None: 359 | self.bias.data.uniform_(-stdv, stdv) 360 | 361 | def forward(self, edge_nodes, adj_matrix, inc_matrix, edge_feats): 362 | """ 363 | :param edge_nodes: Matrix indicating the nodes which each edge in the 364 | batch connects. Shape [B, N] 365 | :param adj_matrix: Sparse adjacency matrix of the graph of shape 366 | [N, N]. Must contain only 1-entries (i.e. should not be normalised). 367 | :param inc_matrix: Sparse incidence matrix of the graph of shape 368 | [N, E]. 369 | :param edge_feats: Features of *all* edges in the graph. Shape [E, D]. 370 | :return: Hidden representation of shape [B, K]. 371 | """ 372 | # Get edges incident to the left and right nodes of each edge in the 373 | # batch. Result has shape [B, E]. 374 | # In essence, it computes BxN * NxN * NxE 375 | # = edge_nodes * adj_matrix * inc_matrix. 376 | batch_edge_idcs = sp.mm(adj_matrix.transpose(1, 0), 377 | edge_nodes.transpose(1, 0)) 378 | batch_edge_idcs = sp.mm(inc_matrix.transpose(1, 0), 379 | batch_edge_idcs).transpose(1, 0) 380 | # Find exactly those edges which are two "hops" away from the edge 381 | # in the batch 382 | batch_edge_idcs = (batch_edge_idcs == 2.0).float() 383 | # Normalise matrix row-wise such that edge features are averaged, not 384 | # summed.
385 | row_sum = torch.sum(batch_edge_idcs, dim=1) 386 | inv = 1.0 / row_sum 387 | inv[torch.isinf(inv)] = 0.0 388 | batch_edge_idcs = batch_edge_idcs * inv.view(-1, 1) 389 | 390 | # Compute hidden representations from edge_features 391 | h_edges = torch.mm(edge_feats, self.weight) + self.bias # [E, K] 392 | 393 | # Obtain features of each of these edges 394 | h = torch.spmm(batch_edge_idcs, h_edges) # [B, K] 395 | 396 | return h 397 | 398 | 399 | class GraphAttentionLayer(nn.Module): 400 | """ 401 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903 402 | """ 403 | 404 | def __init__(self, in_features, out_features, adj_matrix, dropout, alpha, 405 | edge_feats, edge_idcs, concat=True): 406 | super(GraphAttentionLayer, self).__init__() 407 | self.dropout = dropout 408 | self.in_features = in_features 409 | self.out_features = out_features 410 | self.adj_matrix = adj_matrix 411 | self.alpha = alpha 412 | self.concat = concat 413 | num_nodes = adj_matrix.shape[0] 414 | self.edge_feats = torch.zeros( 415 | (num_nodes, num_nodes, edge_feats.shape[1])).to( 416 | device=edge_idcs.device) 417 | self.edge_feats[edge_idcs[:, 0], edge_idcs[:, 1], :] = edge_feats 418 | 419 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features))) 420 | nn.init.xavier_uniform_(self.W.data, gain=1.414) 421 | self.a = nn.Parameter(torch.zeros( 422 | size=(2 * out_features + self.edge_feats.shape[-1], 1))) 423 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 424 | 425 | self.leakyrelu = nn.LeakyReLU(self.alpha) 426 | 427 | def forward(self, input): 428 | h = torch.mm(input, self.W) 429 | N = h.size()[0] 430 | 431 | a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features) 432 | a_input = torch.cat([a_input, self.edge_feats], dim=-1) 433 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2)) 434 | 435 | zero_vec = -9e15*torch.ones_like(e) 436 | attention = torch.where(self.adj_matrix > 0, e, zero_vec) 437 | attention = F.softmax(attention, dim=1) 438 | attention = F.dropout(attention, self.dropout, training=self.training) 439 | h_prime = torch.matmul(attention, h) 440 | 441 | if self.concat: 442 | return F.elu(h_prime) 443 | else: 444 | return h_prime 445 | 446 | -------------------------------------------------------------------------------- /dataset_preparation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides functionality for creating a data split of a given input 3 | graph into training, validation and test sets of edges. 
4 | """ 5 | import pathlib 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from utils import bin_data 10 | 11 | 12 | def prepare_dataset(): 13 | np.random.seed(7) 14 | city = "London_high" 15 | bin_bounds = [10.0, 100.0, 1000.0, 30000.0] 16 | include_spatial_lag = False 17 | data_path = pathlib.Path("Data/" + city) 18 | 19 | # Load adjacency matrix specifying which nodes lie in the geographical 20 | # neighborhood of each other 21 | geo_adj_matrix, geo_adj_idcs = _load_geo_adj_matrix(city) 22 | 23 | # Load pandas data frames containing node and edge data 24 | node_data, edge_data = _load_dataframes(city) 25 | true_flows = edge_data["flows"].values # also contains 0-valued flows 26 | edge_idcs = edge_data[["location_1", "location_2"]].values 27 | 28 | num_nodes = geo_adj_matrix.shape[0] 29 | num_geo_edges = geo_adj_idcs.shape[1] 30 | 31 | print(f"node columns: {node_data.columns}") 32 | print(node_data.head()) 33 | print(f"edge columns: {edge_data.columns}") 34 | print(edge_data.head()) 35 | 36 | (val_node_idcs, test_node_idcs, val_edge_idcs, 37 | test_edge_idcs, train_edge_idcs, bin_idcs) = _get_node_split(node_data, 38 | edge_data, 39 | bin_bounds) 40 | 41 | known_flows = _compute_known_flows(true_flows, edge_idcs, edge_data, 42 | val_node_idcs, test_node_idcs) 43 | 44 | if include_spatial_lag: 45 | raise NotImplementedError 46 | # approx_flows, node_data = _substitute_in_approximations( 47 | # val_node_idcs, test_node_idcs, flow_adj_idcs, flows, 48 | # edge_data["origin_to_neighbourhood"].values, 49 | # edge_data["neighbourhood_to_destination"].values, node_data) 50 | 51 | # Drop unused node and edge features 52 | node_data, edge_data = _filter_feature_data(node_data, edge_data, 53 | include_spatial_lag) 54 | 55 | # Remove 0-valued edges that are not in the training, validation, or test 56 | # set and update the set indices accordingly 57 | (train_edge_idcs, val_edge_idcs, test_edge_idcs, edge_idcs, true_flows, 58 | known_flows, edge_data) = _remove_unused_zero_edges(train_edge_idcs, 59 | val_edge_idcs, 60 | test_edge_idcs, 61 | edge_idcs, true_flows, 62 | known_flows, 63 | edge_data) 64 | flow_adj_idcs, flow_adj_values = _compute_flow_adj_matrix(known_flows, edge_idcs) 65 | num_flow_edges = flow_adj_idcs.shape[1] 66 | 67 | # Compute incidence graphs from adjacency matrices 68 | flow_inc_indices = _compute_incidence_matrix(flow_adj_idcs, num_nodes, 69 | num_flow_edges) 70 | geo_inc_indices = _compute_incidence_matrix(geo_adj_idcs, num_nodes, 71 | num_geo_edges) 72 | 73 | _store_dataset_files(data_path, edge_data, node_data, true_flows, 74 | flow_adj_idcs, flow_adj_values, geo_adj_idcs, 75 | flow_inc_indices, geo_inc_indices, train_edge_idcs, 76 | val_edge_idcs, test_edge_idcs) 77 | 78 | 79 | def _load_geo_adj_matrix(city): 80 | path = f'../raw_data/{city}/geo_adj_matrix.csv' 81 | geo_adj_matrix = np.genfromtxt(path, delimiter=',')[1:, 1:] 82 | geo_adj_indices = np.stack(np.nonzero(geo_adj_matrix)) 83 | return geo_adj_matrix, geo_adj_indices 84 | 85 | 86 | def _load_dataframes(city): 87 | node_path = f'../raw_data/{city}/node_data.csv' 88 | node_data = pd.read_csv(node_path, header=0, index_col=0) 89 | 90 | edge_path = f'../raw_data/{city}/edge_data.csv' 91 | edge_data = pd.read_csv(edge_path, header=0, index_col=0) 92 | 93 | return node_data, edge_data 94 | 95 | 96 | def _get_node_split(node_data, edge_data, bin_bounds): 97 | flows = edge_data["flows"].values 98 | edge_idcs = edge_data.values[:, :2].astype(np.int) # Ex2 array indicating the two nodes an edge connects 99 | 
num_edges = len(edge_idcs) 100 | num_nodes = len(node_data) 101 | num_bins = len(bin_bounds) 102 | bin_idcs = bin_data(flows, num_bins, scale="custom", bin_bounds=bin_bounds) 103 | bin_counts = np.bincount(bin_idcs) 104 | bin_counts[0] = np.sum((flows > 0) & (flows < bin_bounds[0])) # Special case: When it comes to compute the fraction of edges of the smallest bin, we exclude the huge number of 0-valued edges 105 | smallest_bin_idx = np.argmin(bin_counts) 106 | bin_samples, = np.where(bin_idcs == smallest_bin_idx) 107 | np.random.shuffle(bin_samples) 108 | 109 | test_edge_set, test_node_set = _create_node_set(set(), set(), 110 | smallest_bin_idx, 111 | bin_samples, bin_idcs, 112 | edge_idcs, num_bins, 113 | 0.2 * bin_counts) 114 | val_edge_set, val_node_set = _create_node_set(test_node_set, test_edge_set, 115 | smallest_bin_idx, 116 | bin_samples, bin_idcs, 117 | edge_idcs, num_bins, 118 | 0.1 * bin_counts) 119 | 120 | # Create training set by selecting all non-zero-valued edges and a limited 121 | # number of zero-valued edges 122 | non_train_node_set = val_node_set.union(test_node_set) 123 | train_edge_set = set() 124 | max_num_zero = 10000 # include limited number of zero-valued edges 125 | num_zero = 0 126 | for edge_idx, (flow, edge_idcs) in enumerate(zip(flows, edge_idcs)): 127 | if (not edge_idcs[0] in non_train_node_set 128 | and not edge_idcs[1] in non_train_node_set): 129 | if flow >= 1.0: 130 | train_edge_set.add(edge_idx) 131 | elif num_zero < max_num_zero: 132 | train_edge_set.add(edge_idx) 133 | num_zero += 1 134 | 135 | assert len(test_edge_set.intersection(val_edge_set)) == 0 136 | assert len(test_edge_set.intersection(train_edge_set)) == 0 137 | assert len(val_edge_set.intersection(train_edge_set)) == 0 138 | assert len(test_node_set.intersection(val_node_set)) == 0 139 | 140 | val_node_idcs = np.array(list(val_node_set), dtype=np.int) 141 | test_node_idcs = np.array(list(test_node_set), dtype=np.int) 142 | val_edge_idcs = np.array(list(val_edge_set), dtype=np.int) 143 | test_edge_idcs = np.array(list(test_edge_set), dtype=np.int) 144 | train_edge_idcs = np.array(list(train_edge_set), dtype=np.int) 145 | return (val_node_idcs, test_node_idcs, val_edge_idcs, test_edge_idcs, 146 | train_edge_idcs, bin_idcs) 147 | 148 | 149 | def _create_node_set(unavailable_nodes, unavailable_edges, smallest_bin_idx, 150 | bin_samples, bin_idcs, edge_idcs, num_bins, max_per_bin, 151 | MAX_PER_BIN_AND_NODE=110): 152 | """ 153 | We create a validation/test set of edges by randomly drawing edges from 154 | the bin with the smallest number of samples in them. For each sampled edge, 155 | we choose one of the incident nodes and add them to a set of nodes excluded 156 | from training. Then we take the edges incident to that node and add them 157 | to the validation/test set (except when we already added enough edges for 158 | a bucket and we also only add a maximum number of edges of the same bucket 159 | per node; otherwise the most frequent bucket type would contain mostly 160 | edges of the first few nodes). 161 | :param unavailable_nodes: Set of node indices that are no longer available 162 | for being included in the new validation/test set. 163 | :param unavailable_edges: Set of edge indices that are no longer available 164 | for being included in the new validation/test set. 165 | :param smallest_bin_idx: Index of the bin with the fewest samples. 166 | Determines how to greedily select edges. 
167 | :param bin_samples: NumPy array of edge indices belonging to the smallest 168 | bin that are used to guide the creation of the node set. 169 | :param bin_idcs: E-shaped vector specifying for each edge which bin it 170 | belongs to. 171 | :param edge_idcs: Ex2-shaped tensor indicating the indices of the two nodes 172 | an edge connects. 173 | :param num_bins: Number of bins. 174 | :param max_per_bin: Maximum number of edges to find for each bin. 175 | :param MAX_PER_BIN_AND_NODE: Maximum number of edges that a single node 176 | may add to a single bin. 177 | :return: 178 | """ 179 | def add_incident_edges(inc_edge_idcs, current_set_edges, 180 | current_set_bin_counts): 181 | """ 182 | Adds the edges given in `inc_edge_idcs` to `current_set_edges` subject 183 | to some conditions. 184 | :param inc_edge_idcs: 185 | :param current_set_edges: 186 | :param current_set_bin_counts: 187 | :return: 188 | """ 189 | added_count = 0 # Number of edges actually added 190 | node_bin_counts = np.zeros(num_bins) # Bin counts for edges actually added 191 | for inc_edge_idx in inc_edge_idcs: 192 | edge_bin = bin_idcs[inc_edge_idx] 193 | if (inc_edge_idx not in current_set_edges 194 | and inc_edge_idx not in unavailable_edges 195 | and node_bin_counts[edge_bin] < MAX_PER_BIN_AND_NODE 196 | and current_set_bin_counts[edge_bin]+node_bin_counts[edge_bin] < max_per_bin[edge_bin]): 197 | added_count += 1 198 | node_bin_counts[edge_bin] += 1 199 | current_set_edges.add(inc_edge_idx) 200 | return added_count, node_bin_counts 201 | 202 | # Samples in smallest bin 203 | set_nodes, set_edges = set(), set() # Nodes and edges selected for validation/test 204 | set_bin_counts = np.zeros(num_bins) # Counts of the edges in validation/test set for each bin 205 | 206 | for idx, edge_idx in enumerate(bin_samples): 207 | # If we have enough edges of the rarest (smallest) type, we can stop 208 | # adding edges. 209 | if set_bin_counts[smallest_bin_idx] >= max_per_bin[smallest_bin_idx]: 210 | break 211 | # If the edge is already in a different set, do not include it 212 | if edge_idx in unavailable_edges: 213 | continue 214 | out_node, in_node = tuple(edge_idcs[edge_idx]) 215 | 216 | # If both nodes are no longer available, go to next edge 217 | if in_node in unavailable_nodes and out_node in unavailable_nodes: 218 | continue 219 | # Decide which of the two nodes to add based on whether one is already 220 | # in the node set or no longer available 221 | if in_node in set_nodes or out_node in unavailable_nodes: # We have already added in_node to set_nodes in a previous iteration OR in_node belongs to an excluded set (e.g. the validation set created in a previous call to this method). 222 | node_to_add = in_node 223 | elif out_node in set_nodes or in_node in unavailable_nodes: 224 | node_to_add = out_node 225 | else: 226 | node_to_add = in_node 227 | 228 | # Add node to the set 229 | set_nodes.add(node_to_add) 230 | # Now add all the edges incident to the new node to the edge set 231 | # (subject to some conditions). 
232 | # For outgoing edges 233 | out_edge_idcs, = np.where(edge_idcs[:, 0] == node_to_add) 234 | add_set_out_count, add_node_bin_counts = add_incident_edges(out_edge_idcs, set_edges, set_bin_counts) 235 | set_bin_counts += add_node_bin_counts 236 | # For incoming edges 237 | in_edge_idcs, = np.where(edge_idcs[:, 1] == node_to_add) 238 | add_set_in_count, add_node_bin_counts = add_incident_edges(in_edge_idcs, set_edges, set_bin_counts) 239 | set_bin_counts += add_node_bin_counts 240 | 241 | if np.any(set_bin_counts >= max_per_bin): 242 | print(f"One bin full after adding {idx+1} edges.") 243 | return set_edges, set_nodes 244 | 245 | 246 | def _compute_known_flows(true_flows, edge_idcs, edge_data, val_node_idcs, 247 | test_node_idcs): 248 | known_flows = np.copy(true_flows) 249 | unknown_nodes = np.concatenate((val_node_idcs, test_node_idcs)) 250 | loc1_unknown = np.isin(edge_idcs[:, 0], unknown_nodes) 251 | loc2_unknown = np.isin(edge_idcs[:, 1], unknown_nodes) 252 | known_flows[loc1_unknown] = edge_data["neighbourhood_to_location2"].iloc[loc1_unknown] 253 | known_flows[loc2_unknown] = edge_data["location1_to_neighbourhood"].iloc[loc2_unknown] 254 | return known_flows 255 | 256 | 257 | def _substitute_in_approximations(val_nodes, test_nodes, adj_idcs, flows, 258 | o2n_flow_approx, n2d_flow_approx, node_data): 259 | train_flows = np.copy(flows) 260 | 261 | # Replace flows for edges incident to validation nodes by approximations 262 | val_outgoing_edge_idcs = np.isin(adj_idcs[0], val_nodes) 263 | train_flows[val_outgoing_edge_idcs] = n2d_flow_approx[val_outgoing_edge_idcs] 264 | val_incoming_edge_idcs = np.isin(adj_idcs[1], val_nodes) 265 | train_flows[val_incoming_edge_idcs] = o2n_flow_approx[val_incoming_edge_idcs] 266 | 267 | # Replace flows for edges incident to test nodes by approximations 268 | test_outgoing_edges = np.isin(adj_idcs[0], test_nodes) 269 | train_flows[test_outgoing_edges] = n2d_flow_approx[test_outgoing_edges] 270 | test_incoming_edges = np.isin(adj_idcs[1], test_nodes) 271 | train_flows[test_incoming_edges] = o2n_flow_approx[test_incoming_edges] 272 | 273 | # Set flows of edges between nodes within the two sets to 0 274 | union_nodes = np.concatenate((val_nodes, test_nodes)) 275 | inner_edges = np.logical_and((np.isin(adj_idcs[0], union_nodes)), (np.isin(adj_idcs[1], union_nodes))) 276 | train_flows[inner_edges] = 0.0 277 | 278 | # In node features, replace flow-dependent values of validation/test nodes 279 | # by their spatial-lag approximation 280 | node_data.loc[node_data["nodeID"].isin(union_nodes), "out_total"] = node_data["out_total_spatial_lag"] 281 | node_data.loc[node_data["nodeID"].isin(union_nodes), "in_total"] = node_data["in_total_spatial_lag"] 282 | node_data.loc[node_data["nodeID"].isin(union_nodes), "gyration_radius"] = node_data["gyration_radius_spatial_lag"] 283 | 284 | return train_flows, node_data 285 | 286 | 287 | def _remove_unused_zero_edges(train_edge_idcs, val_edge_idcs, test_edge_idcs, 288 | edge_idcs, true_flows, known_flows, edge_data): 289 | """ 290 | We keep 0-valued edges that belong to the training, validation, or test 291 | set because the data set class uses the entries of a sparse matrix for 292 | some of its computations, and 0-valued entries would simply be dropped, 293 | which would mess up the indexing of the edges that make up each set. 294 | Hence, we only remove those 0-valued edges that are not referenced by any 295 | of the three sets and update the set indices accordingly.
296 | :param train_edge_idcs: 297 | :param val_edge_idcs: 298 | :param test_edge_idcs: 299 | :param edge_idcs: 300 | :param true_flows: 301 | :param known_flows: 302 | :param edge_data: 303 | :return: 304 | """ 305 | # all edges in either the training, validation or test set 306 | num_edges = len(true_flows) 307 | tvt_edges = np.concatenate((train_edge_idcs, val_edge_idcs, 308 | test_edge_idcs), axis=-1) 309 | tvt_edges = np.isin(np.arange(num_edges), tvt_edges) # convert to boolean array 310 | 311 | # Find out the indices of the training, validation, and test edges within 312 | # the filtered set of edges 313 | indices = np.zeros(num_edges, dtype=np.int) 314 | indices[train_edge_idcs] = 1 315 | indices[val_edge_idcs] = 2 316 | indices[test_edge_idcs] = 3 317 | # Remove 0-valued edges that are not in the training, validation, or test 318 | # set, i.e. keep edges that are in one of the sets or have a non-zero exact 319 | # or approximate flow. 320 | retained_edges = tvt_edges | (known_flows != 0.0) | (true_flows != 0.0) 321 | indices = indices[retained_edges] 322 | edge_idcs = edge_idcs[retained_edges] 323 | true_flows = true_flows[retained_edges] 324 | known_flows = known_flows[retained_edges] 325 | edge_data = edge_data.iloc[retained_edges] 326 | # Update indices 327 | train_edge_idcs = np.where(indices == 1)[0] 328 | val_edge_idcs = np.where(indices == 2)[0] 329 | test_edge_idcs = np.where(indices == 3)[0] 330 | 331 | return (train_edge_idcs, val_edge_idcs, test_edge_idcs, edge_idcs, 332 | true_flows, known_flows, edge_data) 333 | 334 | 335 | def _compute_flow_adj_matrix(known_flows, edge_idcs): 336 | """ 337 | :param known_flows: Specifies for each edge the known flow (i.e. true flow 338 | or spatial lag flow). Shape [E]. 339 | :param edge_idcs: Specifies for each edge the indices of the two incident 340 | nodes. Shape [E, 2]. 341 | :return: 342 | - flow_adj_idcs: Indices of non-zero entries in the flow adjacenty 343 | matrix. Shape [2E, 2]. 344 | - know_flows: Specifies the non-zero values in the flow matrix. Shape 345 | [2E]. 
346 | """ 347 | upper_triag_idcs = edge_idcs[known_flows > 0.0] 348 | known_flows = known_flows[known_flows > 0.0] 349 | lower_triag_idcs = np.stack((upper_triag_idcs[:, 1], upper_triag_idcs[:, 0]), axis=1) 350 | flow_adj_idcs = np.concatenate((upper_triag_idcs, lower_triag_idcs), axis=0) 351 | flow_adj_values = np.concatenate((known_flows, np.copy(known_flows))) 352 | return flow_adj_idcs, flow_adj_values 353 | 354 | 355 | def _compute_incidence_matrix(adj_indices, num_nodes, num_edges): 356 | # For both incoming and outgoing edges 357 | inc_matrix = np.zeros((num_nodes, num_edges)) 358 | inc_matrix[adj_indices[0], np.arange(num_edges)] = 1 359 | inc_matrix[adj_indices[1], np.arange(num_edges)] = 1 360 | inc_indices = np.stack(np.nonzero(inc_matrix)) 361 | return inc_indices 362 | 363 | 364 | def _compute_node_idcs_matrix(node_idcs, edge_idcs_set, num_nodes): 365 | num_edges = len(edge_idcs_set) 366 | node_idcs_matrix = np.zeros(num_edges, num_nodes) 367 | node_idcs_matrix[np.arange(num_edges), node_idcs] = 1.0 368 | node_idcs_matrix[np.arange(num_edges), node_idcs] = 1.0 369 | return node_idcs_matrix 370 | 371 | 372 | def _filter_feature_data(node_data, edge_data, include_spatial_lag): 373 | edge_data = edge_data.drop(["flows"], axis=1) 374 | node_data = node_data.drop(["nodeID", "in_total_spatial_lag", 375 | "out_total_spatial_lag", 376 | "gyration_radius_spatial_lag"], axis=1) 377 | if not include_spatial_lag: 378 | node_data = node_data.drop(["in_total", "out_total", "gyration_rad"], 379 | axis=1) 380 | 381 | return node_data, edge_data 382 | 383 | 384 | def _store_dataset_files(data_path, edge_data, node_data, flows, flow_adj_idcs, 385 | flow_adj_values, geo_adj_idcs, flow_inc_indices, 386 | geo_inc_indices, train_edge_idcs, val_edge_idcs, 387 | test_edge_idcs): 388 | data_path.mkdir(exist_ok=True) 389 | pd.to_pickle(edge_data, data_path / "edge_data.pk") 390 | pd.to_pickle(node_data, data_path / "node_data.pk") 391 | np.save(data_path / "flows.npy", flows) 392 | np.save(data_path / "flow_adj_indices.npy", flow_adj_idcs) 393 | np.save(data_path / "flow_adj_values.npy", flow_adj_values) 394 | np.save(data_path / "geo_adj_indices.npy", geo_adj_idcs) 395 | np.save(data_path / "flow_inc_indices.npy", flow_inc_indices) 396 | np.save(data_path / "geo_inc_indices.npy", geo_inc_indices) 397 | np.save(data_path / "train_edge_indices.npy", train_edge_idcs) 398 | np.save(data_path / "val_edge_indices.npy", val_edge_idcs) 399 | np.save(data_path / "test_edge_indices.npy", test_edge_idcs) 400 | 401 | 402 | if __name__ == '__main__': 403 | prepare_dataset() 404 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader, BatchSampler 7 | from torch.utils.data import WeightedRandomSampler 8 | import scipy.sparse as ssp 9 | import sklearn.preprocessing as prep 10 | import sklearn.pipeline as ppln 11 | from sklearn.utils import class_weight 12 | import matplotlib.pyplot as plt 13 | 14 | from utils import to_sparse_tensor, bin_data, normalize, split_bucketed_data, \ 15 | summarize_feature_matrix 16 | 17 | 18 | def get_composite_transformer(n_quantiles): 19 | transformer = ppln.Pipeline([ 20 | ("quantile", prep.QuantileTransformer(output_distribution="normal", 21 | n_quantiles=n_quantiles)), 22 | ("normalize", prep.StandardScaler()) 
23 | ]) 24 | return transformer 25 | 26 | 27 | class BinnedTransformer: 28 | 29 | def __init__(self, num_bins, create_transformer_f): 30 | self.num_bins = num_bins 31 | 32 | self.transformers = [create_transformer_f() for _ in range(num_bins)] 33 | 34 | def fit_transform(self, x_reg, x_class): 35 | transformed_x_reg = np.copy(x_reg) 36 | for bin_idx in range(self.num_bins): 37 | sample_idcs = x_class == bin_idx 38 | transformer = self.transformers[bin_idx] 39 | transformed_x_reg[sample_idcs] = transformer.fit_transform( 40 | transformed_x_reg[sample_idcs]) 41 | return transformed_x_reg 42 | 43 | def inverse_transform(self, x_reg, x_class): 44 | x_reg = x_reg.reshape(-1) 45 | transformed_x_reg = np.copy(x_reg) 46 | for bin_idx in range(self.num_bins): 47 | sample_idcs = x_class == bin_idx 48 | if np.sum(sample_idcs) == 0: continue # no sample of that class 49 | transformer = self.transformers[bin_idx] 50 | transformed_x_reg[sample_idcs] = transformer.inverse_transform( 51 | x_reg[sample_idcs].reshape(-1, 1)).reshape(-1) 52 | return transformed_x_reg 53 | 54 | 55 | class GraphTopologicalData: 56 | 57 | def __init__(self, adj_matrix=None, unweighted_adj_matrix=None, 58 | inc_matrix=None, inc_matrix_dense=None, edge_indices=None, 59 | edge_weights=None): 60 | self.adj_matrix = adj_matrix # NxN sparse matrix 61 | self.unweighted_adj_matrix = unweighted_adj_matrix # NxN sparse matrix 62 | self.inc_matrix = inc_matrix # NxE sparse matrix 63 | self.inc_matrix_dense = inc_matrix_dense # NxE dense matrix 64 | self.edge_indices = edge_indices # Ex2 dense matrix 65 | self.edge_weights = edge_weights # E dense vector 66 | 67 | 68 | class UrbanPlanningDataset: 69 | 70 | def __init__(self, data_base_path="Data/", num_bins=4, batch_size=32, 71 | n_quantiles=1000, resample=False, 72 | excluded_node_feature_columns=tuple(), 73 | excluded_edge_feature_columns=tuple(), 74 | use_binned_transformer=False, include_approx_flows=False, 75 | flow_adj_threshold=0, seed=7): 76 | """ 77 | Loads city data set. 78 | :param data_base_path: Location at which to find the node features, 79 | edge features, and the adjacency matrix. 80 | :param num_bins: Number of bins for dividing the data set labels. The 81 | bin index may be a classification target or for computing MAEs for each 82 | bin separately. 83 | :param batch_size: 84 | :param n_quantiles: Number of quantiles to use for the quantile 85 | transformer that preprocesses features and labels. 86 | :param excluded_node_feature_columns: Tuple of names of the columns 87 | to remove from the node feature data set. 88 | :param excluded_edge_feature_columns: Tuple of names of the columns to 89 | remove from the edge feature data set. 90 | :param resample: If True, we use a weighted random sampler to ensure 91 | that each epoch contains an equal number of samples from each bin. 92 | :param use_binned_transformer: If True, the edge labels are rescaled 93 | using an individual transformer for each bin. Inverting the 94 | transformation then requires both a regression and classification 95 | prediction. 96 | :param include_approx_flows: If True, the edge features include the 97 | approximate flows (normally used just for flow adjacency matrix). 98 | :param flow_adj_threshold: When constructing the unweighted flow 99 | adjacency matrix, only include edges with a flow greater or equal that 100 | threshold. 101 | :param seed: Random seed to always obtain the same split into training, 102 | validation, and test set. 
103 | :return: Tuple consisting of 104 | - Node features of shape [N, K] 105 | - Sparse adjacency matrix of shape [N, N] 106 | - Loader for the training set of edges 107 | - Loader for the validation set of edges 108 | - Loader for the test set of edges 109 | - Number of node features 110 | - Number of edge features 111 | - Scaler used for edge labels 112 | """ 113 | print("Loading data") 114 | 115 | self.num_bins = num_bins 116 | self.batch_size = batch_size 117 | self.n_quantiles = n_quantiles 118 | self.use_binned_transformer = use_binned_transformer 119 | 120 | get_composite_transformer_f = lambda: get_composite_transformer( 121 | n_quantiles=n_quantiles) 122 | 123 | # Load node data 124 | (self.node_feats, self.num_nodes, self.num_node_feats, 125 | self.node_scaler) = self._load_node_data(data_base_path, 126 | get_composite_transformer_f, 127 | excluded_node_feature_columns) 128 | 129 | # Load edge data 130 | (flow_edge_indices, self.edge_feats, self.edge_labels, 131 | self.edge_labels_unscaled, self.label_scaler, self.edge_scaler, 132 | self.num_edges, self.num_edge_feats) = self._load_edge_data( 133 | data_base_path, 134 | get_composite_transformer_f, 135 | include_approx_flows, 136 | excluded_edge_feature_columns) 137 | self.max_label = np.max(self.edge_labels_unscaled) 138 | print(f"\tMax label {self.max_label}") 139 | 140 | (train_idcs, val_idcs, test_idcs) = self._load_dataset_split( 141 | data_base_path) 142 | 143 | # Load flow graph data 144 | (flow_adj_matrix, flow_inc_matrix, flow_adj_indices, 145 | unweighted_flow_adj_matrix, 146 | flow_adj_values) = self._load_flow_graph_data( 147 | data_base_path, self.num_nodes, self.num_edges, flow_adj_threshold) 148 | self.flow_topology = GraphTopologicalData( 149 | adj_matrix=flow_adj_matrix, 150 | edge_indices=flow_adj_indices, 151 | unweighted_adj_matrix=unweighted_flow_adj_matrix, 152 | inc_matrix=flow_inc_matrix, 153 | edge_weights=flow_adj_values 154 | ) 155 | 156 | # Load geographical graph data 157 | (geo_adj_matrix, geo_inc_matrix, 158 | geo_edge_indices, geo_adj_values) = self._load_geo_graph_data( 159 | data_base_path, self.num_nodes, self.num_edges, self.flow_topology) 160 | self.geo_topology = GraphTopologicalData( 161 | adj_matrix=geo_adj_matrix, 162 | inc_matrix=geo_inc_matrix, 163 | edge_indices=geo_edge_indices, 164 | edge_weights=geo_adj_values) 165 | 166 | # Load bin data 167 | self.bin_bounds = [10.0, 100.0, 1000.0, 10000.0] 168 | (self.edge_buckets, self.train_bin_weights, self.val_bin_weights, 169 | self.test_bin_weights) = self._load_bin_data(self.bin_bounds, 170 | self.edge_labels_unscaled, 171 | num_bins, train_idcs, 172 | val_idcs, test_idcs) 173 | print(f"\tBin counts: {np.array([np.sum(self.edge_buckets == i) for i in range(num_bins)])}") 174 | print(f"\tTraining bin weights: {self.train_bin_weights}") 175 | print(f"\tValidation bin weights: {self.val_bin_weights}") 176 | print(f"\tTest bin weights: {self.test_bin_weights}") 177 | 178 | # If specified, use the binned transformer to transform labels 179 | if use_binned_transformer: 180 | self.label_scaler = BinnedTransformer(self.num_bins, 181 | get_composite_transformer_f) 182 | self.edge_labels = self.label_scaler.fit_transform( 183 | self.edge_labels_unscaled.reshape(-1, 1), self.edge_buckets).reshape(-1) 184 | # plt.hist(self.edge_labels, bins=100) 185 | # plt.show() 186 | 187 | # Create edge feature matrix 188 | indices = flow_edge_indices.transpose(1, 0) 189 | values = self.edge_feats 190 | edge_feat_matrix = 
torch.sparse.FloatTensor(torch.from_numpy(indices), torch.from_numpy(values)) 191 | self.edge_feat_matrix = edge_feat_matrix.to_dense() 192 | 193 | # Convert numpy arrays to tensors 194 | self.node_feats = torch.from_numpy(self.node_feats) 195 | self.edge_feats = torch.from_numpy(self.edge_feats) 196 | flow_edge_indices = torch.from_numpy(flow_edge_indices) 197 | self.flow_topology.edge_indices = torch.from_numpy(self.flow_topology.edge_indices) 198 | self.flow_topology.edge_weights = torch.from_numpy(self.flow_topology.edge_weights) 199 | self.geo_topology.edge_indices = torch.from_numpy(self.geo_topology.edge_indices) 200 | self.geo_topology.edge_weights = torch.from_numpy(self.geo_topology.edge_weights) 201 | self.edge_labels = torch.from_numpy(self.edge_labels) 202 | self.edge_labels_unscaled = torch.from_numpy(self.edge_labels_unscaled) 203 | self.edge_buckets = torch.from_numpy(self.edge_buckets) 204 | self.train_bin_weights = torch.from_numpy(self.train_bin_weights) 205 | self.val_bin_weights = torch.from_numpy(self.val_bin_weights) 206 | self.test_bin_weights = torch.from_numpy(self.test_bin_weights) 207 | # Matrices 208 | self.geo_topology.adj_matrix = to_sparse_tensor(normalize(self.geo_topology.adj_matrix)) 209 | self.geo_topology.inc_matrix = to_sparse_tensor(self.geo_topology.inc_matrix) 210 | self.flow_topology.adj_matrix = to_sparse_tensor(self.flow_topology.adj_matrix) # Sparse tensor of shape [N, N] containing the flow values between nodes. 211 | self.flow_topology.unweighted_adj_matrix = to_sparse_tensor(self.flow_topology.unweighted_adj_matrix) 212 | self.flow_topology.inc_matrix = to_sparse_tensor(self.flow_topology.inc_matrix) 213 | self._check_data_consistency() 214 | 215 | # Create data loaders 216 | (self.train_loader, self.val_loader, 217 | self.test_loader) = self._create_data_loaders(train_idcs, val_idcs, 218 | test_idcs, 219 | self.train_bin_weights, 220 | flow_edge_indices, # different from flow_graph_topology.edge_indices because of additional 0-flows 221 | self.edge_feats, 222 | self.edge_labels, 223 | self.edge_buckets, 224 | batch_size, resample, 225 | seed) 226 | 227 | print("Finished loading data") 228 | 229 | def _check_data_consistency(self): 230 | tensors = [self.node_feats, self.edge_feats, 231 | self.flow_topology.edge_indices, 232 | self.geo_topology.edge_indices, self.edge_labels, 233 | self.edge_labels_unscaled, self.edge_buckets, 234 | self.train_bin_weights, self.val_bin_weights, 235 | self.test_bin_weights, self.geo_topology.adj_matrix, 236 | self.geo_topology.inc_matrix, self.flow_topology.adj_matrix, 237 | self.flow_topology.unweighted_adj_matrix, 238 | self.flow_topology.inc_matrix, self.edge_feat_matrix] 239 | print("Checking ", end="") 240 | for idx, tensor in enumerate(tensors): 241 | print(f"{idx}, ", end="") 242 | if (isinstance(tensor, torch.sparse.FloatTensor) or 243 | isinstance(tensor, torch.sparse.LongTensor)): 244 | assert not torch.isnan(tensor.coalesce().indices()).any() 245 | assert not torch.isnan(tensor.coalesce().values()).any() 246 | else: 247 | assert not torch.isnan(tensor).any() 248 | print("done") 249 | 250 | def to(self, device): 251 | """ 252 | Moves all tensors of the dataset that will not be iterated over in 253 | minibatch to the specified device. 254 | :param device: Device specifier. 
255 | """ 256 | self.node_feats = self.node_feats.to(device=device) 257 | self.edge_feats = self.edge_feats.to(device=device) 258 | self.flow_topology.edge_indices = self.flow_topology.edge_indices.to(device=device) 259 | self.geo_topology.edge_indices = self.geo_topology.edge_indices.to(device=device) 260 | self.train_bin_weights = self.train_bin_weights.to(device=device) 261 | self.geo_topology.adj_matrix = self.geo_topology.adj_matrix.to(device=device) 262 | self.geo_topology.inc_matrix = self.geo_topology.inc_matrix.to(device=device) 263 | self.geo_topology.edge_weights = self.geo_topology.edge_weights.to(device=device) 264 | self.flow_topology.adj_matrix = self.flow_topology.adj_matrix.to(device=device) 265 | self.flow_topology.unweighted_adj_matrix = self.flow_topology.unweighted_adj_matrix.to( 266 | device=device) 267 | self.flow_topology.inc_matrix = self.flow_topology.inc_matrix.to(device=device) 268 | self.flow_topology.edge_weights = self.flow_topology.edge_weights.to(device=device) 269 | self.edge_feat_matrix = self.edge_feat_matrix.to(device=device) 270 | 271 | @staticmethod 272 | def _load_node_data(data_base_path, get_composite_transformer_f, 273 | excluded_columns): 274 | # Node features 275 | node_data = pd.read_pickle(os.path.join(data_base_path, "node_data.pk")) 276 | if len(excluded_columns) > 0: 277 | node_data.drop(list(excluded_columns), axis=1, inplace=True) 278 | node_feats = node_data.values 279 | # Rescale continuous features 280 | node_scaler = get_composite_transformer_f() 281 | cont_feature_idcs = UrbanPlanningDataset._get_continuous_feature_idcs(node_data) 282 | node_feats[:, cont_feature_idcs] = node_scaler.fit_transform(node_feats[:, cont_feature_idcs]) 283 | node_feats = node_feats.astype(np.float32) 284 | num_nodes = node_feats.shape[0] 285 | num_node_feats = node_feats.shape[1] 286 | return node_feats, num_nodes, num_node_feats, node_scaler 287 | 288 | @staticmethod 289 | def _load_edge_data(data_base_path, get_composite_transformer_f, 290 | include_approx_flows, excluded_columns): 291 | # Edge data 292 | edge_data = pd.read_pickle(os.path.join(data_base_path, "edge_data.pk")) 293 | if len(excluded_columns) > 0: 294 | edge_data.drop(list(excluded_columns), axis=1, inplace=True) 295 | edge_feats = edge_data.values 296 | edge_indices = edge_feats[:, :2].astype(np.int) 297 | edge_feats = edge_feats[:, 2:] 298 | # Load approximate flows and potentially concatenate to edge features 299 | # approx_flows = np.load(os.path.join(data_base_path, 300 | # "approx_flows.npy")) 301 | if include_approx_flows: 302 | raise NotImplementedError 303 | # edge_feats = np.concatenate((edge_feats, approx_flows.reshape(-1, 1)), 304 | # axis=-1) 305 | num_edges = edge_feats.shape[0] 306 | edge_labels = np.load(os.path.join(data_base_path, "flows.npy")) 307 | edge_labels_unscaled = np.copy(edge_labels).astype(np.float32) 308 | # Transform edge features 309 | edge_scaler = get_composite_transformer_f() 310 | cont_feature_idcs = UrbanPlanningDataset._get_continuous_feature_idcs(edge_data.iloc[:, 2:]) 311 | edge_feats[:, cont_feature_idcs] = edge_scaler.fit_transform(edge_feats)[:, cont_feature_idcs] 312 | edge_feats = edge_feats.astype(np.float32) 313 | # Transform edge labels 314 | edge_labels = edge_labels.astype(np.float32) 315 | label_scaler = get_composite_transformer_f() 316 | edge_labels = label_scaler.fit_transform( 317 | edge_labels.reshape(-1, 1)).reshape(-1) 318 | num_edge_feats = edge_feats.shape[1] 319 | return (edge_indices, edge_feats, edge_labels, 
edge_labels_unscaled, 320 | label_scaler, edge_scaler, num_edges, num_edge_feats) 321 | 322 | @staticmethod 323 | def _load_dataset_split(data_base_path): 324 | data_base_path = pathlib.Path(data_base_path) 325 | train_idcs = np.load(data_base_path / "train_edge_indices.npy") 326 | val_idcs = np.load(data_base_path / "val_edge_indices.npy") 327 | test_idcs = np.load(data_base_path / "test_edge_indices.npy") 328 | return train_idcs, val_idcs, test_idcs 329 | 330 | @staticmethod 331 | def _load_bin_data(bin_bounds, edge_labels_unscaled, num_bins, 332 | train_idcs, val_idcs, test_idcs): 333 | # Get edge buckets (assign each edge to a bucket based on magnitude of 334 | # flow) 335 | edge_buckets = bin_data(edge_labels_unscaled, num_bins, 336 | scale="custom", bin_bounds=bin_bounds) 337 | # Compute weights for each bucket to counterbalance the imbalanced 338 | # class/bin distribution 339 | train_bin_weights = class_weight.compute_class_weight('balanced', 340 | np.unique(edge_buckets), 341 | edge_buckets[train_idcs]) 342 | val_bin_weights = class_weight.compute_class_weight('balanced', 343 | np.unique(edge_buckets), 344 | edge_buckets[val_idcs]) 345 | test_bin_weights = class_weight.compute_class_weight('balanced', 346 | np.unique(edge_buckets), 347 | edge_buckets[test_idcs]) 348 | train_bin_weights = train_bin_weights.astype(np.float32) 349 | val_bin_weights = val_bin_weights.astype(np.float32) 350 | test_bin_weights = test_bin_weights.astype(np.float32) 351 | return edge_buckets, train_bin_weights, val_bin_weights, test_bin_weights 352 | 353 | @staticmethod 354 | def _load_flow_graph_data(data_base_path, num_nodes, num_edges, 355 | flow_adj_threshold): 356 | # Flow adjacency matrix 357 | flow_adj_indices = np.load(os.path.join(data_base_path, 358 | "flow_adj_indices.npy")).T 359 | flow_adj_values = np.load(os.path.join(data_base_path, 360 | "flow_adj_values.npy")) 361 | flow_adj_matrix = ssp.coo_matrix((flow_adj_values, 362 | (flow_adj_indices[0], 363 | flow_adj_indices[1])), 364 | shape=(num_nodes, num_nodes)) 365 | flow_adj_matrix = flow_adj_matrix.tocsr() 366 | 367 | unweighted_flow_adj_indices = flow_adj_indices[:, 368 | flow_adj_values >= flow_adj_threshold] 369 | flow_adj_values = flow_adj_values[flow_adj_values >= flow_adj_threshold] 370 | unweighted_flow_adj_matrix = ssp.coo_matrix( 371 | (flow_adj_values, 372 | (unweighted_flow_adj_indices[0], unweighted_flow_adj_indices[1])), 373 | shape=(num_nodes, num_nodes)) 374 | unweighted_flow_adj_matrix.setdiag(np.ones(num_nodes)) 375 | flow_adj_values = unweighted_flow_adj_matrix.tocoo().data 376 | flow_adj_indices = np.stack((unweighted_flow_adj_matrix.row, 377 | unweighted_flow_adj_matrix.col), axis=-1) 378 | flow_adj_indices = flow_adj_indices.astype(np.int64) 379 | flow_adj_values = flow_adj_values.astype(np.float32) 380 | unweighted_flow_adj_matrix = (unweighted_flow_adj_matrix > 0.0).astype(np.float) 381 | 382 | # Flow incidence matrix for all edges 383 | flow_inc_indices = np.load(os.path.join(data_base_path, 384 | "flow_inc_indices.npy")) 385 | flow_inc_matrix = ssp.coo_matrix( 386 | (np.ones(flow_inc_indices.shape[1]), 387 | (flow_inc_indices[0], 388 | flow_inc_indices[1])), 389 | shape=(num_nodes, num_edges)) 390 | flow_inc_matrix = flow_inc_matrix.tocsr() 391 | 392 | return (flow_adj_matrix, flow_inc_matrix, flow_adj_indices, 393 | unweighted_flow_adj_matrix, flow_adj_values) 394 | 395 | @staticmethod 396 | def _load_geo_graph_data(data_base_path, num_nodes, num_edges, 397 | flow_topology): 398 | # Geographical adjacency matrix 
399 | geo_adj_indices = np.load(os.path.join(data_base_path, 400 | "geo_adj_indices.npy")) 401 | geo_adj_matrix = ssp.coo_matrix((np.ones(geo_adj_indices.shape[1]), 402 | (geo_adj_indices[0], 403 | geo_adj_indices[1])), 404 | shape=(num_nodes, num_nodes)) 405 | geo_adj_matrix = geo_adj_matrix.tocsr() 406 | 407 | # Geographical incidence matrix for all edges 408 | geo_inc_indices = np.load(os.path.join(data_base_path, 409 | "geo_inc_indices.npy")) 410 | geo_inc_matrix = ssp.coo_matrix( 411 | (np.ones(geo_inc_indices.shape[1]), 412 | (geo_inc_indices[0], 413 | geo_inc_indices[1])), 414 | shape=(num_nodes, num_edges)) 415 | geo_inc_matrix = geo_inc_matrix.tocsr() 416 | 417 | # Get flows for the geographical edges 418 | all_edges = np.array(flow_topology.adj_matrix.todense()).reshape(-1) # N^2 matrix 419 | geo_indices_of_edges = np.array(geo_adj_matrix.todense()).reshape(-1).nonzero() # N^2 matrix 420 | geo_flows = all_edges[geo_indices_of_edges] 421 | del all_edges 422 | all_edges = None 423 | del geo_indices_of_edges 424 | geo_indices_of_edges = None 425 | geo_flows = (geo_flows+1e-5).astype(np.float32) 426 | 427 | return geo_adj_matrix, geo_inc_matrix, geo_adj_indices.T, geo_flows 428 | 429 | @staticmethod 430 | def _create_data_loaders(train_idcs, val_idcs, test_idcs, 431 | train_bin_weights, edge_indices, edge_feats, 432 | edge_labels, edge_buckets, batch_size, resample, 433 | seed): 434 | """ 435 | :param train_idcs: 436 | :param val_idcs: 437 | :param test_idcs: 438 | :param train_bin_weights: 439 | :param edge_indices: 440 | :param edge_feats: 441 | :param edge_labels: 442 | :param edge_buckets: 443 | :param flow_node_edges_matrix: Transpose of the incidence matrix 444 | for incoming edges. Shape [E, N]. 445 | :param batch_size: 446 | :param resample: 447 | :param seed: 448 | :return: 449 | """ 450 | assert (len(edge_indices) == len(edge_feats) == len(edge_labels) 451 | == len(edge_buckets)) 452 | 453 | train_idcs = torch.from_numpy(train_idcs) 454 | val_idcs = torch.from_numpy(val_idcs) 455 | test_idcs = torch.from_numpy(test_idcs) 456 | 457 | # Sample weights 458 | train_sample_weights = train_bin_weights[edge_buckets[train_idcs]] 459 | 460 | # Compute split into training, validation, and test set 461 | np.random.seed(seed) 462 | if resample: 463 | train_sampler = BatchSampler( 464 | WeightedRandomSampler(train_sample_weights, 465 | train_idcs.shape[0]), 466 | batch_size=batch_size, drop_last=False) 467 | train_loader = DataLoader(TensorDataset(edge_indices[train_idcs], 468 | edge_feats[train_idcs], 469 | edge_labels[train_idcs], 470 | edge_buckets[train_idcs]), 471 | batch_sampler=train_sampler) 472 | else: 473 | train_loader = DataLoader(TensorDataset(edge_indices[train_idcs], 474 | edge_feats[train_idcs], 475 | edge_labels[train_idcs], 476 | edge_buckets[train_idcs]), 477 | batch_size=batch_size, shuffle=False) 478 | val_loader = DataLoader(TensorDataset(edge_indices[val_idcs], 479 | edge_feats[val_idcs], 480 | edge_labels[val_idcs], 481 | edge_buckets[val_idcs]), 482 | batch_size=batch_size, shuffle=False) 483 | test_loader = DataLoader(TensorDataset(edge_indices[test_idcs], 484 | edge_feats[test_idcs], 485 | edge_labels[test_idcs], 486 | edge_buckets[test_idcs]), 487 | batch_size=batch_size, shuffle=False) 488 | return train_loader, val_loader, test_loader 489 | 490 | @staticmethod 491 | def _get_continuous_feature_idcs(df): 492 | continuous_feature_idcs = [] 493 | for idx, col in enumerate(df.columns): 494 | if len(df[col].unique()) > 2: 495 | 
continuous_feature_idcs.append(idx) 496 | return continuous_feature_idcs 497 | 498 | 499 | if __name__ == '__main__': 500 | ds = UrbanPlanningDataset(data_base_path="Data/London_high/", 501 | use_binned_transformer=True, 502 | excluded_node_feature_columns=tuple()) 503 | 504 | print("\n\nNode features") 505 | summarize_feature_matrix(ds.node_feats.numpy()) 506 | print("\n\nEdge features") 507 | summarize_feature_matrix(ds.edge_feats.numpy()) --------------------------------------------------------------------------------
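
A minimal, hypothetical end-to-end sketch of how `dataset_preparation.py` and `dataset.py` fit together (not part of the repository): it assumes the raw CSV files read by `dataset_preparation.py` are available under `../raw_data/London_high/`, and the keyword values passed below are illustrative defaults rather than the authors' settings.

```python
# Hypothetical usage sketch, not part of the repository. Assumes the raw CSVs
# expected by dataset_preparation.py exist under ../raw_data/London_high/ and
# that the Data/ directory exists; keyword values are illustrative only.
import torch

from dataset_preparation import prepare_dataset
from dataset import UrbanPlanningDataset

# One-off preprocessing: writes node_data.pk, edge_data.pk, flows.npy, the
# adjacency/incidence index files and the train/val/test edge splits into
# Data/London_high/.
prepare_dataset()

# Load the prepared files into tensors, scalers and mini-batch loaders.
ds = UrbanPlanningDataset(data_base_path="Data/London_high/",
                          num_bins=4,
                          batch_size=256,
                          n_quantiles=5000,
                          resample=True)
ds.to(torch.device("cpu"))  # or a CUDA device if available

# Each training batch yields the edge's node indices, its scaled features,
# its scaled flow label and its flow-magnitude bin.
for edge_indices, edge_feats, edge_labels, edge_buckets in ds.train_loader:
    print(edge_indices.shape, edge_feats.shape,
          edge_labels.shape, edge_buckets.shape)
    break
```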