├── README.md
├── LICENSE
├── main.py
├── .gitignore
├── metrics.py
├── utils.py
├── fc_regression_baseline.py
├── training_environment.py
├── regression_model.py
├── layers.py
├── dataset_preparation.py
└── dataset.py

/README.md:
--------------------------------------------------------------------------------
1 | # Mobility-Flows-Neural-Networks
2 | 
3 | Code for the paper 'Learning Mobility Flows from Urban Features with Spatial Interaction Models and Neural Networks' presented at the 6th IEEE International Conference on Smart Computing (SMARTCOMP 2020).
4 | 
5 | Preprint available [here](https://arxiv.org/abs/2004.11924).
6 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Felix Opolka
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from regression_model import run_training 2 | from training_environment import TrainingEnvironment, NodeConvGraph, NodeConvType, JKType 3 | 4 | if __name__ == '__main__': 5 | hyperparameters = { 6 | # data set 7 | "data_base_path": "Data/London_high", 8 | "resampling": True, 9 | "n_quantiles": 5000, 10 | "num_bins": 4, 11 | "excluded_node_feature_columns": tuple(), 12 | "excluded_edge_feature_columns": tuple(), 13 | # model 14 | "hidden_dim": 16, 15 | "edge_rep_size": 8, 16 | "num_edge_rep_layers": 2, 17 | "include_node_reps": True, 18 | "node_rep_size": 8, 19 | "num_node_rep_layers": 1, 20 | "improved_gcn": True, 21 | "jk_type": JKType.NoJK, 22 | "node_conv_type": NodeConvType.GraphConvolution, 23 | "adj_flow_threshold": 0, 24 | "dna_heads": 1, 25 | "dna_groups": 1, 26 | "include_edge_flow_feat": False, 27 | "drop_prob": 0.3, 28 | "weighted_loss": False, 29 | "regression_loss": "L2", 30 | # training 31 | "cp_folder": "./checkpoints/test", 32 | "lr": 0.01, 33 | "lr_schedule": (50, 65, 80, 95), 34 | "num_epochs": 110, 35 | "batch_size": 256, 36 | } 37 | TrainingEnvironment.hyperparameter_search(hyperparameters, 3, run_training) 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_mae(predictions, labels, data): 5 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 6 | mae = mae_metric(preds_unscaled, y_unscaled) 7 | return mae 8 | 9 | 10 | def compute_mape(predictions, labels, data): 11 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 12 | mape = mape_metric(preds_unscaled, y_unscaled) 13 | return mape 14 | 15 | 16 | def compute_ssi(predictions, labels, data): 17 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 18 | return ssi_metric(preds_unscaled, y_unscaled) 19 | 20 | 21 | def compute_geh(predictions, labels, data): 22 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 23 | return geh_metric(preds_unscaled, y_unscaled) 24 | 25 | 26 | def compute_cpl(predictions, labels, data): 27 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 28 | return cpl_metric(preds_unscaled, y_unscaled) 29 | 30 | 31 | def compute_cpc(predictions, labels, data): 32 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 33 | return cpc_metric(preds_unscaled, y_unscaled) 34 | 35 | 36 | def compute_binned_metric(metric_f, predictions, labels, bins, data, num_bins): 37 | bins = np.concatenate(bins, axis=0).reshape(-1) 38 | preds_unscaled, y_unscaled = _unscale(predictions, labels, data) 39 | binned_metric = _compute_binned_metric(preds_unscaled, y_unscaled, 40 | bins, num_bins, metric_f) 41 | return binned_metric 42 | 43 | 44 | def compute_macro_metric(metric_f, predictions, labels, bins, data, num_bins): 45 | binned_metric = compute_binned_metric(metric_f, predictions, labels, bins, 46 | data, num_bins) 47 | macro_metric = (np.nanmean(binned_metric) 48 | if not np.all(np.isnan(binned_metric)) 49 | else np.nan) 50 | return macro_metric 51 | 52 | 53 | def _unscale(preds, y, data): 54 | preds_unscaled = np.concatenate(preds, axis=0).reshape(-1, 1) 55 | preds_unscaled = data.label_scaler.inverse_transform(preds_unscaled) 56 | preds_unscaled = preds_unscaled.reshape(-1) 57 | y_unscaled = np.concatenate(y, axis=0).reshape(-1, 1) 58 | y_unscaled = data.label_scaler.inverse_transform(y_unscaled) 59 | y_unscaled = y_unscaled.reshape(-1) 60 | return preds_unscaled, y_unscaled 61 | 62 | 63 | def mae_metric(preds_unscaled, y_unscaled): 64 | mae = np.absolute(preds_unscaled.reshape(-1) - y_unscaled.reshape(-1)) 65 | mae = np.mean(mae) 66 | return mae 67 | 68 | 69 | def mape_metric(preds_unscaled, y_unscaled): 70 | non_zero_target_idcs = y_unscaled > 1e-5 71 | if np.sum(non_zero_target_idcs) == 0: 72 | return np.nan 73 | non_zero_targets = y_unscaled[non_zero_target_idcs] 74 | predicted = preds_unscaled[non_zero_target_idcs] 75 | mape = np.absolute(predicted - non_zero_targets) / non_zero_targets 76 | mape = 
np.mean(mape, axis=0) 77 | return mape 78 | 79 | 80 | def ssi_metric(preds_unscaled, y_unscaled): 81 | preds_unscaled = preds_unscaled[y_unscaled > 0] 82 | y_unscaled = y_unscaled[y_unscaled > 0] 83 | ssi = (np.sum(2 * np.minimum(preds_unscaled, y_unscaled) 84 | / (preds_unscaled + y_unscaled)) 85 | / len(y_unscaled)) 86 | return ssi 87 | 88 | 89 | def geh_metric(preds_unscaled, y_unscaled): 90 | geh = np.sqrt(2 * (preds_unscaled - y_unscaled)**2 91 | / (preds_unscaled + y_unscaled)) 92 | geh_percentage = len(geh[geh < 5]) / len(geh) 93 | return geh_percentage 94 | 95 | 96 | def cpl_metric(preds_unscaled, y_unscaled): 97 | cpl = (2 * np.sum(preds_unscaled * y_unscaled > 1e-8) 98 | / (np.sum(preds_unscaled > 1e-8) + np.sum(y_unscaled > 1e-8))) 99 | return cpl 100 | 101 | 102 | def cpc_metric(preds_unscaled, y_unscaled): 103 | cpc = (np.sum(2 * np.minimum(preds_unscaled, y_unscaled)) 104 | / (np.sum(preds_unscaled) + np.sum(y_unscaled))) 105 | return cpc 106 | 107 | 108 | def _compute_binned_metric(out, y, bins, num_bins, metric_f): 109 | """ 110 | Computes the given metric for each bin individually. 111 | :param out: NumPy array containing model predictions. 112 | :param y: NumPy array containing labels. 113 | :param bins: NumPy array containing the bins that each label belongs to. 114 | :param num_bins: Total number of bins. 115 | :param metric_f: Function which receives the model predictions and labels 116 | as arguments (in that order) and returns a scalar metric value. 117 | :return: NumPy array of shape [num_bins] containing the metric value for 118 | each bin. 119 | """ 120 | metric_vals = [] 121 | for bin_idx in range(num_bins): 122 | mask = bins == bin_idx 123 | if np.sum(mask) > 0: 124 | vals = metric_f(out[mask], y[mask]) 125 | metric_vals.append(vals) 126 | else: 127 | metric_vals.append(np.nan) 128 | return np.array(metric_vals) 129 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | import torch.sparse as sp 5 | import scipy.sparse as ssp 6 | 7 | 8 | def split_bucketed_data(bin_idcs): 9 | """ 10 | Splits a given set of samples into the specified number of buckets of 11 | equal size. Samples are assigned to buckets based on their label. Each 12 | bucket is split into train, validation, and test set and the overall 13 | training, validation, and test sets are the concatenation of the individual 14 | bucket subsets. 15 | This ensures that train, validation, and test set all contain the same 16 | number of samples of all sizes. 17 | :param bin_idcs: Specifies for each label the bucket it belongs to 18 | :return: Arrays specifying the indices of samples belonging to the 19 | training, validation, and test set respectively. 
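    Illustrative example (hypothetical input; the exact index assignment
    depends on the random shuffle):

        bin_idcs = torch.tensor([0] * 10 + [1] * 10)
        train_idcs, val_idcs, test_idcs = split_bucketed_data(bin_idcs)
        # each bucket of 10 samples contributes 7 indices to train_idcs,
        # 1 to val_idcs and 2 to test_idcs (a 70/10/20 split per bucket)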
20 | """ 21 | all_train_idcs = [] 22 | all_val_idcs = [] 23 | all_test_idcs = [] 24 | num_bins = torch.max(bin_idcs) + 1 25 | for idx in range(num_bins): 26 | bucket_samples, = np.where(bin_idcs == idx) 27 | np.random.shuffle(bucket_samples) 28 | split1 = int(0.7 * len(bucket_samples)) 29 | split2 = int(0.8 * len(bucket_samples)) 30 | train_idcs = bucket_samples[:split1] 31 | val_idcs = bucket_samples[split1:split2] 32 | test_idcs = bucket_samples[split2:] 33 | all_train_idcs.append(train_idcs) 34 | all_val_idcs.append(val_idcs) 35 | all_test_idcs.append(test_idcs) 36 | return (np.concatenate(all_train_idcs), np.concatenate(all_val_idcs), 37 | np.concatenate(all_test_idcs)) 38 | 39 | 40 | def bin_data(labels, num_buckets, scale="linear", base=10, bin_bounds=None): 41 | """ 42 | Splits the data into specified number of buckets of equal size. Returns for 43 | each sample the index of the the bucket it belongs to. 44 | :param labels: Unscaled labels used for computing bucket boundaries and 45 | assigning samples to buckets. 46 | :param num_buckets: 47 | :param scale: Whether to use separate the label domain into buckets on a 48 | linear or logarithmic scale. Hence the two options are either "linear" or 49 | "logarithmic". 50 | :param base: Only relevant if scale="logarithmic". Specifies the base of 51 | the logarithm. 52 | :param bin_bounds: Only relevant if scale="custom". 53 | :return: Array of the same length as labels specifying for each sample 54 | which bucket it belongs to. 55 | """ 56 | max_label = np.max(labels) 57 | if scale == "logarithmic": 58 | bin_bounds = [] 59 | base_size = max_label / (base**(num_buckets-1)) 60 | for bin_idx in range(num_buckets): 61 | bin_bounds.append(base**bin_idx * base_size) 62 | bin_bounds[-1] = bin_bounds[-1] + 1.0 63 | elif scale == "linear": 64 | bin_size = int(math.ceil(float(max_label) / float(num_buckets))) 65 | bin_bounds = [bin_size * idx for idx in range(1, num_buckets+1)] 66 | elif scale == "custom" and bin_bounds != None: 67 | if len(bin_bounds) != num_buckets: 68 | raise ValueError(f"Error: Specified number of bins {num_buckets} " 69 | f"does not match specified bin_bounds " 70 | f"(length {len(bin_bounds)})") 71 | else: 72 | raise ValueError(f"Unknown scale type {scale}") 73 | print(f"\tBin bounds: {bin_bounds}") 74 | bin_idcs = np.digitize(labels, bin_bounds) 75 | return bin_idcs 76 | 77 | 78 | def to_sparse_tensor(mat): 79 | """ 80 | Converts a SciPy sparse matrix into a torch sparse tensor. 81 | """ 82 | if isinstance(mat, ssp.csr_matrix) or isinstance(mat, ssp.csc_matrix): 83 | mat = mat.tocoo() 84 | data = mat.data 85 | indices = np.concatenate((mat.row.reshape(1, -1), mat.col.reshape(1, -1)), 86 | axis=0) 87 | sparse_mat = sp.FloatTensor(torch.LongTensor(indices), 88 | torch.FloatTensor(data), 89 | torch.Size(mat.shape)) 90 | return sparse_mat 91 | 92 | 93 | def normalize(mx): 94 | """ 95 | Row-normalize sparse matrix. Adapted from 96 | https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py. 97 | """ 98 | rowsum = np.array(mx.sum(1)) 99 | r_inv = np.power(rowsum, -1).flatten() 100 | r_inv[np.isinf(r_inv)] = 0. 
101 | r_mat_inv = ssp.diags(r_inv) 102 | mx = r_mat_inv.dot(mx) 103 | return mx 104 | 105 | 106 | def summarize_tensor(x, title=""): 107 | with torch.no_grad(): 108 | print("-"*10, title, "-"*10, sep="") 109 | shape = x.shape 110 | print(f"Shape: {shape}") 111 | 112 | nans = torch.sum(torch.isnan(x)) 113 | print(f"NaNs: {nans}") 114 | 115 | nnz = torch.sum(x < 1e-8) 116 | print(f"NNZ: {nnz}") 117 | 118 | mean = torch.mean(x) 119 | print(f"Mean: {mean}") 120 | std = torch.std(x) 121 | print(f"Std: {std}") 122 | median = torch.median(x) 123 | print(f"Median: {median}") 124 | 125 | min = torch.min(x) 126 | print(f"Min: {min}") 127 | max = torch.max(x) 128 | print(f"Max: {max}") 129 | print("-"*(20+len(title))) 130 | 131 | 132 | def summarize_feature_matrix(features): 133 | for col_idx in range(features.shape[1]): 134 | values = features[:, col_idx] 135 | mean = np.mean(values) 136 | std = np.std(values) 137 | min_val = np.min(values) 138 | max_val = np.max(values) 139 | num_values = len(np.unique(values)) 140 | is_integer = np.sum(np.ceil(values) - values) <= 1e-10 141 | print("column index:", col_idx) 142 | print(f"statistics: {mean:.3f} +/- {std:.3f}") 143 | print(f"min, max: [{min_val:.3f}, {max_val:.3f}]") 144 | print(f"num unique values: {num_values}") 145 | print(f"data type:", "integer" if is_integer else "float") 146 | print() -------------------------------------------------------------------------------- /fc_regression_baseline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | from dataset import UrbanPlanningDataset 9 | from metrics import compute_mae, compute_mape, \ 10 | compute_ssi, compute_geh, compute_cpl, \ 11 | compute_cpc, compute_binned_metric, compute_macro_metric, mae_metric, \ 12 | mape_metric, ssi_metric, geh_metric, cpl_metric, cpc_metric 13 | from training_environment import TrainingSettings as ts, PerformanceLogger, \ 14 | OutputLogger 15 | from training_environment import checkpoint_filepath 16 | from regression_model import validate_epoch, train_epoch 17 | 18 | 19 | parser = argparse.ArgumentParser(description='UP') 20 | parser.add_argument('--enable-cuda', action='store_true', 21 | help='Enable CUDA') 22 | args = parser.parse_args() 23 | args.device = None 24 | if args.enable_cuda and torch.cuda.is_available(): 25 | args.device = torch.device('cuda') 26 | else: 27 | args.device = torch.device('cpu') 28 | 29 | 30 | class FCEdgeRegressor(nn.Module): 31 | 32 | def __init__(self, num_node_features, num_edge_features, hidden_dim): 33 | super(FCEdgeRegressor, self).__init__() 34 | 35 | self.core = nn.Sequential( 36 | nn.Linear(num_edge_features + 2 * num_node_features, 37 | hidden_dim), 38 | nn.ReLU(), 39 | nn.BatchNorm1d(hidden_dim), 40 | nn.Dropout(p=ts.drop_prob), 41 | nn.Linear(hidden_dim, hidden_dim), 42 | nn.ReLU(), 43 | nn.BatchNorm1d(hidden_dim), 44 | nn.Dropout(p=ts.drop_prob), 45 | nn.Linear(hidden_dim, hidden_dim), 46 | nn.ReLU(), 47 | nn.BatchNorm1d(hidden_dim), 48 | nn.Dropout(p=ts.drop_prob), 49 | nn.Linear(hidden_dim, 1), 50 | ) 51 | 52 | def forward(self, x_nodes, x_edges_batch, edge_indices_batch, edge_indices, 53 | edge_weight=None): 54 | """ 55 | :param x_nodes: Node features of shape [N, D] 56 | :param x_edges_batch: Edge features of shape [B, K] 57 | :param edge_indices_batch: Matrix of shape [B, 2] indicating the 58 | indices of the nodes connected by each edge. 
59 | :param edge_indices: Matrix of shape [2, E] indicating for each edge 60 | in the graph the two node IDs it connects. 61 | :return: Predictions for edges with shape [B, 1] 62 | """ 63 | 64 | x_nodes_left = x_nodes[edge_indices_batch[:, 0]] 65 | x_nodes_right = x_nodes[edge_indices_batch[:, 1]] 66 | x_concat = torch.cat([x_nodes_left, x_edges_batch, x_nodes_right], dim=-1) 67 | 68 | out = self.core(x_concat) 69 | 70 | return out.squeeze(-1) 71 | 72 | 73 | def run_training(): 74 | # Set up training environment 75 | if not os.path.exists(ts.cp_folder): 76 | os.makedirs(ts.cp_folder) 77 | log_filepath = checkpoint_filepath(ts.cp_folder, "log", __file__, {}, 78 | ".pk") 79 | summary_filepath = checkpoint_filepath(ts.cp_folder, "summary", __file__, 80 | {}, ".txt") 81 | output_logger = OutputLogger(checkpoint_filepath(ts.cp_folder, "output", 82 | __file__, {}, ".txt")) 83 | sys.stdout = output_logger 84 | ts.write_summary_file(checkpoint_filepath(ts.cp_folder, "hyperparams", 85 | __file__, {}, "txt")) 86 | print(ts.settings_description()) 87 | 88 | # Load data 89 | ds = UrbanPlanningDataset(ts.data_base_path, ts.num_bins, ts.batch_size, 90 | ts.n_quantiles, ts.resampling, 91 | ts.excluded_node_feature_columns, 92 | ts.excluded_edge_feature_columns, False, 93 | ts.include_edge_flow_feat, ts.adj_flow_threshold, 94 | ts.seed) 95 | # Preprocess data 96 | ds.to(args.device) 97 | 98 | def _get_metric_funcs(prefix): 99 | preds_key = prefix+"_predictions" 100 | labels_key = prefix+"_labels" 101 | bins_key = prefix+"_bins" 102 | return { 103 | prefix+"_loss": (lambda m: np.nanmean(m[prefix+"_loss"])), 104 | prefix + "_mae": (lambda m: compute_mae(m[preds_key], m[labels_key], ds)), 105 | prefix + "_binned_mae": (lambda m: compute_binned_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 106 | prefix + "_macro_mae": (lambda m: compute_macro_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 107 | prefix + "_mape": (lambda m: compute_mape(m[preds_key], m[labels_key], ds)), 108 | prefix + "_binned_mape": (lambda m: compute_binned_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 109 | prefix + "_macro_mape": (lambda m: compute_macro_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 110 | prefix + "_ssi": (lambda m: compute_ssi(m[preds_key], m[labels_key], ds)), 111 | prefix + "_binned_ssi": (lambda m: compute_binned_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 112 | prefix + "_macro_ssi": (lambda m: compute_macro_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 113 | prefix + "_geh": (lambda m: compute_geh(m[preds_key], m[labels_key], ds)), 114 | prefix + "_binned_geh": (lambda m: compute_binned_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 115 | prefix + "_macro_geh": (lambda m: compute_macro_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 116 | prefix + "_cpl": (lambda m: compute_cpl(m[preds_key], m[labels_key], ds)), 117 | prefix + "_binned_cpl": (lambda m: compute_binned_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 118 | prefix + "_macro_cpl": (lambda m: compute_macro_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 119 | prefix + "_cpc": (lambda m: compute_cpc(m[preds_key], m[labels_key], ds)), 120 | prefix + "_binned_cpc": (lambda m: compute_binned_metric(cpc_metric, m[preds_key], 
m[labels_key], m[bins_key], ds, ts.num_bins)), 121 | prefix + "_macro_cpc": (lambda m: compute_macro_metric(cpc_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 122 | } 123 | metric_funcs = { 124 | "train_loss": (lambda m: np.nanmean(m["train_loss"])), 125 | **_get_metric_funcs("val"), 126 | **_get_metric_funcs("test"), 127 | } 128 | 129 | logger = PerformanceLogger(metric_funcs, "val_macro_mae", log_filepath, 130 | write_every=ts.write_log_every) 131 | 132 | predictor = FCEdgeRegressor(ds.num_node_feats, ds.num_edge_feats, 133 | hidden_dim=ts.hidden_dim) 134 | predictor = predictor.to(device=args.device) 135 | 136 | optimizer = torch.optim.Adam(predictor.parameters(), lr=ts.lr) 137 | lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, 138 | list(ts.lr_schedule)) 139 | loss_criterion = (nn.L1Loss() if ts.regression_loss == "L1" 140 | else nn.MSELoss()) 141 | 142 | print("Start training") 143 | for epoch in range(-1, ts.num_epochs): 144 | if epoch >= 0: 145 | train_epoch(epoch, predictor, ds, optimizer, loss_criterion, 146 | logger, lr_schedule) 147 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.val_loader, 148 | logger, test=False) 149 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.test_loader, 150 | logger, test=True) 151 | 152 | logger.complete_epoch() 153 | print(logger.epoch_summary()) 154 | if epoch % ts.write_log_every == 0: 155 | logger.write(log_filepath) 156 | logger.write(log_filepath) 157 | logger.write_summary(summary_filepath, ts.settings_description()) 158 | return logger 159 | 160 | 161 | if __name__ == '__main__': 162 | run_training() 163 | -------------------------------------------------------------------------------- /training_environment.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | import os 7 | import pickle as pk 8 | from collections import defaultdict 9 | import enum 10 | from pathlib import Path 11 | 12 | 13 | class NodeConvGraph(enum.Enum): 14 | Geo = 1 15 | UnweightedFlow = 2 16 | 17 | 18 | class NodeConvType(enum.Enum): 19 | GraphConvolution = 1 20 | GraphAttention = 2 21 | GraphNodeEdgeConvolution = 3 22 | DNAConvolution = 4 23 | 24 | 25 | class JKType(enum.Enum): 26 | NoJK = "" 27 | Concat = "cat" 28 | MaxPool = "max" 29 | LSTM = "lstm" 30 | 31 | 32 | class TrainingSettings: 33 | ### data set ### 34 | data_base_path = "Data/London_high" 35 | resampling = True 36 | n_quantiles = 1000 37 | num_bins = 4 38 | excluded_node_feature_columns = tuple() 39 | excluded_edge_feature_columns = tuple() 40 | 41 | ### model ### 42 | hidden_dim = 16 # Dimensionality of any intermediate layers 43 | edge_rep_size = 16 # Hidden size of the target edge representation 44 | num_edge_rep_layers = 2 # Number of linear layers for computing the target edge representation 45 | include_node_reps = True # Wheather to include node representations at all 46 | node_rep_size = 16 # Hidden size of the node feature representations 47 | num_node_rep_layers = 1 # Number of GNN layers for computing node representations 48 | improved_gcn = False # Whether to use improved GCN convolutions (i.e. 
2 on the adj-matrix diagonal) 49 | jk_type = JKType.NoJK # Whether to use JumpingKnowledge skip connections at all and if yet, which type 50 | node_conv_type = NodeConvType.GraphConvolution 51 | adj_flow_threshold = 0 # When computing node convolutions based on the flow adjancency matrix, only include edges with a flow greater or equal this threshold 52 | dna_heads = 1 # Number of attention heads to be used for DNA convolutions 53 | dna_groups = 1 # Number of channel groups to be used for DNA convolutions 54 | include_edge_flow_feat = False 55 | drop_prob = 0.5 56 | weighted_loss = False 57 | regression_loss = "L1" # other option: "L2" 58 | 59 | ### training ### 60 | cp_folder = "./checkpoints/" 61 | starting_seed = 7 62 | seed = 7 63 | lr = 0.001 64 | lr_schedule = (50, 65, 80) 65 | num_epochs = 100 66 | write_log_every = 20 67 | batch_size = 64 68 | 69 | if weighted_loss and resampling: 70 | raise ValueError("Weighted loss and resampling both set to True") 71 | 72 | @staticmethod 73 | def update_setting(**settings): 74 | for key, value in settings.items(): 75 | if not hasattr(TrainingSettings, key): 76 | raise ValueError(f"Attribute {key} not a valid hyperparameter.") 77 | setattr(TrainingSettings, key, value) 78 | if TrainingSettings.weighted_loss and TrainingSettings.resampling: 79 | raise ValueError("Weighted loss and resampling both set to True") 80 | 81 | @staticmethod 82 | def settigns_dict(): 83 | return {attr: getattr(TrainingSettings, attr) 84 | for attr in dir(TrainingSettings) 85 | if (not attr.startswith("__") 86 | and not callable(getattr(TrainingSettings, 87 | attr)))} 88 | 89 | @staticmethod 90 | def settings_description(): 91 | settings_dict = TrainingSettings.settigns_dict() 92 | return parameter_description_from_dict(settings_dict) 93 | 94 | @staticmethod 95 | def write_summary_file(filepath): 96 | filepath = (Path(filepath) if not isinstance(filepath, Path) 97 | else filepath) 98 | settings_dict = TrainingSettings.settigns_dict() 99 | with filepath.with_suffix(".pk").open("wb") as fd: 100 | pk.dump(settings_dict, fd) 101 | settings_description = TrainingSettings.settings_description() 102 | with filepath.open("w") as fd: 103 | fd.write(settings_description) 104 | 105 | 106 | class TrainingEnvironment: 107 | 108 | @staticmethod 109 | def _process_settings(hyperparam_settings): 110 | """ 111 | Takes a dictionary of lists/scalars and turns it into a list of 112 | dictionaries of scalars. 113 | """ 114 | max_length = max([len(l) 115 | for l in hyperparam_settings.values() 116 | if type(l) is list]+[1]) 117 | hyperparam_settings = {k: (l if type(l) is list else [l]*max_length) 118 | for k, l in hyperparam_settings.items()} 119 | # Go from dictionary of lists to list of dictionaries 120 | dicts = [dict(zip(hyperparam_settings, x)) 121 | for x in zip(*hyperparam_settings.values())] 122 | return dicts 123 | 124 | @staticmethod 125 | def hyperparameter_search(training_settings, runs, start_experiment_f): 126 | """ 127 | :param training_settings: Dictionary containing hyperparameters and 128 | othr training settings. Keys may only be attributes of the 129 | TrainingSettings class. Values may be scalars or lists. Using lists 130 | allows to specify the search space. All lists must have the same 131 | length. 132 | :param runs: Number of runs to perform for each training setting. 133 | :param start_experiment_f: Method for running an experiment. Must 134 | return a single measure of performance. 
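        Illustrative example (hypothetical values; main.py calls this with
        its full hyperparameter dictionary and run_training from
        regression_model):

            settings = {"lr": [0.01, 0.001], "hidden_dim": 16}
            TrainingEnvironment.hyperparameter_search(settings, 3, run_training)
            # expands into two settings, {"lr": 0.01, "hidden_dim": 16} and
            # {"lr": 0.001, "hidden_dim": 16}, each trained for 3 runs with a
            # different seed per run.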
135 | :return: 136 | """ 137 | dicts = TrainingEnvironment._process_settings(training_settings) 138 | performances = [defaultdict(list) for _ in range(len(dicts))] 139 | # Make runs outer-loop to ensure initial results for all settings are 140 | # obtained asap 141 | for run_idx in range(runs): 142 | for settings_idx, settings_dict in enumerate(dicts): 143 | TrainingSettings.update_setting(**settings_dict) 144 | TrainingSettings.seed = (TrainingSettings.starting_seed 145 | * 3**run_idx) 146 | logger = start_experiment_f() 147 | min_epoch = np.argmin(logger[logger.minimizer]) 148 | performances[settings_idx]["min_epoch"].append(min_epoch) 149 | for key, values in logger.logs_dict.items(): 150 | performances[settings_idx][key].append(values[min_epoch]) 151 | # Write results (so far) to file 152 | TrainingEnvironment.write_summary(dicts, performances) 153 | 154 | @staticmethod 155 | def write_summary(settings_dicts, performances): 156 | """ 157 | :param settings_dicts: 158 | :param performances: 159 | :return: 160 | """ 161 | summary = "" 162 | for settings, performances in zip(settings_dicts, performances): 163 | description = parameter_description_from_dict(settings) 164 | summary += description + "\n\n" 165 | for key in sorted(performances.keys()): 166 | mean = np.mean(performances[key], axis=0) 167 | std = np.std(performances[key], axis=0) 168 | if isinstance(mean, np.ndarray): 169 | summary += f"{key}: {mean} +/- {std}\n" 170 | else: 171 | summary += f"{key}: {mean:.5f} +/- {std:.5f}\n" 172 | if not os.path.exists(TrainingSettings.cp_folder): 173 | os.makedirs(TrainingSettings.cp_folder) 174 | with open(os.path.join(TrainingSettings.cp_folder, "summary.txt"), "w") as fd: 175 | fd.write(summary) 176 | 177 | 178 | class PerformanceLogger: 179 | def __init__(self, metric_funcs, minimizer, log_filepath, write_every=20): 180 | """ 181 | :param metric_funcs: Dictionary of metric functions. A metric function 182 | accepts exactly one argument which is a dictionary of the metrics from 183 | a single epoch. It returns the corresponding metric. 184 | :param minimizer: Key of the variable that determines the final value 185 | stored in the summary. Usually the validation loss. 186 | """ 187 | self._metric_funcs = metric_funcs 188 | self._current_epoch_metrics = defaultdict(list) 189 | self.logs_dict = defaultdict(list) 190 | self.minimizer = minimizer 191 | self.log_filepath = log_filepath 192 | self.write_every = write_every 193 | self._write_countdown = write_every 194 | self._start_time = time.time() 195 | self._current_epoch = 0 196 | 197 | if "duration" in metric_funcs: 198 | raise ValueError("Key \"duration\" is a reserved key for internal" 199 | "use of PerformanceLogger.") 200 | 201 | def __getitem__(self, key): 202 | return self.logs_dict[key] 203 | 204 | def __setitem__(self, key, value): 205 | self.logs_dict[key] = value 206 | 207 | def add_values(self, metric_dict): 208 | for key, metric_batch in metric_dict.items(): 209 | self._current_epoch_metrics[key].append(metric_batch) 210 | # Write log if necessary 211 | self._write_countdown -= 1 212 | if self._write_countdown == 0: 213 | self.write(self.log_filepath) 214 | self._write_countdown = self.write_every 215 | 216 | def complete_epoch(self): 217 | """ 218 | Marks the epoch as finished and computes the epoch's metrics. 
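        Each metric function registered in the constructor receives the
        dictionary of per-batch values accumulated via add_values() during
        the epoch, and its return value is appended to logs_dict under the
        same key. The per-batch buffer is then cleared for the next epoch.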
219 | """ 220 | for key, metric_func in self._metric_funcs.items(): 221 | metric = metric_func(self._current_epoch_metrics) 222 | self.logs_dict[key].append(metric) 223 | # add duration as additional metric 224 | duration = time.time() - self._start_time 225 | self.logs_dict["duration"].append(duration) 226 | self._start_time = time.time() 227 | 228 | self._current_epoch_metrics = defaultdict(list) 229 | self._current_epoch += 1 230 | 231 | def epoch_summary(self): 232 | summary = f"{self._current_epoch}:" 233 | for key, vals in self.logs_dict.items(): 234 | if key == "duration": 235 | continue 236 | if isinstance(vals[-1], np.ndarray): 237 | summary += f"\t{key}: {vals[-1]}\n" 238 | else: 239 | summary += f"\t{key}: {vals[-1]:.5f}\n" 240 | duration = self.logs_dict["duration"][-1] 241 | summary += f"\t[{duration:.2f}s]" 242 | return summary 243 | 244 | def min(self, key): 245 | if key not in self.logs_dict or len(self.logs_dict[key]) == 0: 246 | return 10e8 # Find nicer way 247 | return min(self.logs_dict[key]) 248 | 249 | def write(self, filepath): 250 | with open(filepath, "wb") as fd: 251 | pk.dump(self.logs_dict, fd) 252 | 253 | def write_summary(self, filepath, settings_description=""): 254 | """ 255 | Writes a text file containing a summary of the run. 256 | :param filepath: Filepath of the summary. 257 | """ 258 | min_idx = np.argmin(self.logs_dict[self.minimizer]) 259 | summary = settings_description + "\n\n" 260 | for key, value in self.logs_dict.items(): 261 | min_val = value[min_idx] 262 | summary += f"{key}: {min_val}\n" 263 | with open(filepath, "w") as fd: 264 | fd.write(summary) 265 | 266 | 267 | class OutputLogger: 268 | """ 269 | Overrides normal stdout, i.e. `sys.stdout = OutputLogger(...)`. After that, 270 | print writes all output to stdout *and* the specified log-file. 271 | """ 272 | def __init__(self, log_filepath): 273 | self.terminal = sys.stdout 274 | self.log = open(log_filepath, "a") 275 | 276 | def write(self, message): 277 | self.terminal.write(message) 278 | self.log.write(message) 279 | 280 | def flush(self): 281 | self.terminal.flush() 282 | self.log.flush() 283 | 284 | def close(self): 285 | self.log.flush() 286 | self.log.close() 287 | 288 | 289 | def parameter_description_from_dict(dict): 290 | return dict.__str__()[1:-1].replace("\'", "").replace(": ", "=") 291 | 292 | 293 | def checkpoint_filepath(directory, basename, script_file, parameters, 294 | file_ending, add_timestamp=True): 295 | """ 296 | Automatically creates a checkpoint filepath with the given attributes. 297 | :param directory: Directory in which the file should be stored. 298 | :param basename: Base name of the file, which will be appended by some 299 | extra parameters (see below). 300 | :param script_file: Name of the script as returned by __file__. 301 | :param parameters: Dictionary of additional parameters. 302 | :param file_ending: File ending of the file. 303 | :return: Filepath as a string. 304 | """ 305 | script_name = os.path.basename(script_file) 306 | script_name = script_name[:script_name.find(".")] 307 | 308 | param_descr = parameter_description_from_dict(parameters) 309 | 310 | if file_ending[0] != ".": 311 | file_ending = "." 
+ file_ending 312 | 313 | if add_timestamp: 314 | time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 315 | filename = (script_name + "," + param_descr + "_" 316 | + time_stamp + "_" + basename + file_ending) 317 | else: 318 | filename = script_name + "," + param_descr + "_" + basename + file_ending 319 | 320 | filepath = os.path.join(directory, filename) 321 | return filepath 322 | 323 | 324 | if __name__ == '__main__': 325 | TrainingSettings.update_setting(**{"lr": 0.1}) 326 | -------------------------------------------------------------------------------- /regression_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import argparse 4 | import os 5 | import warnings 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | from collections import defaultdict 10 | import pickle as pk 11 | 12 | from torch.nn import Parameter 13 | 14 | from layers import DNANodeRepModule, ConvNodeRepModule 15 | from metrics import compute_mae, compute_mape, compute_ssi, compute_geh, \ 16 | compute_cpl, compute_cpc, compute_binned_metric, compute_macro_metric, \ 17 | mae_metric, cpc_metric, cpl_metric, geh_metric, ssi_metric, mape_metric 18 | from dataset import UrbanPlanningDataset 19 | from training_environment import TrainingSettings as ts, PerformanceLogger, NodeConvType, \ 20 | JKType 21 | from training_environment import checkpoint_filepath, OutputLogger 22 | 23 | from torch_geometric.nn import JumpingKnowledge 24 | 25 | 26 | parser = argparse.ArgumentParser(description='UP') 27 | parser.add_argument('--enable-cuda', action='store_true', 28 | help='Enable CUDA') 29 | args = parser.parse_args() 30 | args.device = None 31 | if args.enable_cuda and torch.cuda.is_available(): 32 | args.device = torch.device('cuda') 33 | else: 34 | args.device = torch.device('cpu') 35 | 36 | 37 | class EdgeRegressor(nn.Module): 38 | 39 | def __init__(self, num_node_features, num_edge_features, node_rep_size, 40 | hidden_dim): 41 | super(EdgeRegressor, self).__init__() 42 | 43 | # Linear layer to transform target edge features 44 | self.fc_edges = nn.Sequential( 45 | nn.Linear(num_edge_features + 2 * num_node_features, hidden_dim), 46 | nn.ReLU(), 47 | nn.BatchNorm1d(hidden_dim), 48 | nn.Dropout(p=ts.drop_prob), 49 | nn.Linear(hidden_dim, hidden_dim), 50 | ) 51 | concat_hidden_dim = hidden_dim 52 | 53 | if ts.include_node_reps: 54 | if ts.node_conv_type == NodeConvType.GraphConvolution: 55 | self.node_rep_module = ConvNodeRepModule(num_node_features, 56 | node_rep_size, 57 | ts.num_node_rep_layers, 58 | ts.improved_gcn, 59 | ts.drop_prob) 60 | elif ts.node_conv_type == NodeConvType.DNAConvolution: 61 | self.node_rep_module = DNANodeRepModule(num_node_features, 62 | node_rep_size, 63 | ts.num_node_rep_layers, 64 | ts.dna_heads, 65 | ts.dna_groups, 66 | ts.drop_prob) 67 | concat_hidden_dim += 2 * node_rep_size 68 | 69 | if ts.jk_type is not JKType.NoJK: 70 | self.jk = JumpingKnowledge(ts.jk_type.value, channels=8, 71 | num_layers=ts.num_node_rep_layers) 72 | lin_size = node_rep_size 73 | if ts.jk_type is JKType.Concat: 74 | lin_size = ts.num_node_rep_layers*node_rep_size 75 | self.jk_lin = nn.Linear(lin_size, node_rep_size) 76 | 77 | self.node_weight = Parameter(torch.from_numpy(np.array(1.0, dtype=np.float32))) 78 | self.edge_weight = Parameter(torch.from_numpy(np.array(1.0, dtype=np.float32))) 79 | 80 | self.regression_head = nn.Sequential( 81 | nn.ReLU(), 82 | nn.BatchNorm1d(hidden_dim), 83 | nn.Dropout(p=ts.drop_prob), 84 | 
nn.Linear(hidden_dim, hidden_dim), 85 | nn.ReLU(), 86 | nn.BatchNorm1d(hidden_dim), 87 | nn.Dropout(p=ts.drop_prob), 88 | nn.Linear(hidden_dim, 1) 89 | ) 90 | 91 | def forward(self, x_nodes, x_edges_batch, edge_indices_batch, edge_indices, 92 | edge_weight=None): 93 | """ 94 | :param x_nodes: Node features of shape [N, D] 95 | :param x_edges_batch: Edge features of shape [B, K] 96 | :param edge_indices_batch: Matrix of shape [B, 2] indicating the 97 | indices of the nodes connected by each edge. 98 | :param edge_indices: Matrix of shape [2, E] indicating for each edge 99 | in the graph the two node IDs it connects. 100 | :param edge_weight: Vector of shape [E] containing the edge weight for 101 | each edge in the graph. 102 | :return: Predictions for edges with shape [B, 1] 103 | """ 104 | 105 | # Compute hidden representation of target edge 106 | x_nodes_left = x_nodes[edge_indices_batch[:, 0]] 107 | x_nodes_right = x_nodes[edge_indices_batch[:, 1]] 108 | x_concat = torch.cat([x_nodes_left, x_edges_batch, x_nodes_right], dim=-1) 109 | h_edges = self.fc_edges(x_concat) 110 | h_total = self.node_weight * h_edges 111 | 112 | # Compute hidden representations of nodes 113 | if ts.include_node_reps: 114 | intermediate_node_reps = self.node_rep_module(x_nodes, 115 | edge_indices.t(), 116 | edge_weight) 117 | if ts.jk_type is JKType.NoJK: 118 | h_nodes = intermediate_node_reps[-1] 119 | else: 120 | h_nodes = self.jk(intermediate_node_reps) 121 | h_nodes = self.jk_lin(h_nodes) 122 | # Get hidden representations of nodes incident to target edges 123 | h_nodes_left = h_nodes[edge_indices_batch[:, 0]] 124 | h_nodes_right = h_nodes[edge_indices_batch[:, 1]] 125 | h_total += self.edge_weight * h_nodes_left 126 | h_total += self.edge_weight * h_nodes_right 127 | 128 | regression_output = self.regression_head(h_total) 129 | 130 | return regression_output.squeeze(-1) 131 | 132 | 133 | def train_epoch(epoch, predictor, data, optimizer, loss_criterion, logger, 134 | lr_schedule): 135 | predictor.train() 136 | 137 | for (edge_idcs_batch, x_edges_batch, edge_labels_batch, 138 | _) in data.train_loader: 139 | edge_idcs_batch = edge_idcs_batch.to(device=args.device) 140 | x_edges_batch = x_edges_batch.to(device=args.device) 141 | edge_labels_batch = edge_labels_batch.to(device=args.device) 142 | 143 | optimizer.zero_grad() 144 | reg_out = predictor(data.node_feats, x_edges_batch, edge_idcs_batch, 145 | data.flow_topology.edge_indices, 146 | edge_weight=data.flow_topology.edge_weights) 147 | loss = loss_criterion(reg_out, edge_labels_batch) 148 | loss.backward() 149 | optimizer.step() 150 | logger.add_values({"train_loss": loss.item()}) 151 | lr_schedule.step() 152 | 153 | 154 | def validate_epoch(epoch, predictor, data, loss_criterion, data_loader, logger, 155 | test): 156 | predictor.eval() 157 | prefix = "test" if test else "val" 158 | 159 | for (edge_idcs_batch, x_edges_batch, edge_labels_batch, edge_buckets_batch) in data_loader: 160 | edge_idcs_batch = edge_idcs_batch.to(device=args.device) 161 | x_edges_batch = x_edges_batch.to(device=args.device) 162 | edge_labels_batch = edge_labels_batch.to(device=args.device) 163 | 164 | reg_out = predictor(data.node_feats, x_edges_batch, edge_idcs_batch, 165 | data.flow_topology.edge_indices, 166 | edge_weight=data.flow_topology.edge_weights) 167 | loss = loss_criterion(reg_out, edge_labels_batch) 168 | logger.add_values({ 169 | prefix + "_loss": loss.item(), 170 | prefix + "_predictions": reg_out.detach().cpu().numpy(), 171 | prefix + "_labels": 
edge_labels_batch.detach().cpu().numpy(), 172 | prefix + "_bins": edge_buckets_batch.detach().cpu().numpy() 173 | }) 174 | if test: 175 | with open("preds_labels.pk", "wb") as fd: 176 | preds = data.label_scaler.inverse_transform(np.concatenate(logger._current_epoch_metrics["test_predictions"], axis=-1).reshape(-1, 1)) 177 | labels = data.label_scaler.inverse_transform(np.concatenate(logger._current_epoch_metrics["test_labels"], axis=-1).reshape(-1, 1)) 178 | pk.dump((preds, labels, logger._current_epoch_metrics["test_node_idcs"]), fd) 179 | 180 | 181 | def run_training(): 182 | # Set up training environment 183 | if not os.path.exists(ts.cp_folder): 184 | os.makedirs(ts.cp_folder) 185 | log_filepath = checkpoint_filepath(ts.cp_folder, "log", __file__, {}, 186 | ".pk") 187 | summary_filepath = checkpoint_filepath(ts.cp_folder, "summary", __file__, 188 | {}, ".txt") 189 | output_logger = OutputLogger(checkpoint_filepath(ts.cp_folder, "output", 190 | __file__, {}, ".txt")) 191 | sys.stdout = output_logger 192 | ts.write_summary_file(checkpoint_filepath(ts.cp_folder, "hyperparams", 193 | __file__, {}, "txt")) 194 | print(ts.settings_description()) 195 | 196 | # Load data 197 | ds = UrbanPlanningDataset(ts.data_base_path, ts.num_bins, ts.batch_size, 198 | ts.n_quantiles, ts.resampling, 199 | ts.excluded_node_feature_columns, 200 | ts.excluded_edge_feature_columns, False, 201 | ts.include_edge_flow_feat, ts.adj_flow_threshold, 202 | ts.seed) 203 | # Preprocess data 204 | ds.to(args.device) 205 | 206 | def _get_metric_funcs(prefix): 207 | preds_key = prefix+"_predictions" 208 | labels_key = prefix+"_labels" 209 | bins_key = prefix+"_bins" 210 | return { 211 | prefix+"_loss": (lambda m: np.nanmean(m[prefix+"_loss"])), 212 | prefix + "_mae": (lambda m: compute_mae(m[preds_key], m[labels_key], ds)), 213 | prefix + "_binned_mae": (lambda m: compute_binned_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 214 | prefix + "_macro_mae": (lambda m: compute_macro_metric(mae_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 215 | prefix + "_mape": (lambda m: compute_mape(m[preds_key], m[labels_key], ds)), 216 | prefix + "_binned_mape": (lambda m: compute_binned_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 217 | prefix + "_macro_mape": (lambda m: compute_macro_metric(mape_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 218 | prefix + "_ssi": (lambda m: compute_ssi(m[preds_key], m[labels_key], ds)), 219 | prefix + "_binned_ssi": (lambda m: compute_binned_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 220 | prefix + "_macro_ssi": (lambda m: compute_macro_metric(ssi_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 221 | prefix + "_geh": (lambda m: compute_geh(m[preds_key], m[labels_key], ds)), 222 | prefix + "_binned_geh": (lambda m: compute_binned_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 223 | prefix + "_macro_geh": (lambda m: compute_macro_metric(geh_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 224 | prefix + "_cpl": (lambda m: compute_cpl(m[preds_key], m[labels_key], ds)), 225 | prefix + "_binned_cpl": (lambda m: compute_binned_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 226 | prefix + "_macro_cpl": (lambda m: compute_macro_metric(cpl_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 227 | prefix + "_cpc": (lambda m: 
compute_cpc(m[preds_key], m[labels_key], ds)), 228 | prefix + "_binned_cpc": (lambda m: compute_binned_metric(cpc_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 229 | prefix + "_macro_cpc": (lambda m: compute_macro_metric(cpc_metric, m[preds_key], m[labels_key], m[bins_key], ds, ts.num_bins)), 230 | } 231 | metric_funcs = { 232 | "train_loss": (lambda m: np.nanmean(m["train_loss"])), 233 | **_get_metric_funcs("val"), 234 | **_get_metric_funcs("test"), 235 | } 236 | logger = PerformanceLogger(metric_funcs, "val_macro_mae", log_filepath, 237 | write_every=ts.write_log_every) 238 | 239 | predictor = EdgeRegressor(ds.num_node_feats, ds.num_edge_feats, 240 | hidden_dim=ts.hidden_dim, 241 | node_rep_size=ts.node_rep_size) 242 | predictor = predictor.to(device=args.device) 243 | 244 | optimizer = torch.optim.Adam(predictor.parameters(), lr=ts.lr) 245 | lr_schedule = torch.optim.lr_scheduler.MultiStepLR(optimizer, 246 | list(ts.lr_schedule)) 247 | loss_criterion = (nn.L1Loss() if ts.regression_loss == "L1" 248 | else nn.MSELoss()) 249 | 250 | print("Start training") 251 | for epoch in range(-1, ts.num_epochs): 252 | if epoch >= 0: 253 | train_epoch(epoch, predictor, ds, optimizer, loss_criterion, 254 | logger, lr_schedule) 255 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.val_loader, 256 | logger, test=False) 257 | validate_epoch(epoch, predictor, ds, loss_criterion, ds.test_loader, 258 | logger, test=True) 259 | 260 | logger.complete_epoch() 261 | print(logger.epoch_summary()) 262 | if epoch % ts.write_log_every == 0: 263 | logger.write(log_filepath) 264 | logger.write(log_filepath) 265 | logger.write_summary(summary_filepath, ts.settings_description()) 266 | return logger 267 | 268 | 269 | if __name__ == '__main__': 270 | run_training() 271 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.sparse as sp 6 | from torch.nn.parameter import Parameter 7 | from torch.nn.modules.module import Module 8 | from torch_geometric.nn import GCNConv, DNAConv 9 | 10 | 11 | def tensor_normalize(matrix): 12 | row_sum = sp.sum(matrix, dim=1).to_dense() 13 | # if torch.any(row_sum == 0.0): 14 | # raise ValueError("Matrix contains rows with sum 0.") 15 | r_inv = row_sum.pow(-1).flatten() 16 | r_inv[torch.isinf(r_inv)] = 0.0 17 | norm_matrix = torch.matmul(torch.diag(r_inv), matrix.to_dense()) 18 | return norm_matrix 19 | 20 | 21 | class ConvNodeRepModule(Module): 22 | def __init__(self, in_dim, hidden_dim, num_layers, improved_gcn, 23 | drop_prob): 24 | super(ConvNodeRepModule, self).__init__() 25 | self.conv_layers = [] 26 | for idx in range(num_layers): 27 | cur_in_dim = in_dim if idx == 0 else hidden_dim 28 | cur_layer = NormalizedRegularizedGCNLayer(cur_in_dim, hidden_dim, 29 | improved_gcn, drop_prob) 30 | self.conv_layers.append(cur_layer) 31 | self.conv_layers = nn.ModuleList(self.conv_layers) 32 | 33 | def forward(self, node_features, edge_indices, edge_weight=None): 34 | h = node_features 35 | intermediate_reps = [] 36 | for layer in self.conv_layers: 37 | h = layer(h, edge_indices, edge_weight) 38 | intermediate_reps.append(h) 39 | return intermediate_reps 40 | 41 | 42 | class NormalizedRegularizedGCNLayer(Module): 43 | def __init__(self, in_dim, out_dim, improved_gcn, drop_prob): 44 | super(NormalizedRegularizedGCNLayer, self).__init__() 
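        # One graph-convolution block: a GCNConv followed by ReLU, batch
        # normalisation and dropout (applied in this order in forward()).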
45 | self.gcn = GCNConv(in_dim, out_dim, improved_gcn) 46 | self.bn = nn.BatchNorm1d(out_dim) 47 | self.drop = nn.Dropout(drop_prob) 48 | 49 | def forward(self, node_features, edge_indices, edge_weight=None): 50 | h = self.gcn(node_features, edge_indices, edge_weight) 51 | h = F.relu(h) 52 | h = self.bn(h) 53 | h = self.drop(h) 54 | return h 55 | 56 | 57 | class DNANodeRepModule(Module): 58 | """ 59 | Applies a given number of DNA convolutions on the given data. Returns a 60 | list of all the intermediate representations after each layer. 61 | """ 62 | 63 | def __init__(self, in_dim, hidden_dim, num_layers, dna_heads, dna_groups, 64 | drop_prob): 65 | super(DNANodeRepModule, self).__init__() 66 | self.hidden_dim = hidden_dim 67 | self.pre_lin = nn.Linear(in_dim, hidden_dim) 68 | self.pre_drop = nn.Dropout(drop_prob) 69 | self.conv_layers = [] 70 | for _ in range(num_layers): 71 | cur_layer = NormalizedRegularizedDNALayer(hidden_dim, dna_heads, 72 | dna_groups, drop_prob) 73 | self.conv_layers.append(cur_layer) 74 | self.conv_layers = nn.ModuleList(self.conv_layers) 75 | 76 | def forward(self, input, edge_indices): 77 | h = F.relu(self.pre_lin(input)) 78 | h = self.pre_drop(h) 79 | h = h.view(-1, 1, self.hidden_dim) 80 | intermediate_reps = [] 81 | for conv in self.conv_layers: 82 | h_new = conv(h, edge_indices) 83 | intermediate_reps.append(h_new) 84 | h_new = h_new.view(-1, 1, self.hidden_dim) 85 | h = torch.cat([h, h_new], dim=1) 86 | return intermediate_reps 87 | 88 | 89 | class NormalizedRegularizedDNALayer(Module): 90 | def __init__(self, channels, heads, groups, dropout): 91 | super(NormalizedRegularizedDNALayer, self).__init__() 92 | self.dna = DNAConv(channels, heads, groups, dropout) 93 | self.bn = nn.BatchNorm1d(channels) 94 | 95 | def forward(self, all_node_features, edge_indices): 96 | h = self.dna(all_node_features, edge_indices) 97 | h = F.relu(h) 98 | h = self.bn(h) 99 | return h 100 | 101 | 102 | class GraphConvolution(Module): 103 | """ 104 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 105 | Adapted from https://github.com/tkipf/pygcn/blob/master/pygcn/layers.py 106 | """ 107 | 108 | def __init__(self, in_features, out_features, adj_matrix, edge_feat_matrix, 109 | attention_scores=None, bias=True): 110 | """ 111 | :param in_features: 112 | :param out_features: 113 | :param bias: 114 | :param attention_scores: Sparse tensor containing for each pair of 115 | nodes the attention score between the nodes. Shape [N, N]. 116 | :param adj_matrix: 117 | """ 118 | super(GraphConvolution, self).__init__() 119 | self.in_features = in_features 120 | self.out_features = out_features 121 | self.weight = Parameter(torch.FloatTensor(in_features, out_features)) 122 | if bias: 123 | self.bias = Parameter(torch.FloatTensor(out_features)) 124 | else: 125 | self.register_parameter('bias', None) 126 | self.adj_matrix = tensor_normalize(adj_matrix) 127 | # Pre-compute attention adjacency matrix 128 | if attention_scores is not None: 129 | with torch.no_grad(): 130 | self.adj_matrix = tensor_normalize(adj_matrix * attention_scores) 131 | self.reset_parameters() 132 | 133 | def reset_parameters(self): 134 | stdv = 1. 
/ math.sqrt(self.weight.size(1)) 135 | self.weight.data.uniform_(-stdv, stdv) 136 | if self.bias is not None: 137 | self.bias.data.uniform_(-stdv, stdv) 138 | 139 | def forward(self, input): 140 | """ 141 | :param input: Node features of shape [N, D] 142 | """ 143 | support = torch.mm(input, self.weight) # Shape [N, K] 144 | output = torch.matmul(self.adj_matrix, support) 145 | if self.bias is not None: 146 | return output + self.bias 147 | else: 148 | return output 149 | 150 | def __repr__(self): 151 | return self.__class__.__name__ + ' (' \ 152 | + str(self.in_features) + ' -> ' \ 153 | + str(self.out_features) + ')' 154 | 155 | 156 | class GraphNodeEdgeConvolution(Module): 157 | 158 | def __init__(self, node_input_size, edge_input_size, output_size, 159 | adj_matrix, edge_feat_matrix, bias=True): 160 | """ 161 | :param node_input_size: 162 | :param edge_input_size: 163 | :param output_size: 164 | :param adj_matrix: 165 | :param edge_feat_matrix: [N, N, K] 166 | :param bias: 167 | """ 168 | super(GraphNodeEdgeConvolution, self).__init__() 169 | self.input_size = node_input_size + edge_input_size 170 | self.output_size = output_size 171 | self.weight = Parameter(torch.FloatTensor(self.input_size, output_size)) 172 | if bias: 173 | self.bias = Parameter(torch.FloatTensor(output_size)) 174 | else: 175 | self.register_parameter('bias', None) 176 | self.adj_matrix = tensor_normalize(adj_matrix) 177 | self.edge_feat_matrix = edge_feat_matrix 178 | self.reset_parameters() 179 | 180 | def reset_parameters(self): 181 | stdv = 1. / math.sqrt(self.weight.size(1)) 182 | self.weight.data.uniform_(-stdv, stdv) 183 | if self.bias is not None: 184 | self.bias.data.uniform_(-stdv, stdv) 185 | 186 | def forward(self, node_features): 187 | num_nodes = node_features.shape[0] 188 | node_feats = node_features.view(1, num_nodes, -1).expand(num_nodes, -1, -1) 189 | combined_feats = torch.cat([self.edge_feat_matrix, node_feats], dim=-1) 190 | support = torch.matmul(combined_feats, self.weight) # shape [N, N, D] 191 | output = torch.matmul(support.transpose(2, 0), self.adj_matrix) # shape [D, N, N] 192 | output = torch.diagonal(output, dim1=1, dim2=2) 193 | output = output.transpose(1, 0) 194 | return output 195 | 196 | 197 | class EdgeConvolution(nn.Module): 198 | def __init__(self, in_features, out_features, inc_matrix): 199 | """ 200 | :param in_features: 201 | :param out_features: 202 | :param inc_matrix: Sparse incidence matrix of the graph of shape 203 | [N, E]. 204 | """ 205 | super(EdgeConvolution, self).__init__() 206 | self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, 207 | out_features)) 208 | self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features)) 209 | self.inc_matrix = inc_matrix 210 | self.reset_parameters() 211 | 212 | def reset_parameters(self): 213 | stdv = 1. / math.sqrt(self.weight.size(1)) 214 | self.weight.data.uniform_(-stdv, stdv) 215 | if self.bias is not None: 216 | self.bias.data.uniform_(-stdv, stdv) 217 | 218 | def forward(self, edge_nodes, edge_feats): 219 | """ 220 | :param edge_nodes: Matrix indicating the nodes which each edge in the 221 | batch connects. Shape [B, N]. 222 | :param edge_feats: Features of *all* edges in the graph. Shape [E, D]. 223 | :return: Hidden representation of shape [B, K]. 224 | """ 225 | # Get edges incident to the left and right nodes of each edge in the 226 | # batch. Result has shape [B, E]. 
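        # (The product below is inc_matrix.T [E, N] x edge_nodes.T [N, B];
        # its (e, b) entry counts the endpoints that graph edge e shares with
        # batch edge b, and the final transpose yields the [B, E] weights.)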
227 | batch_edge_idcs = sp.mm(self.inc_matrix.transpose(1, 0), 228 | edge_nodes.transpose(1, 0)).transpose(1, 0) 229 | # Normalise matrix row-wise such that edge features are averaged, not 230 | # summed. 231 | row_sum = torch.sum(batch_edge_idcs, dim=1) 232 | inv = 1.0 / row_sum 233 | inv[torch.isinf(inv)] = 0.0 234 | batch_edge_idcs = batch_edge_idcs * inv.view(-1, 1) 235 | 236 | # Compute hidden representations from edge_features 237 | h_edges = torch.mm(edge_feats, self.weight) + self.bias # [E, K] 238 | 239 | # Obtain features of each of these edges 240 | h = torch.spmm(batch_edge_idcs, h_edges) # [B, K] 241 | 242 | return h 243 | 244 | 245 | class DeepGraphConvolution(Module): 246 | """ 247 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 248 | Adapted from https://github.com/tkipf/pygcn/blob/master/pygcn/layers.py 249 | """ 250 | 251 | def __init__(self, in_features, hidden_features, out_features, adj_matrix, 252 | attention_scores=None, bias=True, num_layers=1): 253 | """ 254 | :param in_features: 255 | :param out_features: 256 | :param bias: 257 | :param attention_scores: Sparse tensor containing for each pair of 258 | nodes the attention score between the nodes. Shape [N, N]. 259 | :param adj_matrix: 260 | """ 261 | super(DeepGraphConvolution, self).__init__() 262 | self.in_features = in_features 263 | self.out_features = out_features 264 | self.lin_layers = [] 265 | self.bns = [] 266 | for idx in range(num_layers): 267 | in_size = in_features if idx == 0 else hidden_features 268 | out_size = out_features if idx == num_layers-1 else hidden_features 269 | self.lin_layers.append(nn.Linear(in_size, out_size, bias=bias)) 270 | self.bns.append(nn.BatchNorm1d(out_size)) 271 | self.lin_layers = nn.ModuleList(self.lin_layers) 272 | self.bns = nn.ModuleList(self.bns) 273 | # Pre-compute adjacency matrix with weighting if necessary 274 | self.adj_matrix = adj_matrix 275 | if attention_scores is not None: 276 | self.adj_matrix = tensor_normalize(adj_matrix * attention_scores) 277 | 278 | def forward(self, input): 279 | support = input 280 | for idx in range(len(self.lin_layers)): 281 | support = self.lin_layers[idx](support) 282 | if idx < len(self.lin_layers)-1: 283 | support = F.relu(support) 284 | support = self.bns[idx](support) 285 | output = torch.matmul(self.adj_matrix, support) 286 | return output 287 | 288 | def __repr__(self): 289 | return self.__class__.__name__ + ' (' \ 290 | + str(self.in_features) + ' -> ' \ 291 | + str(self.out_features) + ')' 292 | 293 | 294 | class DeepEdgeConvolution(nn.Module): 295 | def __init__(self, in_features, hidden_features, out_features, inc_matrix, 296 | bias=True, num_layers=1): 297 | """ 298 | :param in_features: 299 | :param out_features: 300 | :param inc_matrix: Sparse incidence matrix of the graph of shape 301 | [N, E]. 302 | """ 303 | super(DeepEdgeConvolution, self).__init__() 304 | self.lin_layers = [] 305 | self.bns = [] 306 | for idx in range(num_layers): 307 | in_size = in_features if idx == 0 else hidden_features 308 | out_size = out_features if idx == num_layers-1 else hidden_features 309 | self.lin_layers.append(nn.Linear(in_size, out_size, bias=bias)) 310 | self.bns.append(nn.BatchNorm1d(out_size)) 311 | self.lin_layers = nn.ModuleList(self.lin_layers) 312 | self.bns = nn.ModuleList(self.bns) 313 | self.inc_matrix = inc_matrix 314 | 315 | def forward(self, edge_nodes, edge_feats): 316 | """ 317 | :param edge_nodes: Matrix indicating the nodes which each edge in the 318 | batch connects. Shape [B, N]. 
319 | :param edge_feats: Features of *all* edges in the graph. Shape [E, D]. 320 | :return: Hidden representation of shape [B, K]. 321 | """ 322 | # Get edges incident to the left and right nodes of each edge in the 323 | # batch. Result has shape [B, E]. 324 | batch_edge_idcs = sp.mm(self.inc_matrix.transpose(1, 0), 325 | edge_nodes.transpose(1, 0)).transpose(1, 0) 326 | # Normalise matrix row-wise such that edge features are averaged, not 327 | # summed. 328 | row_sum = torch.sum(batch_edge_idcs, dim=1) 329 | inv = 1.0 / row_sum 330 | inv[torch.isinf(inv)] = 0.0 331 | batch_edge_idcs = batch_edge_idcs * inv.view(-1, 1) 332 | 333 | # Compute hidden representations from edge_features 334 | h_edges = edge_feats 335 | for idx in range(len(self.lin_layers)): 336 | h_edges = self.lin_layers[idx](h_edges) 337 | if idx < len(self.lin_layers)-1: 338 | h_edges = F.relu(h_edges) 339 | h_edges = self.bns[idx](h_edges) 340 | 341 | # Obtain features of each of these edges 342 | h = torch.spmm(batch_edge_idcs, h_edges) # [B, K] 343 | 344 | return h 345 | 346 | 347 | class PatchToPatchEdgeConvolution(nn.Module): 348 | def __init__(self, in_features, out_features): 349 | super(PatchToPatchEdgeConvolution, self).__init__() 350 | self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, 351 | out_features)) 352 | self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features)) 353 | self.reset_parameters() 354 | 355 | def reset_parameters(self): 356 | stdv = 1. / math.sqrt(self.weight.size(1)) 357 | self.weight.data.uniform_(-stdv, stdv) 358 | if self.bias is not None: 359 | self.bias.data.uniform_(-stdv, stdv) 360 | 361 | def forward(self, edge_nodes, adj_matrix, inc_matrix, edge_feats): 362 | """ 363 | :param edge_nodes: Matrix indicating the nodes which each edge in the 364 | batch connects. Shape [B, N] 365 | :param adj_matrix: Sparse adjacency matrix of the graph of shape 366 | [N, N]. Must contain only 1-entries (i.e. should not be normalised). 367 | :param inc_matrix: Sparse incidence matrix of the graph of shape 368 | [N, E]. 369 | :param edge_feats: Features of *all* edges in the graph. Shape [E, D]. 370 | :return: Hidden representation of shape [B, K]. 371 | """ 372 | # Get edges incident to the left and right nodes of each edge in the 373 | # batch. Result has shape [B, E]. 374 | # In essence, it computes BxN * NxN * NxE 375 | # = edge_nodes * adj_matrix * inc_matrix. 376 | batch_edge_idcs = sp.mm(adj_matrix.transpose(1, 0), 377 | edge_nodes.transpose(1, 0)) 378 | batch_edge_idcs = sp.mm(inc_matrix.transpose(1, 0), 379 | batch_edge_idcs).transpose(1, 0) 380 | # Find exactly those edges which are two "hops" away from the edge 381 | # in the batch 382 | batch_edge_idcs = (batch_edge_idcs == 2.0).float() 383 | # Normalise matrix row-wise such that edge features are averaged, not 384 | # summed.
385 | row_sum = torch.sum(batch_edge_idcs, dim=1) 386 | inv = 1.0 / row_sum 387 | inv[torch.isinf(inv)] = 0.0 388 | batch_edge_idcs = batch_edge_idcs * inv.view(-1, 1) 389 | 390 | # Compute hidden representations from edge_features 391 | h_edges = torch.mm(edge_feats, self.weight) + self.bias # [E, K] 392 | 393 | # Obtain features of each of these edges 394 | h = torch.spmm(batch_edge_idcs, h_edges) # [B, K] 395 | 396 | return h 397 | 398 | 399 | class GraphAttentionLayer(nn.Module): 400 | """ 401 | Simple GAT layer, similar to https://arxiv.org/abs/1710.10903 402 | """ 403 | 404 | def __init__(self, in_features, out_features, adj_matrix, dropout, alpha, 405 | edge_feats, edge_idcs, concat=True): 406 | super(GraphAttentionLayer, self).__init__() 407 | self.dropout = dropout 408 | self.in_features = in_features 409 | self.out_features = out_features 410 | self.adj_matrix = adj_matrix 411 | self.alpha = alpha 412 | self.concat = concat 413 | num_nodes = adj_matrix.shape[0] 414 | self.edge_feats = torch.zeros( 415 | (num_nodes, num_nodes, edge_feats.shape[1])).to( 416 | device=edge_idcs.device) 417 | self.edge_feats[edge_idcs[:, 0], edge_idcs[:, 1], :] = edge_feats 418 | 419 | self.W = nn.Parameter(torch.zeros(size=(in_features, out_features))) 420 | nn.init.xavier_uniform_(self.W.data, gain=1.414) 421 | self.a = nn.Parameter(torch.zeros( 422 | size=(2 * out_features + self.edge_feats.shape[-1], 1))) 423 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 424 | 425 | self.leakyrelu = nn.LeakyReLU(self.alpha) 426 | 427 | def forward(self, input): 428 | h = torch.mm(input, self.W) 429 | N = h.size()[0] 430 | 431 | a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features) 432 | a_input = torch.cat([a_input, self.edge_feats], dim=-1) 433 | e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2)) 434 | 435 | zero_vec = -9e15*torch.ones_like(e) 436 | attention = torch.where(self.adj_matrix > 0, e, zero_vec) 437 | attention = F.softmax(attention, dim=1) 438 | attention = F.dropout(attention, self.dropout, training=self.training) 439 | h_prime = torch.matmul(attention, h) 440 | 441 | if self.concat: 442 | return F.elu(h_prime) 443 | else: 444 | return h_prime 445 | 446 | -------------------------------------------------------------------------------- /dataset_preparation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides functionality for creating a data split of a given input 3 | graph into training, validation and test sets of edges. 
4 | """ 5 | import pathlib 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from utils import bin_data 10 | 11 | 12 | def prepare_dataset(): 13 | np.random.seed(7) 14 | city = "London_high" 15 | bin_bounds = [10.0, 100.0, 1000.0, 30000.0] 16 | include_spatial_lag = False 17 | data_path = pathlib.Path("Data/" + city) 18 | 19 | # Load adjacency matrix specifying which nodes lie in the geographical 20 | # neighborhood of each other 21 | geo_adj_matrix, geo_adj_idcs = _load_geo_adj_matrix(city) 22 | 23 | # Load pandas data frames containing node and edge data 24 | node_data, edge_data = _load_dataframes(city) 25 | true_flows = edge_data["flows"].values # also contains 0-valued flows 26 | edge_idcs = edge_data[["location_1", "location_2"]].values 27 | 28 | num_nodes = geo_adj_matrix.shape[0] 29 | num_geo_edges = geo_adj_idcs.shape[1] 30 | 31 | print(f"node columns: {node_data.columns}") 32 | print(node_data.head()) 33 | print(f"edge columns: {edge_data.columns}") 34 | print(edge_data.head()) 35 | 36 | (val_node_idcs, test_node_idcs, val_edge_idcs, 37 | test_edge_idcs, train_edge_idcs, bin_idcs) = _get_node_split(node_data, 38 | edge_data, 39 | bin_bounds) 40 | 41 | known_flows = _compute_known_flows(true_flows, edge_idcs, edge_data, 42 | val_node_idcs, test_node_idcs) 43 | 44 | if include_spatial_lag: 45 | raise NotImplementedError 46 | # approx_flows, node_data = _substitute_in_approximations( 47 | # val_node_idcs, test_node_idcs, flow_adj_idcs, flows, 48 | # edge_data["origin_to_neighbourhood"].values, 49 | # edge_data["neighbourhood_to_destination"].values, node_data) 50 | 51 | # Drop unused node and edge features 52 | node_data, edge_data = _filter_feature_data(node_data, edge_data, 53 | include_spatial_lag) 54 | 55 | # Remove 0-valued edges that are not in the training, validation, or test 56 | # set and update the set indices accordingly 57 | (train_edge_idcs, val_edge_idcs, test_edge_idcs, edge_idcs, true_flows, 58 | known_flows, edge_data) = _remove_unused_zero_edges(train_edge_idcs, 59 | val_edge_idcs, 60 | test_edge_idcs, 61 | edge_idcs, true_flows, 62 | known_flows, 63 | edge_data) 64 | flow_adj_idcs, flow_adj_values = _compute_flow_adj_matrix(known_flows, edge_idcs) 65 | num_flow_edges = flow_adj_idcs.shape[1] 66 | 67 | # Compute incidence graphs from adjacency matrices 68 | flow_inc_indices = _compute_incidence_matrix(flow_adj_idcs, num_nodes, 69 | num_flow_edges) 70 | geo_inc_indices = _compute_incidence_matrix(geo_adj_idcs, num_nodes, 71 | num_geo_edges) 72 | 73 | _store_dataset_files(data_path, edge_data, node_data, true_flows, 74 | flow_adj_idcs, flow_adj_values, geo_adj_idcs, 75 | flow_inc_indices, geo_inc_indices, train_edge_idcs, 76 | val_edge_idcs, test_edge_idcs) 77 | 78 | 79 | def _load_geo_adj_matrix(city): 80 | path = f'../raw_data/{city}/geo_adj_matrix.csv' 81 | geo_adj_matrix = np.genfromtxt(path, delimiter=',')[1:, 1:] 82 | geo_adj_indices = np.stack(np.nonzero(geo_adj_matrix)) 83 | return geo_adj_matrix, geo_adj_indices 84 | 85 | 86 | def _load_dataframes(city): 87 | node_path = f'../raw_data/{city}/node_data.csv' 88 | node_data = pd.read_csv(node_path, header=0, index_col=0) 89 | 90 | edge_path = f'../raw_data/{city}/edge_data.csv' 91 | edge_data = pd.read_csv(edge_path, header=0, index_col=0) 92 | 93 | return node_data, edge_data 94 | 95 | 96 | def _get_node_split(node_data, edge_data, bin_bounds): 97 | flows = edge_data["flows"].values 98 | edge_idcs = edge_data.values[:, :2].astype(np.int) # Ex2 array indicating the two nodes an edge connects 99 | 
num_edges = len(edge_idcs) 100 | num_nodes = len(node_data) 101 | num_bins = len(bin_bounds) 102 | bin_idcs = bin_data(flows, num_bins, scale="custom", bin_bounds=bin_bounds) 103 | bin_counts = np.bincount(bin_idcs) 104 | bin_counts[0] = np.sum((flows > 0) & (flows < bin_bounds[0])) # Special case: When it comes to compute the fraction of edges of the smallest bin, we exclude the huge number of 0-valued edges 105 | smallest_bin_idx = np.argmin(bin_counts) 106 | bin_samples, = np.where(bin_idcs == smallest_bin_idx) 107 | np.random.shuffle(bin_samples) 108 | 109 | test_edge_set, test_node_set = _create_node_set(set(), set(), 110 | smallest_bin_idx, 111 | bin_samples, bin_idcs, 112 | edge_idcs, num_bins, 113 | 0.2 * bin_counts) 114 | val_edge_set, val_node_set = _create_node_set(test_node_set, test_edge_set, 115 | smallest_bin_idx, 116 | bin_samples, bin_idcs, 117 | edge_idcs, num_bins, 118 | 0.1 * bin_counts) 119 | 120 | # Create training set by selecting all non-zero-valued edges and a limited 121 | # number of zero-valued edges 122 | non_train_node_set = val_node_set.union(test_node_set) 123 | train_edge_set = set() 124 | max_num_zero = 10000 # include limited number of zero-valued edges 125 | num_zero = 0 126 | for edge_idx, (flow, edge_idcs) in enumerate(zip(flows, edge_idcs)): 127 | if (not edge_idcs[0] in non_train_node_set 128 | and not edge_idcs[1] in non_train_node_set): 129 | if flow >= 1.0: 130 | train_edge_set.add(edge_idx) 131 | elif num_zero < max_num_zero: 132 | train_edge_set.add(edge_idx) 133 | num_zero += 1 134 | 135 | assert len(test_edge_set.intersection(val_edge_set)) == 0 136 | assert len(test_edge_set.intersection(train_edge_set)) == 0 137 | assert len(val_edge_set.intersection(train_edge_set)) == 0 138 | assert len(test_node_set.intersection(val_node_set)) == 0 139 | 140 | val_node_idcs = np.array(list(val_node_set), dtype=np.int) 141 | test_node_idcs = np.array(list(test_node_set), dtype=np.int) 142 | val_edge_idcs = np.array(list(val_edge_set), dtype=np.int) 143 | test_edge_idcs = np.array(list(test_edge_set), dtype=np.int) 144 | train_edge_idcs = np.array(list(train_edge_set), dtype=np.int) 145 | return (val_node_idcs, test_node_idcs, val_edge_idcs, test_edge_idcs, 146 | train_edge_idcs, bin_idcs) 147 | 148 | 149 | def _create_node_set(unavailable_nodes, unavailable_edges, smallest_bin_idx, 150 | bin_samples, bin_idcs, edge_idcs, num_bins, max_per_bin, 151 | MAX_PER_BIN_AND_NODE=110): 152 | """ 153 | We create a validation/test set of edges by randomly drawing edges from 154 | the bin with the smallest number of samples in them. For each sampled edge, 155 | we choose one of the incident nodes and add them to a set of nodes excluded 156 | from training. Then we take the edges incident to that node and add them 157 | to the validation/test set (except when we already added enough edges for 158 | a bucket and we also only add a maximum number of edges of the same bucket 159 | per node; otherwise the most frequent bucket type would contain mostly 160 | edges of the first few nodes). 161 | :param unavailable_nodes: Set of node indices that are no longer available 162 | for being included in the new validation/test set. 163 | :param unavailable_edges: Set of edge indices that are no longer available 164 | for being included in the new validation/test set. 165 | :param smallest_bin_idx: Index of the bin with the fewest samples. 166 | Determines how to greedily select edges. 
167 | :param bin_samples: NumPy array of edge indices belonging to the smallest 168 | bin that are used to guide the creation of the node set. 169 | :param bin_idcs: E-shaped vector specifying for each edge which bin it 170 | belongs to. 171 | :param edge_idcs: Ex2-shaped tensor indicating the indices of the two nodes 172 | an edge connects. 173 | :param num_bins: Number of bins. 174 | :param max_per_bin: Maximum number of edges to find for each bin. 175 | :param MAX_PER_BIN_AND_NODE: Maximum number of edges that a single node 176 | may add to a single bin. 177 | :return: 178 | """ 179 | def add_incident_edges(inc_edge_idcs, current_set_edges, 180 | current_set_bin_counts): 181 | """ 182 | Adds the edges given in `inc_edge_idcs` to `current_set_edges` subject 183 | to some conditions. 184 | :param inc_edge_idcs: 185 | :param current_set_edges: 186 | :param current_set_bin_counts: 187 | :return: 188 | """ 189 | added_count = 0 # Number of edges actually added 190 | node_bin_counts = np.zeros(num_bins) # Bin counts for edges actually added 191 | for inc_edge_idx in inc_edge_idcs: 192 | edge_bin = bin_idcs[inc_edge_idx] 193 | if (inc_edge_idx not in current_set_edges 194 | and inc_edge_idx not in unavailable_edges 195 | and node_bin_counts[edge_bin] < MAX_PER_BIN_AND_NODE 196 | and current_set_bin_counts[edge_bin]+node_bin_counts[edge_bin] < max_per_bin[edge_bin]): 197 | added_count += 1 198 | node_bin_counts[edge_bin] += 1 199 | current_set_edges.add(inc_edge_idx) 200 | return added_count, node_bin_counts 201 | 202 | # Samples in smallest bin 203 | set_nodes, set_edges = set(), set() # Nodes and edges selected for validation/test 204 | set_bin_counts = np.zeros(num_bins) # Counts of the edges in validation/test set for each bin 205 | 206 | for idx, edge_idx in enumerate(bin_samples): 207 | # If we have enough edges of the rarest (smallest) type, we can stop 208 | # adding edges. 209 | if set_bin_counts[smallest_bin_idx] >= max_per_bin[smallest_bin_idx]: 210 | break 211 | # If the edge is already in a different set, do not include it 212 | if edge_idx in unavailable_edges: 213 | continue 214 | out_node, in_node = tuple(edge_idcs[edge_idx]) 215 | 216 | # If both nodes are no longer available, go to next edge 217 | if in_node in unavailable_nodes and out_node in unavailable_nodes: 218 | continue 219 | # Decide which of the two nodes to add based on whether one is already 220 | # in the node set or no longer available 221 | if in_node in set_nodes or out_node in unavailable_nodes: # We have already added in_node to set_nodes in a previous iteration OR in_node belongs to an excluded set (e.g. the validation set created in a previous call to this method). 222 | node_to_add = in_node 223 | elif out_node in set_nodes or in_node in unavailable_nodes: 224 | node_to_add = out_node 225 | else: 226 | node_to_add = in_node 227 | 228 | # Add node to the set 229 | set_nodes.add(node_to_add) 230 | # Now add all the edges incident to the new node to the edge set 231 | # (subject to some conditions). 
232 | # For outgoing edges 233 | out_edge_idcs, = np.where(edge_idcs[:, 0] == node_to_add) 234 | add_set_out_count, add_node_bin_counts = add_incident_edges(out_edge_idcs, set_edges, set_bin_counts) 235 | set_bin_counts += add_node_bin_counts 236 | # For incoming edges 237 | in_edge_idcs, = np.where(edge_idcs[:, 1] == node_to_add) 238 | add_set_in_count, add_node_bin_counts = add_incident_edges(in_edge_idcs, set_edges, set_bin_counts) 239 | set_bin_counts += add_node_bin_counts 240 | 241 | if np.any(set_bin_counts >= max_per_bin): 242 | print(f"One bin full after adding {idx+1} edges.") 243 | return set_edges, set_nodes 244 | 245 | 246 | def _compute_known_flows(true_flows, edge_idcs, edge_data, val_node_idcs, 247 | test_node_idcs): 248 | known_flows = np.copy(true_flows) 249 | unknown_nodes = np.concatenate((val_node_idcs, test_node_idcs)) 250 | loc1_unknown = np.isin(edge_idcs[:, 0], unknown_nodes) 251 | loc2_unknown = np.isin(edge_idcs[:, 1], unknown_nodes) 252 | known_flows[loc1_unknown] = edge_data["neighbourhood_to_location2"].iloc[loc1_unknown] 253 | known_flows[loc2_unknown] = edge_data["location1_to_neighbourhood"].iloc[loc2_unknown] 254 | return known_flows 255 | 256 | 257 | def _substitute_in_approximations(val_nodes, test_nodes, adj_idcs, flows, 258 | o2n_flow_approx, n2d_flow_approx, node_data): 259 | train_flows = np.copy(flows) 260 | 261 | # Replace flows for edges incident to validation nodes by approximations 262 | val_outgoing_edge_idcs = np.isin(adj_idcs[0], val_nodes) 263 | train_flows[val_outgoing_edge_idcs] = n2d_flow_approx[val_outgoing_edge_idcs] 264 | val_incoming_edge_idcs = np.isin(adj_idcs[1], val_nodes) 265 | train_flows[val_incoming_edge_idcs] = o2n_flow_approx[val_incoming_edge_idcs] 266 | 267 | # Replace flows for edges incident to test nodes by approximations 268 | test_outgoing_edges = np.isin(adj_idcs[0], test_nodes) 269 | train_flows[test_outgoing_edges] = n2d_flow_approx[test_outgoing_edges] 270 | test_incoming_edges = np.isin(adj_idcs[1], test_nodes) 271 | train_flows[test_incoming_edges] = o2n_flow_approx[test_incoming_edges] 272 | 273 | # Set flows of edges between nodes within the two sets to 0 274 | union_nodes = np.concatenate((val_nodes, test_nodes)) 275 | inner_edges = np.logical_and((np.isin(adj_idcs[0], union_nodes)), (np.isin(adj_idcs[1], union_nodes))) 276 | train_flows[inner_edges] = 0.0 277 | 278 | # In node features, replace flow-dependent values of validation/test nodes 279 | # by their spatial-lag approximation 280 | node_data.loc[node_data["nodeID"].isin(union_nodes), "out_total"] = node_data["out_total_spatial_lag"] 281 | node_data.loc[node_data["nodeID"].isin(union_nodes), "in_total"] = node_data["in_total_spatial_lag"] 282 | node_data.loc[node_data["nodeID"].isin(union_nodes), "gyration_radius"] = node_data["gyration_radius_spatial_lag"] 283 | 284 | return train_flows, node_data 285 | 286 | 287 | def _remove_unused_zero_edges(train_edge_idcs, val_edge_idcs, test_edge_idcs, 288 | edge_idcs, true_flows, known_flows, edge_data): 289 | """ 290 | We keep 0-valued edges that belong to the training, validation, or test 291 | set because the data set class uses the entries of a sparse matrix for 292 | some of its computations, and 0-valued entries would simply be dropped, 293 | which would mess up the indexing of the edges that make up each set. 294 | Hence, we only remove those 0-valued edges that are not referenced by any 295 | of the three sets and update the set indices accordingly.
296 | :param train_edge_idcs: 297 | :param val_edge_idcs: 298 | :param test_edge_idcs: 299 | :param edge_idcs: 300 | :param true_flows: 301 | :param known_flows: 302 | :param edge_data: 303 | :return: 304 | """ 305 | # all edges in either the training, validation or test set 306 | num_edges = len(true_flows) 307 | tvt_edges = np.concatenate((train_edge_idcs, val_edge_idcs, 308 | test_edge_idcs), axis=-1) 309 | tvt_edges = np.isin(np.arange(num_edges), tvt_edges) # convert to boolean array 310 | 311 | # Find out the indices of the training, validation, and test edges within 312 | # the filtered set of edges 313 | indices = np.zeros(num_edges, dtype=np.int) 314 | indices[train_edge_idcs] = 1 315 | indices[val_edge_idcs] = 2 316 | indices[test_edge_idcs] = 3 317 | # Remove 0-valued edges that are not in the training, validation, or test 318 | # set, i.e. keep edges that are in one of the sets or have a non-zero exact 319 | # or approximate flow. 320 | retained_edges = tvt_edges | (known_flows != 0.0) | (true_flows != 0.0) 321 | indices = indices[retained_edges] 322 | edge_idcs = edge_idcs[retained_edges] 323 | true_flows = true_flows[retained_edges] 324 | known_flows = known_flows[retained_edges] 325 | edge_data = edge_data.iloc[retained_edges] 326 | # Update indices 327 | train_edge_idcs = np.where(indices == 1)[0] 328 | val_edge_idcs = np.where(indices == 2)[0] 329 | test_edge_idcs = np.where(indices == 3)[0] 330 | 331 | return (train_edge_idcs, val_edge_idcs, test_edge_idcs, edge_idcs, 332 | true_flows, known_flows, edge_data) 333 | 334 | 335 | def _compute_flow_adj_matrix(known_flows, edge_idcs): 336 | """ 337 | :param known_flows: Specifies for each edge the known flow (i.e. true flow 338 | or spatial lag flow). Shape [E]. 339 | :param edge_idcs: Specifies for each edge the indices of the two incident 340 | nodes. Shape [E, 2]. 341 | :return: 342 | - flow_adj_idcs: Indices of non-zero entries in the flow adjacenty 343 | matrix. Shape [2E, 2]. 344 | - know_flows: Specifies the non-zero values in the flow matrix. Shape 345 | [2E]. 
346 | """ 347 | upper_triag_idcs = edge_idcs[known_flows > 0.0] 348 | known_flows = known_flows[known_flows > 0.0] 349 | lower_triag_idcs = np.stack((upper_triag_idcs[:, 1], upper_triag_idcs[:, 0]), axis=1) 350 | flow_adj_idcs = np.concatenate((upper_triag_idcs, lower_triag_idcs), axis=0) 351 | flow_adj_values = np.concatenate((known_flows, np.copy(known_flows))) 352 | return flow_adj_idcs, flow_adj_values 353 | 354 | 355 | def _compute_incidence_matrix(adj_indices, num_nodes, num_edges): 356 | # For both incoming and outgoing edges 357 | inc_matrix = np.zeros((num_nodes, num_edges)) 358 | inc_matrix[adj_indices[0], np.arange(num_edges)] = 1 359 | inc_matrix[adj_indices[1], np.arange(num_edges)] = 1 360 | inc_indices = np.stack(np.nonzero(inc_matrix)) 361 | return inc_indices 362 | 363 | 364 | def _compute_node_idcs_matrix(node_idcs, edge_idcs_set, num_nodes): 365 | num_edges = len(edge_idcs_set) 366 | node_idcs_matrix = np.zeros(num_edges, num_nodes) 367 | node_idcs_matrix[np.arange(num_edges), node_idcs] = 1.0 368 | node_idcs_matrix[np.arange(num_edges), node_idcs] = 1.0 369 | return node_idcs_matrix 370 | 371 | 372 | def _filter_feature_data(node_data, edge_data, include_spatial_lag): 373 | edge_data = edge_data.drop(["flows"], axis=1) 374 | node_data = node_data.drop(["nodeID", "in_total_spatial_lag", 375 | "out_total_spatial_lag", 376 | "gyration_radius_spatial_lag"], axis=1) 377 | if not include_spatial_lag: 378 | node_data = node_data.drop(["in_total", "out_total", "gyration_rad"], 379 | axis=1) 380 | 381 | return node_data, edge_data 382 | 383 | 384 | def _store_dataset_files(data_path, edge_data, node_data, flows, flow_adj_idcs, 385 | flow_adj_values, geo_adj_idcs, flow_inc_indices, 386 | geo_inc_indices, train_edge_idcs, val_edge_idcs, 387 | test_edge_idcs): 388 | data_path.mkdir(exist_ok=True) 389 | pd.to_pickle(edge_data, data_path / "edge_data.pk") 390 | pd.to_pickle(node_data, data_path / "node_data.pk") 391 | np.save(data_path / "flows.npy", flows) 392 | np.save(data_path / "flow_adj_indices.npy", flow_adj_idcs) 393 | np.save(data_path / "flow_adj_values.npy", flow_adj_values) 394 | np.save(data_path / "geo_adj_indices.npy", geo_adj_idcs) 395 | np.save(data_path / "flow_inc_indices.npy", flow_inc_indices) 396 | np.save(data_path / "geo_inc_indices.npy", geo_inc_indices) 397 | np.save(data_path / "train_edge_indices.npy", train_edge_idcs) 398 | np.save(data_path / "val_edge_indices.npy", val_edge_idcs) 399 | np.save(data_path / "test_edge_indices.npy", test_edge_idcs) 400 | 401 | 402 | if __name__ == '__main__': 403 | prepare_dataset() 404 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader, BatchSampler 7 | from torch.utils.data import WeightedRandomSampler 8 | import scipy.sparse as ssp 9 | import sklearn.preprocessing as prep 10 | import sklearn.pipeline as ppln 11 | from sklearn.utils import class_weight 12 | import matplotlib.pyplot as plt 13 | 14 | from utils import to_sparse_tensor, bin_data, normalize, split_bucketed_data, \ 15 | summarize_feature_matrix 16 | 17 | 18 | def get_composite_transformer(n_quantiles): 19 | transformer = ppln.Pipeline([ 20 | ("quantile", prep.QuantileTransformer(output_distribution="normal", 21 | n_quantiles=n_quantiles)), 22 | ("normalize", prep.StandardScaler()) 
23 | ]) 24 | return transformer 25 | 26 | 27 | class BinnedTransformer: 28 | 29 | def __init__(self, num_bins, create_transformer_f): 30 | self.num_bins = num_bins 31 | 32 | self.transformers = [create_transformer_f() for _ in range(num_bins)] 33 | 34 | def fit_transform(self, x_reg, x_class): 35 | transformed_x_reg = np.copy(x_reg) 36 | for bin_idx in range(self.num_bins): 37 | sample_idcs = x_class == bin_idx 38 | transformer = self.transformers[bin_idx] 39 | transformed_x_reg[sample_idcs] = transformer.fit_transform( 40 | transformed_x_reg[sample_idcs]) 41 | return transformed_x_reg 42 | 43 | def inverse_transform(self, x_reg, x_class): 44 | x_reg = x_reg.reshape(-1) 45 | transformed_x_reg = np.copy(x_reg) 46 | for bin_idx in range(self.num_bins): 47 | sample_idcs = x_class == bin_idx 48 | if np.sum(sample_idcs) == 0: continue # no sample of that class 49 | transformer = self.transformers[bin_idx] 50 | transformed_x_reg[sample_idcs] = transformer.inverse_transform( 51 | x_reg[sample_idcs].reshape(-1, 1)).reshape(-1) 52 | return transformed_x_reg 53 | 54 | 55 | class GraphTopologicalData: 56 | 57 | def __init__(self, adj_matrix=None, unweighted_adj_matrix=None, 58 | inc_matrix=None, inc_matrix_dense=None, edge_indices=None, 59 | edge_weights=None): 60 | self.adj_matrix = adj_matrix # NxN sparse matrix 61 | self.unweighted_adj_matrix = unweighted_adj_matrix # NxN sparse matrix 62 | self.inc_matrix = inc_matrix # NxE sparse matrix 63 | self.inc_matrix_dense = inc_matrix_dense # NxE dense matrix 64 | self.edge_indices = edge_indices # Ex2 dense matrix 65 | self.edge_weights = edge_weights # E dense vector 66 | 67 | 68 | class UrbanPlanningDataset: 69 | 70 | def __init__(self, data_base_path="Data/", num_bins=4, batch_size=32, 71 | n_quantiles=1000, resample=False, 72 | excluded_node_feature_columns=tuple(), 73 | excluded_edge_feature_columns=tuple(), 74 | use_binned_transformer=False, include_approx_flows=False, 75 | flow_adj_threshold=0, seed=7): 76 | """ 77 | Loads city data set. 78 | :param data_base_path: Location at which to find the node features, 79 | edge features, and the adjacency matrix. 80 | :param num_bins: Number of bins for dividing the data set labels. The 81 | bin index may be a classification target or for computing MAEs for each 82 | bin separately. 83 | :param batch_size: 84 | :param n_quantiles: Number of quantiles to use for the quantile 85 | transformer that preprocesses features and labels. 86 | :param excluded_node_feature_columns: Tuple of names of the columns 87 | to remove from the node feature data set. 88 | :param excluded_edge_feature_columns: Tuple of names of the columns to 89 | remove from the edge feature data set. 90 | :param resample: If True, we use a weighted random sampler to ensure 91 | that each epoch contains an equal number of samples from each bin. 92 | :param use_binned_transformer: If True, the edge labels are rescaled 93 | using an individual transformer for each bin. Inverting the 94 | transformation then requires both a regression and classification 95 | prediction. 96 | :param include_approx_flows: If True, the edge features include the 97 | approximate flows (normally used just for flow adjacency matrix). 98 | :param flow_adj_threshold: When constructing the unweighted flow 99 | adjacency matrix, only include edges with a flow greater or equal that 100 | threshold. 101 | :param seed: Random seed to always obtain the same split into training, 102 | validation, and test set. 
103 | :return: Tuple consisting of 104 | - Node features of shape [N, K] 105 | - Sparse adjacency matrix of shape [N, N] 106 | - Loader for the training set of edges 107 | - Loader for the validation set of edges 108 | - Loader for the test set of edges 109 | - Number of node features 110 | - Number of edge features 111 | - Scaler used for edge labels 112 | """ 113 | print("Loading data") 114 | 115 | self.num_bins = num_bins 116 | self.batch_size = batch_size 117 | self.n_quantiles = n_quantiles 118 | self.use_binned_transformer = use_binned_transformer 119 | 120 | get_composite_transformer_f = lambda: get_composite_transformer( 121 | n_quantiles=n_quantiles) 122 | 123 | # Load node data 124 | (self.node_feats, self.num_nodes, self.num_node_feats, 125 | self.node_scaler) = self._load_node_data(data_base_path, 126 | get_composite_transformer_f, 127 | excluded_node_feature_columns) 128 | 129 | # Load edge data 130 | (flow_edge_indices, self.edge_feats, self.edge_labels, 131 | self.edge_labels_unscaled, self.label_scaler, self.edge_scaler, 132 | self.num_edges, self.num_edge_feats) = self._load_edge_data( 133 | data_base_path, 134 | get_composite_transformer_f, 135 | include_approx_flows, 136 | excluded_edge_feature_columns) 137 | self.max_label = np.max(self.edge_labels_unscaled) 138 | print(f"\tMax label {self.max_label}") 139 | 140 | (train_idcs, val_idcs, test_idcs) = self._load_dataset_split( 141 | data_base_path) 142 | 143 | # Load flow graph data 144 | (flow_adj_matrix, flow_inc_matrix, flow_adj_indices, 145 | unweighted_flow_adj_matrix, 146 | flow_adj_values) = self._load_flow_graph_data( 147 | data_base_path, self.num_nodes, self.num_edges, flow_adj_threshold) 148 | self.flow_topology = GraphTopologicalData( 149 | adj_matrix=flow_adj_matrix, 150 | edge_indices=flow_adj_indices, 151 | unweighted_adj_matrix=unweighted_flow_adj_matrix, 152 | inc_matrix=flow_inc_matrix, 153 | edge_weights=flow_adj_values 154 | ) 155 | 156 | # Load geographical graph data 157 | (geo_adj_matrix, geo_inc_matrix, 158 | geo_edge_indices, geo_adj_values) = self._load_geo_graph_data( 159 | data_base_path, self.num_nodes, self.num_edges, self.flow_topology) 160 | self.geo_topology = GraphTopologicalData( 161 | adj_matrix=geo_adj_matrix, 162 | inc_matrix=geo_inc_matrix, 163 | edge_indices=geo_edge_indices, 164 | edge_weights=geo_adj_values) 165 | 166 | # Load bin data 167 | self.bin_bounds = [10.0, 100.0, 1000.0, 10000.0] 168 | (self.edge_buckets, self.train_bin_weights, self.val_bin_weights, 169 | self.test_bin_weights) = self._load_bin_data(self.bin_bounds, 170 | self.edge_labels_unscaled, 171 | num_bins, train_idcs, 172 | val_idcs, test_idcs) 173 | print(f"\tBin counts: {np.array([np.sum(self.edge_buckets == i) for i in range(num_bins)])}") 174 | print(f"\tTraining bin weights: {self.train_bin_weights}") 175 | print(f"\tValidation bin weights: {self.val_bin_weights}") 176 | print(f"\tTest bin weights: {self.test_bin_weights}") 177 | 178 | # If specified, use the binned transformer to transform labels 179 | if use_binned_transformer: 180 | self.label_scaler = BinnedTransformer(self.num_bins, 181 | get_composite_transformer_f) 182 | self.edge_labels = self.label_scaler.fit_transform( 183 | self.edge_labels_unscaled.reshape(-1, 1), self.edge_buckets).reshape(-1) 184 | # plt.hist(self.edge_labels, bins=100) 185 | # plt.show() 186 | 187 | # Create edge feature matrix 188 | indices = flow_edge_indices.transpose(1, 0) 189 | values = self.edge_feats 190 | edge_feat_matrix = 
torch.sparse.FloatTensor(torch.from_numpy(indices), torch.from_numpy(values)) 191 | self.edge_feat_matrix = edge_feat_matrix.to_dense() 192 | 193 | # Convert numpy arrays to tensors 194 | self.node_feats = torch.from_numpy(self.node_feats) 195 | self.edge_feats = torch.from_numpy(self.edge_feats) 196 | flow_edge_indices = torch.from_numpy(flow_edge_indices) 197 | self.flow_topology.edge_indices = torch.from_numpy(self.flow_topology.edge_indices) 198 | self.flow_topology.edge_weights = torch.from_numpy(self.flow_topology.edge_weights) 199 | self.geo_topology.edge_indices = torch.from_numpy(self.geo_topology.edge_indices) 200 | self.geo_topology.edge_weights = torch.from_numpy(self.geo_topology.edge_weights) 201 | self.edge_labels = torch.from_numpy(self.edge_labels) 202 | self.edge_labels_unscaled = torch.from_numpy(self.edge_labels_unscaled) 203 | self.edge_buckets = torch.from_numpy(self.edge_buckets) 204 | self.train_bin_weights = torch.from_numpy(self.train_bin_weights) 205 | self.val_bin_weights = torch.from_numpy(self.val_bin_weights) 206 | self.test_bin_weights = torch.from_numpy(self.test_bin_weights) 207 | # Matrices 208 | self.geo_topology.adj_matrix = to_sparse_tensor(normalize(self.geo_topology.adj_matrix)) 209 | self.geo_topology.inc_matrix = to_sparse_tensor(self.geo_topology.inc_matrix) 210 | self.flow_topology.adj_matrix = to_sparse_tensor(self.flow_topology.adj_matrix) # Sparse tensor of shape [N, N] containing the flow values between nodes. 211 | self.flow_topology.unweighted_adj_matrix = to_sparse_tensor(self.flow_topology.unweighted_adj_matrix) 212 | self.flow_topology.inc_matrix = to_sparse_tensor(self.flow_topology.inc_matrix) 213 | self._check_data_consistency() 214 | 215 | # Create data loaders 216 | (self.train_loader, self.val_loader, 217 | self.test_loader) = self._create_data_loaders(train_idcs, val_idcs, 218 | test_idcs, 219 | self.train_bin_weights, 220 | flow_edge_indices, # different from flow_graph_topology.edge_indices because of additional 0-flows 221 | self.edge_feats, 222 | self.edge_labels, 223 | self.edge_buckets, 224 | batch_size, resample, 225 | seed) 226 | 227 | print("Finished loading data") 228 | 229 | def _check_data_consistency(self): 230 | tensors = [self.node_feats, self.edge_feats, 231 | self.flow_topology.edge_indices, 232 | self.geo_topology.edge_indices, self.edge_labels, 233 | self.edge_labels_unscaled, self.edge_buckets, 234 | self.train_bin_weights, self.val_bin_weights, 235 | self.test_bin_weights, self.geo_topology.adj_matrix, 236 | self.geo_topology.inc_matrix, self.flow_topology.adj_matrix, 237 | self.flow_topology.unweighted_adj_matrix, 238 | self.flow_topology.inc_matrix, self.edge_feat_matrix] 239 | print("Checking ", end="") 240 | for idx, tensor in enumerate(tensors): 241 | print(f"{idx}, ", end="") 242 | if (isinstance(tensor, torch.sparse.FloatTensor) or 243 | isinstance(tensor, torch.sparse.LongTensor)): 244 | assert not torch.isnan(tensor.coalesce().indices()).any() 245 | assert not torch.isnan(tensor.coalesce().values()).any() 246 | else: 247 | assert not torch.isnan(tensor).any() 248 | print("done") 249 | 250 | def to(self, device): 251 | """ 252 | Moves all tensors of the dataset that will not be iterated over in 253 | minibatch to the specified device. 254 | :param device: Device specifier. 
255 | """ 256 | self.node_feats = self.node_feats.to(device=device) 257 | self.edge_feats = self.edge_feats.to(device=device) 258 | self.flow_topology.edge_indices = self.flow_topology.edge_indices.to(device=device) 259 | self.geo_topology.edge_indices = self.geo_topology.edge_indices.to(device=device) 260 | self.train_bin_weights = self.train_bin_weights.to(device=device) 261 | self.geo_topology.adj_matrix = self.geo_topology.adj_matrix.to(device=device) 262 | self.geo_topology.inc_matrix = self.geo_topology.inc_matrix.to(device=device) 263 | self.geo_topology.edge_weights = self.geo_topology.edge_weights.to(device=device) 264 | self.flow_topology.adj_matrix = self.flow_topology.adj_matrix.to(device=device) 265 | self.flow_topology.unweighted_adj_matrix = self.flow_topology.unweighted_adj_matrix.to( 266 | device=device) 267 | self.flow_topology.inc_matrix = self.flow_topology.inc_matrix.to(device=device) 268 | self.flow_topology.edge_weights = self.flow_topology.edge_weights.to(device=device) 269 | self.edge_feat_matrix = self.edge_feat_matrix.to(device=device) 270 | 271 | @staticmethod 272 | def _load_node_data(data_base_path, get_composite_transformer_f, 273 | excluded_columns): 274 | # Node features 275 | node_data = pd.read_pickle(os.path.join(data_base_path, "node_data.pk")) 276 | if len(excluded_columns) > 0: 277 | node_data.drop(list(excluded_columns), axis=1, inplace=True) 278 | node_feats = node_data.values 279 | # Rescale continuous features 280 | node_scaler = get_composite_transformer_f() 281 | cont_feature_idcs = UrbanPlanningDataset._get_continuous_feature_idcs(node_data) 282 | node_feats[:, cont_feature_idcs] = node_scaler.fit_transform(node_feats[:, cont_feature_idcs]) 283 | node_feats = node_feats.astype(np.float32) 284 | num_nodes = node_feats.shape[0] 285 | num_node_feats = node_feats.shape[1] 286 | return node_feats, num_nodes, num_node_feats, node_scaler 287 | 288 | @staticmethod 289 | def _load_edge_data(data_base_path, get_composite_transformer_f, 290 | include_approx_flows, excluded_columns): 291 | # Edge data 292 | edge_data = pd.read_pickle(os.path.join(data_base_path, "edge_data.pk")) 293 | if len(excluded_columns) > 0: 294 | edge_data.drop(list(excluded_columns), axis=1, inplace=True) 295 | edge_feats = edge_data.values 296 | edge_indices = edge_feats[:, :2].astype(np.int) 297 | edge_feats = edge_feats[:, 2:] 298 | # Load approximate flows and potentially concatenate to edge features 299 | # approx_flows = np.load(os.path.join(data_base_path, 300 | # "approx_flows.npy")) 301 | if include_approx_flows: 302 | raise NotImplementedError 303 | # edge_feats = np.concatenate((edge_feats, approx_flows.reshape(-1, 1)), 304 | # axis=-1) 305 | num_edges = edge_feats.shape[0] 306 | edge_labels = np.load(os.path.join(data_base_path, "flows.npy")) 307 | edge_labels_unscaled = np.copy(edge_labels).astype(np.float32) 308 | # Transform edge features 309 | edge_scaler = get_composite_transformer_f() 310 | cont_feature_idcs = UrbanPlanningDataset._get_continuous_feature_idcs(edge_data.iloc[:, 2:]) 311 | edge_feats[:, cont_feature_idcs] = edge_scaler.fit_transform(edge_feats)[:, cont_feature_idcs] 312 | edge_feats = edge_feats.astype(np.float32) 313 | # Transform edge labels 314 | edge_labels = edge_labels.astype(np.float32) 315 | label_scaler = get_composite_transformer_f() 316 | edge_labels = label_scaler.fit_transform( 317 | edge_labels.reshape(-1, 1)).reshape(-1) 318 | num_edge_feats = edge_feats.shape[1] 319 | return (edge_indices, edge_feats, edge_labels, 
edge_labels_unscaled, 320 | label_scaler, edge_scaler, num_edges, num_edge_feats) 321 | 322 | @staticmethod 323 | def _load_dataset_split(data_base_path): 324 | data_base_path = pathlib.Path(data_base_path) 325 | train_idcs = np.load(data_base_path / "train_edge_indices.npy") 326 | val_idcs = np.load(data_base_path / "val_edge_indices.npy") 327 | test_idcs = np.load(data_base_path / "test_edge_indices.npy") 328 | return train_idcs, val_idcs, test_idcs 329 | 330 | @staticmethod 331 | def _load_bin_data(bin_bounds, edge_labels_unscaled, num_bins, 332 | train_idcs, val_idcs, test_idcs): 333 | # Get edge buckets (assign each edge to a bucket based on magnitude of 334 | # flow) 335 | edge_buckets = bin_data(edge_labels_unscaled, num_bins, 336 | scale="custom", bin_bounds=bin_bounds) 337 | # Compute weights for each bucket to counterbalance the imbalanced 338 | # class/bin distribution 339 | train_bin_weights = class_weight.compute_class_weight('balanced', 340 | np.unique(edge_buckets), 341 | edge_buckets[train_idcs]) 342 | val_bin_weights = class_weight.compute_class_weight('balanced', 343 | np.unique(edge_buckets), 344 | edge_buckets[val_idcs]) 345 | test_bin_weights = class_weight.compute_class_weight('balanced', 346 | np.unique(edge_buckets), 347 | edge_buckets[test_idcs]) 348 | train_bin_weights = train_bin_weights.astype(np.float32) 349 | val_bin_weights = val_bin_weights.astype(np.float32) 350 | test_bin_weights = test_bin_weights.astype(np.float32) 351 | return edge_buckets, train_bin_weights, val_bin_weights, test_bin_weights 352 | 353 | @staticmethod 354 | def _load_flow_graph_data(data_base_path, num_nodes, num_edges, 355 | flow_adj_threshold): 356 | # Flow adjacency matrix 357 | flow_adj_indices = np.load(os.path.join(data_base_path, 358 | "flow_adj_indices.npy")).T 359 | flow_adj_values = np.load(os.path.join(data_base_path, 360 | "flow_adj_values.npy")) 361 | flow_adj_matrix = ssp.coo_matrix((flow_adj_values, 362 | (flow_adj_indices[0], 363 | flow_adj_indices[1])), 364 | shape=(num_nodes, num_nodes)) 365 | flow_adj_matrix = flow_adj_matrix.tocsr() 366 | 367 | unweighted_flow_adj_indices = flow_adj_indices[:, 368 | flow_adj_values >= flow_adj_threshold] 369 | flow_adj_values = flow_adj_values[flow_adj_values >= flow_adj_threshold] 370 | unweighted_flow_adj_matrix = ssp.coo_matrix( 371 | (flow_adj_values, 372 | (unweighted_flow_adj_indices[0], unweighted_flow_adj_indices[1])), 373 | shape=(num_nodes, num_nodes)) 374 | unweighted_flow_adj_matrix.setdiag(np.ones(num_nodes)) 375 | flow_adj_values = unweighted_flow_adj_matrix.tocoo().data 376 | flow_adj_indices = np.stack((unweighted_flow_adj_matrix.row, 377 | unweighted_flow_adj_matrix.col), axis=-1) 378 | flow_adj_indices = flow_adj_indices.astype(np.int64) 379 | flow_adj_values = flow_adj_values.astype(np.float32) 380 | unweighted_flow_adj_matrix = (unweighted_flow_adj_matrix > 0.0).astype(np.float) 381 | 382 | # Flow incidence matrix for all edges 383 | flow_inc_indices = np.load(os.path.join(data_base_path, 384 | "flow_inc_indices.npy")) 385 | flow_inc_matrix = ssp.coo_matrix( 386 | (np.ones(flow_inc_indices.shape[1]), 387 | (flow_inc_indices[0], 388 | flow_inc_indices[1])), 389 | shape=(num_nodes, num_edges)) 390 | flow_inc_matrix = flow_inc_matrix.tocsr() 391 | 392 | return (flow_adj_matrix, flow_inc_matrix, flow_adj_indices, 393 | unweighted_flow_adj_matrix, flow_adj_values) 394 | 395 | @staticmethod 396 | def _load_geo_graph_data(data_base_path, num_nodes, num_edges, 397 | flow_topology): 398 | # Geographical adjacency matrix 
399 | geo_adj_indices = np.load(os.path.join(data_base_path, 400 | "geo_adj_indices.npy")) 401 | geo_adj_matrix = ssp.coo_matrix((np.ones(geo_adj_indices.shape[1]), 402 | (geo_adj_indices[0], 403 | geo_adj_indices[1])), 404 | shape=(num_nodes, num_nodes)) 405 | geo_adj_matrix = geo_adj_matrix.tocsr() 406 | 407 | # Geographical incidence matrix for all edges 408 | geo_inc_indices = np.load(os.path.join(data_base_path, 409 | "geo_inc_indices.npy")) 410 | geo_inc_matrix = ssp.coo_matrix( 411 | (np.ones(geo_inc_indices.shape[1]), 412 | (geo_inc_indices[0], 413 | geo_inc_indices[1])), 414 | shape=(num_nodes, num_edges)) 415 | geo_inc_matrix = geo_inc_matrix.tocsr() 416 | 417 | # Get flows for the geographical edges 418 | all_edges = np.array(flow_topology.adj_matrix.todense()).reshape(-1) # N^2 matrix 419 | geo_indices_of_edges = np.array(geo_adj_matrix.todense()).reshape(-1).nonzero() # N^2 matrix 420 | geo_flows = all_edges[geo_indices_of_edges] 421 | del all_edges 422 | all_edges = None 423 | del geo_indices_of_edges 424 | geo_indices_of_edges = None 425 | geo_flows = (geo_flows+1e-5).astype(np.float32) 426 | 427 | return geo_adj_matrix, geo_inc_matrix, geo_adj_indices.T, geo_flows 428 | 429 | @staticmethod 430 | def _create_data_loaders(train_idcs, val_idcs, test_idcs, 431 | train_bin_weights, edge_indices, edge_feats, 432 | edge_labels, edge_buckets, batch_size, resample, 433 | seed): 434 | """ 435 | :param train_idcs: 436 | :param val_idcs: 437 | :param test_idcs: 438 | :param train_bin_weights: 439 | :param edge_indices: 440 | :param edge_feats: 441 | :param edge_labels: 442 | :param edge_buckets: 443 | :param flow_node_edges_matrix: Transpose of the incidence matrix 444 | for incoming edges. Shape [E, N]. 445 | :param batch_size: 446 | :param resample: 447 | :param seed: 448 | :return: 449 | """ 450 | assert (len(edge_indices) == len(edge_feats) == len(edge_labels) 451 | == len(edge_buckets)) 452 | 453 | train_idcs = torch.from_numpy(train_idcs) 454 | val_idcs = torch.from_numpy(val_idcs) 455 | test_idcs = torch.from_numpy(test_idcs) 456 | 457 | # Sample weights 458 | train_sample_weights = train_bin_weights[edge_buckets[train_idcs]] 459 | 460 | # Compute split into training, validation, and test set 461 | np.random.seed(seed) 462 | if resample: 463 | train_sampler = BatchSampler( 464 | WeightedRandomSampler(train_sample_weights, 465 | train_idcs.shape[0]), 466 | batch_size=batch_size, drop_last=False) 467 | train_loader = DataLoader(TensorDataset(edge_indices[train_idcs], 468 | edge_feats[train_idcs], 469 | edge_labels[train_idcs], 470 | edge_buckets[train_idcs]), 471 | batch_sampler=train_sampler) 472 | else: 473 | train_loader = DataLoader(TensorDataset(edge_indices[train_idcs], 474 | edge_feats[train_idcs], 475 | edge_labels[train_idcs], 476 | edge_buckets[train_idcs]), 477 | batch_size=batch_size, shuffle=False) 478 | val_loader = DataLoader(TensorDataset(edge_indices[val_idcs], 479 | edge_feats[val_idcs], 480 | edge_labels[val_idcs], 481 | edge_buckets[val_idcs]), 482 | batch_size=batch_size, shuffle=False) 483 | test_loader = DataLoader(TensorDataset(edge_indices[test_idcs], 484 | edge_feats[test_idcs], 485 | edge_labels[test_idcs], 486 | edge_buckets[test_idcs]), 487 | batch_size=batch_size, shuffle=False) 488 | return train_loader, val_loader, test_loader 489 | 490 | @staticmethod 491 | def _get_continuous_feature_idcs(df): 492 | continuous_feature_idcs = [] 493 | for idx, col in enumerate(df.columns): 494 | if len(df[col].unique()) > 2: 495 | 
continuous_feature_idcs.append(idx) 496 | return continuous_feature_idcs 497 | 498 | 499 | if __name__ == '__main__': 500 | ds = UrbanPlanningDataset(data_base_path="Data/London_high/", 501 | use_binned_transformer=True, 502 | excluded_node_feature_columns=tuple()) 503 | 504 | print("\n\nNode features") 505 | summarize_feature_matrix(ds.node_feats.numpy()) 506 | print("\n\nEdge features") 507 | summarize_feature_matrix(ds.edge_feats.numpy()) --------------------------------------------------------------------------------
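
A minimal, hypothetical end-to-end sketch of how `dataset_preparation.py` and `dataset.py` fit together (not part of the repository): it assumes the raw CSV files read by `dataset_preparation.py` are available under `../raw_data/London_high/`, and the keyword values passed below are illustrative defaults rather than the authors' settings.

```python
# Hypothetical usage sketch, not part of the repository. Assumes the raw CSVs
# expected by dataset_preparation.py exist under ../raw_data/London_high/ and
# that the Data/ directory exists; keyword values are illustrative only.
import torch

from dataset_preparation import prepare_dataset
from dataset import UrbanPlanningDataset

# One-off preprocessing: writes node_data.pk, edge_data.pk, flows.npy, the
# adjacency/incidence index files and the train/val/test edge splits into
# Data/London_high/.
prepare_dataset()

# Load the prepared files into tensors, scalers and mini-batch loaders.
ds = UrbanPlanningDataset(data_base_path="Data/London_high/",
                          num_bins=4,
                          batch_size=256,
                          n_quantiles=5000,
                          resample=True)
ds.to(torch.device("cpu"))  # or a CUDA device if available

# Each training batch yields the edge's node indices, its scaled features,
# its scaled flow label and its flow-magnitude bin.
for edge_indices, edge_feats, edge_labels, edge_buckets in ds.train_loader:
    print(edge_indices.shape, edge_feats.shape,
          edge_labels.shape, edge_buckets.shape)
    break
```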