├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── chem_tensorflow.py ├── chem_tensorflow_async.py ├── chem_tensorflow_dense.py ├── chem_tensorflow_gcn.py ├── chem_tensorflow_sparse.py ├── get_data.py ├── requirements.txt ├── utils.py └── valid_idx.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | molecules_*.json 104 | data/ 105 | logs/ 106 | .idea/ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## CONTRIBUTING 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 6 | 7 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gated Graph Neural Networks 2 | 3 | > ## This repository is not maintained anymore. An updated version of the _sparse_ codebase in this repo, together with many more GNN implementations, is available on https://github.com/microsoft/tf-gnn-samples. 4 | 5 | This repository contains two implementations of the Gated Graph Neural Networks 6 | of [Li et al. 2015](https://arxiv.org/abs/1511.05493) for learning properties of chemical molecules. 7 | The inspiration for this application comes from [Gilmer et al. 2017](https://arxiv.org/abs/1704.01212). 8 | 9 | This code was tested in Python 3.5 with TensorFlow 1.3. To run the code `docopt` is also necessary. 10 | 11 | This code was maintained by the [Deep Program Understanding](https://www.microsoft.com/en-us/research/project/program/) project at Microsoft Research, Cambridge, UK. 12 | 13 | ## Data Extraction 14 | To download the related data run `get_data.py`. It requires the python package `rdkit` within the Python package 15 | environment. For example, this can be obtained by 16 | ``` 17 | conda install -c rdkit rdkit 18 | ``` 19 | 20 | ## Running Graph Neural Network Training 21 | We provide four versions of Graph Neural Networks: Gated Graph Neural Networks (one implementation using dense 22 | adjacency matrices and a sparse variant), Asynchronous Gated Graph Neural Networks, and Graph Convolutional 23 | Networks (sparse). 24 | The dense version is faster for small or dense graphs, including the molecules dataset (though the difference is 25 | small for it). In contrast, the sparse version is faster for large and sparse graphs, especially in cases where 26 | representing a dense representation of the adjacency matrix would result in prohibitively large memory usage. 27 | Asynchronous GNNs do not propagate information from all nodes to all neighbouring nodes at each timestep; 28 | instead, they follow an update schedule such that messages are propagated in sequence. Their implementation 29 | is far more inefficient (due to the small number of updates at each step), but a single propagation round 30 | (i.e., performing each propagation step along a few edges once) can suffice to propagate messages across a 31 | large graph. 
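As a rough sketch, one GGNN propagation step computes the following (an illustrative simplification, not the code in this repository; the actual models use a GRU cell rather than the plain `tanh` update, and the asynchronous variant applies the update only along the currently scheduled edges):
```
import numpy as np

def ggnn_step(h, adj, edge_weights):
    """One dense propagation step: h is [v, d], adj is [e, v, v], edge_weights is [e, d, d]."""
    # Each edge type transforms the sender states with its own weight matrix,
    # and the corresponding adjacency matrix routes the messages to their receivers.
    incoming = sum(a @ (h @ w) for a, w in zip(adj, edge_weights))  # [v, d]
    # The node update combines the aggregated messages with the previous node state
    # (the actual implementations use a GRU cell here).
    return np.tanh(incoming + h)
```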
32 | 33 | To run dense Gated Graph Neural Networks, use 34 | ``` 35 | python3 ./chem_tensorflow_dense.py 36 | ``` 37 | 38 | To run sparse Gated Graph Neural Networks, use 39 | ``` 40 | python3 ./chem_tensorflow_sparse.py 41 | ``` 42 | 43 | To run sparse Graph Convolutional Networks (as in [Kipf et al. 2016](https://arxiv.org/abs/1609.02907)), use 44 | ``` 45 | python3 ./chem_tensorflow_gcn.py 46 | ``` 47 | 48 | Finally, it turns out that the extension of GCN to different edge types is a variant of GGNN, and you can run 49 | GCN (as in [Schlichtkrull et al. 2017](https://arxiv.org/abs/1703.06103)) by calling 50 | ``` 51 | python3 ./chem_tensorflow_sparse.py --config '{"use_edge_bias": false, "use_edge_msg_avg_aggregation": true, "residual_connections": {}, "layer_timesteps": [1,1,1,1,1,1,1,1], "graph_rnn_cell": "RNN", "graph_rnn_activation": "ReLU"}' 52 | ``` 53 | 54 | To run asynchronous Gated Graph Neural Networks, use 55 | ``` 56 | python3 ./chem_tensorflow_async.py 57 | ``` 58 | 59 | ## Restoring models 60 | 61 | Suppose you have trained a model e.g. the following trains for a single epoch: 62 | 63 | ``` 64 | python3 ./chem_tensorflow_dense.py --config '{"num_epochs": 1}' 65 | == Epoch 1 66 | Train: loss: 0.52315 | acc: 0:0.64241 | error_ratio: 0:9.65831 | instances/sec: 6758.04 67 | Valid: loss: 0.26930 | acc: 0:0.55949 | error_ratio: 0:8.41163 | instances/sec: 9902.71 68 | (Best epoch so far, cum. val. acc decreased to 0.55949 from inf. Saving to './2018-02-01-11-30-05_16306_model_best.pickle') 69 | ``` 70 | 71 | Note that a checkpoint was stored to './2018-02-01-11-30-05_16306_model_best.pickle'. To restore this model and continue training, use: 72 | ``` 73 | python3 ./chem_tensorflow_dense.py --restore ./2018-02-01-11-30-05_16306_model_best.pickle 74 | ``` 75 | 76 | 77 | 78 | 79 | ## Contributing 80 | 81 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 82 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 83 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 84 | 85 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 86 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 87 | provided by the bot. You will only need to do this once across all repos using our CLA. 88 | 89 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 90 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 91 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
92 | -------------------------------------------------------------------------------- /chem_tensorflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import time 8 | from typing import List, Any, Sequence 9 | 10 | import numpy as np 11 | import tensorflow as tf 12 | 13 | from utils import MLP, ThreadedIterator, SMALL_NUMBER 14 | 15 | 16 | class ChemModel(object): 17 | @classmethod 18 | def default_params(cls): 19 | return { 20 | 'num_epochs': 3000, 21 | 'patience': 25, 22 | 'learning_rate': 0.001, 23 | 'clamp_gradient_norm': 1.0, 24 | 'out_layer_dropout_keep_prob': 1.0, 25 | 26 | 'hidden_size': 100, 27 | 'num_timesteps': 4, 28 | 'use_graph': True, 29 | 30 | 'tie_fwd_bkwd': True, 31 | 'task_ids': [0], 32 | 33 | 'random_seed': 0, 34 | 35 | 'train_file': 'molecules_train.json', 36 | 'valid_file': 'molecules_valid.json' 37 | } 38 | 39 | def __init__(self, args): 40 | self.args = args 41 | 42 | # Collect argument things: 43 | data_dir = '' 44 | if '--data_dir' in args and args['--data_dir'] is not None: 45 | data_dir = args['--data_dir'] 46 | self.data_dir = data_dir 47 | 48 | self.run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())]) 49 | log_dir = args.get('--log_dir') or '.' 50 | os.makedirs(log_dir, exist_ok=True) 51 | self.log_file = os.path.join(log_dir, "%s_log.json" % self.run_id) 52 | self.best_model_file = os.path.join(log_dir, "%s_model_best.pickle" % self.run_id) 53 | tb_log_dir = os.path.join(log_dir, "tb", self.run_id) 54 | os.makedirs(tb_log_dir, exist_ok=True) 55 | 56 | # Collect parameters: 57 | params = self.default_params() 58 | config_file = args.get('--config-file') 59 | if config_file is not None: 60 | with open(config_file, 'r') as f: 61 | params.update(json.load(f)) 62 | config = args.get('--config') 63 | if config is not None: 64 | params.update(json.loads(config)) 65 | self.params = params 66 | with open(os.path.join(log_dir, "%s_params.json" % self.run_id), "w") as f: 67 | json.dump(params, f) 68 | print("Run %s starting with following parameters:\n%s" % (self.run_id, json.dumps(self.params))) 69 | random.seed(params['random_seed']) 70 | np.random.seed(params['random_seed']) 71 | 72 | # Load data: 73 | self.max_num_vertices = 0 74 | self.num_edge_types = 0 75 | self.annotation_size = 0 76 | self.train_data = self.load_data(params['train_file'], is_training_data=True) 77 | self.valid_data = self.load_data(params['valid_file'], is_training_data=False) 78 | 79 | # Build the actual model 80 | config = tf.ConfigProto() 81 | config.gpu_options.allow_growth = True 82 | self.graph = tf.Graph() 83 | self.sess = tf.Session(graph=self.graph, config=config) 84 | with self.graph.as_default(): 85 | tf.set_random_seed(params['random_seed']) 86 | self.placeholders = {} 87 | self.weights = {} 88 | self.ops = {} 89 | self.make_model() 90 | self.make_train_step() 91 | self.make_summaries() 92 | 93 | # Restore/initialize variables: 94 | restore_file = args.get('--restore') 95 | if restore_file is not None: 96 | self.train_step_id, self.valid_step_id = self.restore_progress(restore_file) 97 | else: 98 | self.initialize_model() 99 | self.train_step_id = 0 100 | self.valid_step_id = 0 101 | self.train_writer = tf.summary.FileWriter(os.path.join(tb_log_dir, 'train'), graph=self.graph) 102 | self.valid_writer = tf.summary.FileWriter(os.path.join(tb_log_dir, 'validation'), graph=self.graph) 103 | 104 | def load_data(self, file_name, 
is_training_data: bool): 105 | full_path = os.path.join(self.data_dir, file_name) 106 | 107 | print("Loading data from %s" % full_path) 108 | with open(full_path, 'r') as f: 109 | data = json.load(f) 110 | 111 | restrict = self.args.get("--restrict_data") 112 | if restrict is not None and restrict > 0: 113 | data = data[:restrict] 114 | 115 | # Get some common data out: 116 | num_fwd_edge_types = 0 117 | for g in data: 118 | self.max_num_vertices = max(self.max_num_vertices, max([v for e in g['graph'] for v in [e[0], e[2]]])) 119 | num_fwd_edge_types = max(num_fwd_edge_types, max([e[1] for e in g['graph']])) 120 | self.num_edge_types = max(self.num_edge_types, num_fwd_edge_types * (1 if self.params['tie_fwd_bkwd'] else 2)) 121 | self.annotation_size = max(self.annotation_size, len(data[0]["node_features"][0])) 122 | 123 | return self.process_raw_graphs(data, is_training_data) 124 | 125 | @staticmethod 126 | def graph_string_to_array(graph_string: str) -> List[List[int]]: 127 | return [[int(v) for v in s.split(' ')] 128 | for s in graph_string.split('\n')] 129 | 130 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 131 | raise Exception("Models have to implement process_raw_graphs!") 132 | 133 | def make_model(self): 134 | self.placeholders['target_values'] = tf.placeholder(tf.float32, [len(self.params['task_ids']), None], 135 | name='target_values') 136 | self.placeholders['target_mask'] = tf.placeholder(tf.float32, [len(self.params['task_ids']), None], 137 | name='target_mask') 138 | self.placeholders['num_graphs'] = tf.placeholder(tf.int32, [], name='num_graphs') 139 | self.placeholders['out_layer_dropout_keep_prob'] = tf.placeholder(tf.float32, [], name='out_layer_dropout_keep_prob') 140 | 141 | with tf.variable_scope("graph_model"): 142 | self.prepare_specific_graph_model() 143 | # This does the actual graph work: 144 | if self.params['use_graph']: 145 | self.ops['final_node_representations'] = self.compute_final_node_representations() 146 | else: 147 | self.ops['final_node_representations'] = tf.zeros_like(self.placeholders['initial_node_representation']) 148 | 149 | self.ops['losses'] = [] 150 | for (internal_id, task_id) in enumerate(self.params['task_ids']): 151 | with tf.variable_scope("out_layer_task%i" % task_id): 152 | with tf.variable_scope("regression_gate"): 153 | self.weights['regression_gate_task%i' % task_id] = MLP(2 * self.params['hidden_size'], 1, [], 154 | self.placeholders['out_layer_dropout_keep_prob']) 155 | with tf.variable_scope("regression"): 156 | self.weights['regression_transform_task%i' % task_id] = MLP(self.params['hidden_size'], 1, [], 157 | self.placeholders['out_layer_dropout_keep_prob']) 158 | computed_values = self.gated_regression(self.ops['final_node_representations'], 159 | self.weights['regression_gate_task%i' % task_id], 160 | self.weights['regression_transform_task%i' % task_id]) 161 | diff = computed_values - self.placeholders['target_values'][internal_id, :] 162 | task_target_mask = self.placeholders['target_mask'][internal_id, :] 163 | task_target_num = tf.reduce_sum(task_target_mask) + SMALL_NUMBER 164 | diff = diff * task_target_mask # Mask out unused values 165 | self.ops['accuracy_task%i' % task_id] = tf.reduce_sum(tf.abs(diff)) / task_target_num 166 | task_loss = tf.reduce_sum(0.5 * tf.square(diff)) / task_target_num 167 | # Normalise loss to account for fewer task-specific examples in batch: 168 | task_loss = task_loss * (1.0 / (self.params['task_sample_ratios'].get(task_id) or 1.0)) 169 | 
self.ops['losses'].append(task_loss) 170 | self.ops['loss'] = tf.reduce_sum(self.ops['losses']) 171 | 172 | def make_train_step(self): 173 | trainable_vars = self.sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 174 | if self.args.get('--freeze-graph-model'): 175 | graph_vars = set(self.sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="graph_model")) 176 | filtered_vars = [] 177 | for var in trainable_vars: 178 | if var not in graph_vars: 179 | filtered_vars.append(var) 180 | else: 181 | print("Freezing weights of variable %s." % var.name) 182 | trainable_vars = filtered_vars 183 | optimizer = tf.train.AdamOptimizer(self.params['learning_rate']) 184 | grads_and_vars = optimizer.compute_gradients(self.ops['loss'], var_list=trainable_vars) 185 | clipped_grads = [] 186 | for grad, var in grads_and_vars: 187 | if grad is not None: 188 | clipped_grads.append((tf.clip_by_norm(grad, self.params['clamp_gradient_norm']), var)) 189 | else: 190 | clipped_grads.append((grad, var)) 191 | self.ops['train_step'] = optimizer.apply_gradients(clipped_grads) 192 | # Initialize newly-introduced variables: 193 | self.sess.run(tf.local_variables_initializer()) 194 | 195 | def make_summaries(self): 196 | with tf.name_scope('summary'): 197 | tf.summary.scalar('loss', self.ops['loss']) 198 | for task_id in self.params['task_ids']: 199 | tf.summary.scalar('accuracy%i' % task_id, self.ops['accuracy_task%i' % task_id]) 200 | self.ops['summary'] = tf.summary.merge_all() 201 | 202 | def gated_regression(self, last_h, regression_gate, regression_transform): 203 | raise Exception("Models have to implement gated_regression!") 204 | 205 | def prepare_specific_graph_model(self) -> None: 206 | raise Exception("Models have to implement prepare_specific_graph_model!") 207 | 208 | def compute_final_node_representations(self) -> tf.Tensor: 209 | raise Exception("Models have to implement compute_final_node_representations!") 210 | 211 | def make_minibatch_iterator(self, data: Any, is_training: bool): 212 | raise Exception("Models have to implement make_minibatch_iterator!") 213 | 214 | def run_epoch(self, epoch_name: str, data, is_training: bool, start_step: int = 0): 215 | chemical_accuracies = np.array([0.066513725, 0.012235489, 0.071939046, 0.033730778, 0.033486113, 0.004278493, 216 | 0.001330901, 0.004165489, 0.004128926, 0.00409976, 0.004527465, 0.012292586, 217 | 0.037467458]) 218 | 219 | loss = 0 220 | accuracies = [] 221 | accuracy_ops = [self.ops['accuracy_task%i' % task_id] for task_id in self.params['task_ids']] 222 | start_time = time.time() 223 | processed_graphs = 0 224 | steps = 0 225 | batch_iterator = ThreadedIterator(self.make_minibatch_iterator(data, is_training), max_queue_size=5) 226 | for step, batch_data in enumerate(batch_iterator): 227 | num_graphs = batch_data[self.placeholders['num_graphs']] 228 | processed_graphs += num_graphs 229 | if is_training: 230 | batch_data[self.placeholders['out_layer_dropout_keep_prob']] = self.params['out_layer_dropout_keep_prob'] 231 | fetch_list = [self.ops['loss'], accuracy_ops, self.ops['summary'], self.ops['train_step']] 232 | else: 233 | batch_data[self.placeholders['out_layer_dropout_keep_prob']] = 1.0 234 | fetch_list = [self.ops['loss'], accuracy_ops, self.ops['summary']] 235 | result = self.sess.run(fetch_list, feed_dict=batch_data) 236 | (batch_loss, batch_accuracies, batch_summary) = (result[0], result[1], result[2]) 237 | writer = self.train_writer if is_training else self.valid_writer 238 | writer.add_summary(batch_summary, 
start_step + step) 239 | loss += batch_loss * num_graphs 240 | accuracies.append(np.array(batch_accuracies) * num_graphs) 241 | 242 | print("Running %s, batch %i (has %i graphs). Loss so far: %.4f" % (epoch_name, 243 | step, 244 | num_graphs, 245 | loss / processed_graphs), 246 | end='\r') 247 | steps += 1 248 | 249 | accuracies = np.sum(accuracies, axis=0) / processed_graphs 250 | loss = loss / processed_graphs 251 | error_ratios = accuracies / chemical_accuracies[self.params["task_ids"]] 252 | instance_per_sec = processed_graphs / (time.time() - start_time) 253 | return loss, accuracies, error_ratios, instance_per_sec, steps 254 | 255 | def train(self): 256 | log_to_save = [] 257 | total_time_start = time.time() 258 | with self.graph.as_default(): 259 | if self.args.get('--restore') is not None: 260 | _, valid_accs, _, _, steps = self.run_epoch("Resumed (validation)", self.valid_data, False) 261 | best_val_acc = np.sum(valid_accs) 262 | best_val_acc_epoch = 0 263 | print("\r\x1b[KResumed operation, initial cum. val. acc: %.5f" % best_val_acc) 264 | else: 265 | (best_val_acc, best_val_acc_epoch) = (float("+inf"), 0) 266 | for epoch in range(1, self.params['num_epochs'] + 1): 267 | print("== Epoch %i" % epoch) 268 | train_loss, train_accs, train_errs, train_speed, train_steps = self.run_epoch("epoch %i (training)" % epoch, 269 | self.train_data, True, self.train_step_id) 270 | self.train_step_id += train_steps 271 | accs_str = " ".join(["%i:%.5f" % (id, acc) for (id, acc) in zip(self.params['task_ids'], train_accs)]) 272 | errs_str = " ".join(["%i:%.5f" % (id, err) for (id, err) in zip(self.params['task_ids'], train_errs)]) 273 | print("\r\x1b[K Train: loss: %.5f | acc: %s | error_ratio: %s | instances/sec: %.2f" % (train_loss, 274 | accs_str, 275 | errs_str, 276 | train_speed)) 277 | valid_loss, valid_accs, valid_errs, valid_speed, valid_steps = self.run_epoch("epoch %i (validation)" % epoch, 278 | self.valid_data, False, self.valid_step_id) 279 | self.valid_step_id += valid_steps 280 | accs_str = " ".join(["%i:%.5f" % (id, acc) for (id, acc) in zip(self.params['task_ids'], valid_accs)]) 281 | errs_str = " ".join(["%i:%.5f" % (id, err) for (id, err) in zip(self.params['task_ids'], valid_errs)]) 282 | print("\r\x1b[K Valid: loss: %.5f | acc: %s | error_ratio: %s | instances/sec: %.2f" % (valid_loss, 283 | accs_str, 284 | errs_str, 285 | valid_speed)) 286 | 287 | epoch_time = time.time() - total_time_start 288 | log_entry = { 289 | 'epoch': epoch, 290 | 'time': epoch_time, 291 | 'train_results': (train_loss, train_accs.tolist(), train_errs.tolist(), train_speed), 292 | 'valid_results': (valid_loss, valid_accs.tolist(), valid_errs.tolist(), valid_speed), 293 | } 294 | log_to_save.append(log_entry) 295 | with open(self.log_file, 'w') as f: 296 | json.dump(log_to_save, f, indent=4) 297 | 298 | val_acc = np.sum(valid_accs) # type: float 299 | if val_acc < best_val_acc: 300 | self.save_progress(self.best_model_file, self.train_step_id, self.valid_step_id) 301 | print(" (Best epoch so far, cum. val. acc decreased to %.5f from %.5f. Saving to '%s')" % ( 302 | val_acc, best_val_acc, self.best_model_file)) 303 | best_val_acc = val_acc 304 | best_val_acc_epoch = epoch 305 | elif epoch - best_val_acc_epoch >= self.params['patience']: 306 | print("Stopping training after %i epochs without improvement on validation accuracy." 
% self.params['patience']) 307 | break 308 | 309 | def save_progress(self, model_path: str, train_step: int, valid_step: int) -> None: 310 | weights_to_save = {} 311 | for variable in self.sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES): 312 | assert variable.name not in weights_to_save 313 | weights_to_save[variable.name] = self.sess.run(variable) 314 | 315 | data_to_save = { 316 | "params": self.params, 317 | "weights": weights_to_save, 318 | "train_step": train_step, 319 | "valid_step": valid_step, 320 | } 321 | 322 | with open(model_path, 'wb') as out_file: 323 | pickle.dump(data_to_save, out_file, pickle.HIGHEST_PROTOCOL) 324 | 325 | def initialize_model(self) -> None: 326 | init_op = tf.group(tf.global_variables_initializer(), 327 | tf.local_variables_initializer()) 328 | self.sess.run(init_op) 329 | 330 | def restore_progress(self, model_path: str) -> (int, int): 331 | print("Restoring weights from file %s." % model_path) 332 | with open(model_path, 'rb') as in_file: 333 | data_to_load = pickle.load(in_file) 334 | 335 | # Assert that we got the same model configuration 336 | assert len(self.params) == len(data_to_load['params']) 337 | for (par, par_value) in self.params.items(): 338 | # Fine to have different task_ids: 339 | if par not in ['task_ids', 'num_epochs']: 340 | assert par_value == data_to_load['params'][par] 341 | 342 | variables_to_initialize = [] 343 | with tf.name_scope("restore"): 344 | restore_ops = [] 345 | used_vars = set() 346 | for variable in self.sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES): 347 | used_vars.add(variable.name) 348 | if variable.name in data_to_load['weights']: 349 | restore_ops.append(variable.assign(data_to_load['weights'][variable.name])) 350 | else: 351 | print('Freshly initializing %s since no saved value was found.' % variable.name) 352 | variables_to_initialize.append(variable) 353 | for var_name in data_to_load['weights']: 354 | if var_name not in used_vars: 355 | print('Saved weights for %s not used by model.' % var_name) 356 | restore_ops.append(tf.variables_initializer(variables_to_initialize)) 357 | self.sess.run(restore_ops) 358 | 359 | return data_to_load['train_step'], data_to_load['valid_step'] 360 | -------------------------------------------------------------------------------- /chem_tensorflow_async.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | """ 3 | Usage: 4 | chem_tensorflow_async.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format). 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format). 10 | --log_dir DIR Log dir name. 11 | --data_dir DIR Data dir name. 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 
14 | """ 15 | from typing import List, Tuple, Dict, Sequence, Any 16 | 17 | from docopt import docopt 18 | from collections import defaultdict 19 | import numpy as np 20 | import tensorflow as tf 21 | import sys, traceback 22 | import pdb 23 | 24 | from chem_tensorflow import ChemModel 25 | from utils import glorot_init, SMALL_NUMBER 26 | 27 | 28 | def bfs_visit(outgoing_edges: Dict[int, Sequence[int]], node_depths: Dict[int, int], v: int, depth: int): 29 | # Already seen, skip: 30 | if v in node_depths: 31 | return 32 | node_depths[v] = depth 33 | for (_, __, w) in outgoing_edges[v]: 34 | bfs_visit(outgoing_edges, node_depths, w, depth + 1) 35 | 36 | 37 | class AsyncGGNNChemModel(ChemModel): 38 | def __init__(self, args): 39 | super().__init__(args) 40 | 41 | @classmethod 42 | def default_params(cls): 43 | params = dict(super().default_params()) 44 | params.update({ 45 | 'num_nodes': 100000, 46 | 'use_edge_bias': False, 47 | 48 | 'propagation_rounds': 4, # Has to be an even number 49 | 'propagation_substeps': 15, 50 | 51 | 'graph_rnn_cell': 'GRU', # GRU or RNN 52 | 'graph_rnn_activation': 'tanh', # tanh, ReLU 53 | 'graph_state_dropout_keep_prob': 1., 54 | 55 | 'task_sample_ratios': {}, 56 | }) 57 | return params 58 | 59 | def prepare_specific_graph_model(self) -> None: 60 | h_dim = self.params['hidden_size'] 61 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, [None, h_dim], 62 | name='node_features') 63 | 64 | # Initial nodes I_{r}: Node IDs that will have no incoming edges in round r. 65 | self.placeholders['initial_nodes'] = [tf.placeholder(tf.int32, [None], name="initial_nodes_round%i" % prop_round) 66 | for prop_round in range(self.params['propagation_rounds'])] 67 | 68 | # Sending nodes S_{r,s,e}: Source node ids of edges propagating in step s of round r. 69 | # Restrictions: If v in S_{r,s,e}, then v in R_{r,s'} for s' < s or v in I_{r} 70 | self.placeholders['sending_nodes'] = [[[tf.placeholder(tf.int32, 71 | [None], 72 | name="sending_nodes_round%i_step%i_edgetyp%i" % (prop_round, step, edge_typ)) 73 | for edge_typ in range(self.num_edge_types)] 74 | for step in range(self.params['propagation_substeps'])] 75 | for prop_round in range(self.params['propagation_rounds'])] 76 | 77 | # Normalised edge target nodes T_{r,s}: Targets of edges propagating in step s of round r, normalised to a 78 | # continuous range starting from 0. This is used for aggregating messages from the sending nodes. 79 | self.placeholders['msg_targets'] = [[tf.placeholder(tf.int32, 80 | [None], 81 | name="msg_targets_nodes_round%i_step%i" % (prop_round, step)) 82 | for step in range(self.params['propagation_substeps'])] 83 | for prop_round in range(self.params['propagation_rounds'])] 84 | 85 | 86 | # Receiving nodes R_{r,s}: Target node ids of aggregated messages in propagation step s of round r. 
87 | # Restrictions: If v in R_{r,s}, v not in R_{r,s'} for all s' != s and v not in I_{r} 88 | self.placeholders['receiving_nodes'] = [[tf.placeholder(tf.int32, 89 | [None], 90 | name="receiving_nodes_round%i_step%i" % (prop_round, step)) 91 | for step in range(self.params['propagation_substeps'])] 92 | for prop_round in range(self.params['propagation_rounds'])] 93 | 94 | # Number of receiving nodes N_{r,s} 95 | # Restrictions: N_{r,s} = len(R_{r,s}) 96 | self.placeholders['receiving_node_num'] = [tf.placeholder(tf.int32, 97 | [self.params['propagation_substeps']], 98 | name="receiving_nodes_num_round%i" % (prop_round,)) 99 | for prop_round in range(self.params['propagation_rounds'])] 100 | 101 | self.placeholders['graph_nodes_list'] = tf.placeholder(tf.int32, [None], name='graph_nodes_list') 102 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 103 | 104 | activation_name = self.params['graph_rnn_activation'].lower() 105 | if activation_name == 'tanh': 106 | activation_fun = tf.nn.tanh 107 | elif activation_name == 'relu': 108 | activation_fun = tf.nn.relu 109 | else: 110 | raise Exception("Unknown activation function type '%s'." % activation_name) 111 | 112 | # Generate per-layer values for edge weights, biases and gated units. If we tie them, they are just copies: 113 | self.weights['edge_weights'] = [tf.Variable(glorot_init([h_dim, h_dim]), name='gnn_edge_weights_typ%i' % e_typ) 114 | for e_typ in range(self.num_edge_types)] 115 | 116 | if self.params['use_edge_bias']: 117 | self.weights['edge_biases'] = [tf.Variable(np.zeros([h_dim], dtype=np.float32), name='gnn_edge_biases_typ%i' % e_typ) 118 | for e_typ in range(self.num_edge_types)] 119 | 120 | cell_type = self.params['graph_rnn_cell'].lower() 121 | if cell_type == 'gru': 122 | cell = tf.nn.rnn_cell.GRUCell(h_dim, activation=activation_fun) 123 | elif cell_type == 'rnn': 124 | cell = tf.nn.rnn_cell.BasicRNNCell(h_dim, activation=activation_fun) 125 | else: 126 | raise Exception("Unknown RNN cell type '%s'." 
% cell_type) 127 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 128 | state_keep_prob=self.placeholders['graph_state_keep_prob']) 129 | self.weights['rnn_cells'] = cell 130 | 131 | def compute_final_node_representations(self) -> tf.Tensor: 132 | cur_node_states = self.placeholders['initial_node_representation'] 133 | 134 | for prop_round in range(self.params['propagation_rounds']): 135 | with tf.variable_scope('prop_round%i' % (prop_round,)): 136 | # ---- Declare and fill tensor arrays used in tf.while_loop: 137 | sending_nodes_ta = tf.TensorArray(tf.int32, 138 | infer_shape=False, 139 | element_shape=[None], 140 | size=self.params['propagation_substeps'] * self.num_edge_types, 141 | name='sending_nodes') 142 | msg_targets_ta = tf.TensorArray(tf.int32, 143 | infer_shape=False, 144 | element_shape=[None], 145 | size=self.params['propagation_substeps'], 146 | name='msg_targets') 147 | receiving_nodes_ta = tf.TensorArray(tf.int32, 148 | infer_shape=False, 149 | element_shape=[None], 150 | size=self.params['propagation_substeps'], 151 | clear_after_read=False, 152 | name='receiving_nodes') 153 | receiving_node_num_ta = tf.TensorArray(tf.int32, 154 | infer_shape=False, 155 | element_shape=[], 156 | size=self.params['propagation_substeps'], 157 | name='receiving_nodes_num') 158 | 159 | for step in range(self.params['propagation_substeps']): 160 | for edge_typ in range(self.num_edge_types): 161 | sending_nodes_ta = sending_nodes_ta.write(step * self.num_edge_types + edge_typ, 162 | self.placeholders['sending_nodes'][prop_round][step][edge_typ]) 163 | msg_targets_ta = msg_targets_ta.write(step, self.placeholders['msg_targets'][prop_round][step]) 164 | receiving_nodes_ta = receiving_nodes_ta.write(step, self.placeholders['receiving_nodes'][prop_round][step]) 165 | receiving_node_num_ta = receiving_node_num_ta.unstack(self.placeholders['receiving_node_num'][prop_round]) 166 | 167 | new_node_states_ta = tf.TensorArray(tf.float32, 168 | infer_shape=False, 169 | element_shape=[self.params['hidden_size']], 170 | size=tf.shape(cur_node_states)[0], 171 | clear_after_read=False, 172 | name='new_node_states') 173 | 174 | # ---- Actual propagation schedule implementation: 175 | # Initialize the initial nodes with their state from last round: 176 | new_node_states_ta = new_node_states_ta.scatter(self.placeholders['initial_nodes'][prop_round], 177 | tf.gather(cur_node_states, self.placeholders['initial_nodes'][prop_round])) 178 | 179 | def do_substep(substep_id, new_node_states_ta): 180 | # For each edge active in this substep, pull source state and transform: 181 | sent_messages = [] 182 | for edge_typ in range(self.num_edge_types): 183 | sending_states = new_node_states_ta.gather(sending_nodes_ta.read(substep_id * self.num_edge_types + edge_typ)) 184 | messages = tf.matmul(sending_states, self.weights['edge_weights'][edge_typ]) 185 | if self.params['use_edge_bias']: 186 | messages += self.weights['edge_biases'][edge_typ] 187 | sent_messages.append(messages) 188 | 189 | # Stack all edge messages and aggregate as sum for each receiving node: 190 | sent_messages = tf.concat(sent_messages, axis=0) 191 | aggregated_received_messages = tf.unsorted_segment_sum(sent_messages, 192 | msg_targets_ta.read(substep_id), 193 | receiving_node_num_ta.read(substep_id)) 194 | 195 | # Collect old states for receiving nodes, and combine in RNN cell with incoming message 196 | substep_receiving_nodes = receiving_nodes_ta.read(substep_id) 197 | old_receiving_node_states = tf.gather(cur_node_states, substep_receiving_nodes) 198 | 
aggregated_received_messages.set_shape([None, self.params['hidden_size']]) 199 | old_receiving_node_states.set_shape([None, self.params['hidden_size']]) 200 | substep_new_node_states = self.weights['rnn_cells'](aggregated_received_messages, 201 | old_receiving_node_states)[1] 202 | 203 | # Write updated states back: 204 | new_node_states_ta = new_node_states_ta.scatter(substep_receiving_nodes, substep_new_node_states) 205 | return (substep_id + 1, new_node_states_ta) 206 | 207 | def is_done(substep_id, new_node_states_ta_unused): 208 | return tf.logical_and(substep_id < self.params['propagation_substeps'], 209 | tf.greater(tf.shape(receiving_nodes_ta.read(substep_id))[0], 0)) 210 | 211 | _, new_node_states_ta = tf.while_loop(cond=is_done, 212 | body=do_substep, 213 | loop_vars=[tf.constant(0), new_node_states_ta] 214 | ) 215 | 216 | cur_node_states = new_node_states_ta.stack(name="state_stack_round%i" % (prop_round,)) 217 | 218 | return cur_node_states 219 | 220 | def gated_regression(self, last_h, regression_gate, regression_transform): 221 | # last_h: [v x h] 222 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis=-1) # [v x 2h] 223 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [v x 1] 224 | 225 | # Sum up all nodes per graph 226 | graph_representations = tf.unsorted_segment_sum(data=gated_outputs, 227 | segment_ids=self.placeholders['graph_nodes_list'], 228 | num_segments=self.placeholders['num_graphs']) # [g x 1] 229 | return tf.squeeze(graph_representations) # [g] 230 | 231 | # ----- Data preprocessing and chunking into minibatches: 232 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 233 | processed_graphs = [] 234 | for d in raw_data: 235 | prop_schedules = self.__graph_to_propagation_schedules(d['graph']) 236 | processed_graphs.append({"init": d["node_features"], 237 | "prop_schedules": prop_schedules, 238 | "target_values": [d["targets"][task_id][0] for task_id in self.params['task_ids']]}) 239 | 240 | if is_training_data: 241 | np.random.shuffle(processed_graphs) 242 | for task_id in self.params['task_ids']: 243 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 244 | if task_sample_ratio is not None: 245 | ex_to_sample = int(len(processed_graphs) * task_sample_ratio) 246 | for ex_id in range(ex_to_sample, len(processed_graphs)): 247 | processed_graphs[ex_id]['target_values'][task_id] = None 248 | 249 | return processed_graphs 250 | 251 | def __tensorise_edge_sequence(self, edges)\ 252 | -> Tuple[np.ndarray, List[List[np.ndarray]], List[List[np.ndarray]], List[np.ndarray]]: 253 | sending_nodes = [] # type: List[List[np.ndarray]] 254 | msg_targets = [] # type: List[List[np.ndarray]] 255 | receiving_nodes = [] # type: List[np.ndarray] 256 | all_nodes = set() 257 | for step_edges in edges: 258 | msg_targets_uniq = set(w for (_, __, w) in step_edges) 259 | recv_nodes = list(sorted(msg_targets_uniq)) 260 | recv_nodes_to_uniq_id = {v: i for (i, v) in enumerate(recv_nodes)} 261 | 262 | sending_nodes_in_step = [] 263 | msg_targets_in_step = [] 264 | for target_e_typ in range(self.num_edge_types): 265 | sending_nodes_in_step.append(np.array([v for (v, e_typ, _) in step_edges if e_typ == target_e_typ], dtype=np.int32)) 266 | msg_targets_in_step.append(np.array([recv_nodes_to_uniq_id[w] for (_, e_typ, w) in step_edges if e_typ == target_e_typ], dtype=np.int32)) 267 | msg_targets.append(msg_targets_in_step) 268 | 
sending_nodes.append(sending_nodes_in_step) 269 | receiving_nodes.append(np.array(recv_nodes, dtype=np.int32)) 270 | all_nodes.update(v for (v, _, __) in step_edges) 271 | all_nodes.update(w for (_, __, w) in step_edges) 272 | 273 | all_updated_nodes = set() 274 | all_updated_nodes.update(v for step_receiving_nodes in receiving_nodes 275 | for v in step_receiving_nodes) 276 | initial_nodes = list(sorted(all_nodes - all_updated_nodes)) 277 | 278 | #initialised_nodes = set() 279 | #initialised_nodes.update(initial_nodes) 280 | #for step in range(len(receiving_nodes)): 281 | # sent_nodes = set() 282 | # for edge_typ in range(self.num_edge_types): 283 | # sent_nodes.update(sending_nodes[step][edge_typ]) 284 | # for v in sent_nodes: 285 | # assert v in initialised_nodes 286 | # 287 | # for v in receiving_nodes[step]: 288 | # assert v not in initialised_nodes 289 | # initialised_nodes.update(receiving_nodes[step]) 290 | 291 | return (np.array(initial_nodes, dtype=np.int32), sending_nodes, msg_targets, receiving_nodes) 292 | 293 | def __graph_to_propagation_schedules(self, graph)\ 294 | -> List[Tuple[np.ndarray, List[List[np.ndarray]], List[List[np.ndarray]], List[np.ndarray]]]: 295 | num_incoming_edges = defaultdict(lambda: 0) 296 | outgoing_edges = defaultdict(lambda: []) 297 | # Compute number of incoming edges per node, and build adjacency lists: 298 | for (v, typ, w) in graph: 299 | num_incoming_edges[v] += 1 300 | num_incoming_edges[w] += 1 301 | edge_bwd_typ = typ if self.params['tie_fwd_bkwd'] else self.num_edge_types + typ 302 | outgoing_edges[v].append((v, typ, w)) 303 | outgoing_edges[w].append((w, edge_bwd_typ, v)) 304 | 305 | # Sort them, pick node with lowest number of incoming edges: 306 | tensorised_prop_schedules = [] 307 | for prop_round in range(int(self.params['propagation_rounds'] / 2)): 308 | dag_seed = min(num_incoming_edges.items(), key=lambda t: t[1])[prop_round] 309 | node_depths = {} 310 | bfs_visit(outgoing_edges, node_depths, dag_seed, 0) 311 | 312 | # Now split edges into forward/backward sets, by using their depths. 313 | # Intuitively, a node with depth h will get updated in step h. 314 | max_depth = max(node_depths.values()) 315 | assert(max_depth <= self.params['propagation_substeps']) 316 | fwd_pass_edges = [[] for _ in range(max_depth)] 317 | bwd_pass_edges = [[] for _ in range(max_depth)] 318 | for (v, typ, w) in graph: 319 | edge_bwd_type = typ if self.params['tie_fwd_bkwd'] else self.num_edge_types + typ 320 | v_depth = node_depths[v] 321 | w_depth = node_depths[w] 322 | if v_depth < w_depth: # "Forward": We are going up in depth: 323 | fwd_pass_edges[w_depth - 1].append((v, typ, w)) 324 | bwd_pass_edges[-v_depth - 1].append((w, edge_bwd_type, v)) 325 | elif w_depth < v_depth: # "Backward": We are going down in depth 326 | fwd_pass_edges[v_depth - 1].append((w, edge_bwd_type, v)) 327 | bwd_pass_edges[-w_depth - 1].append((v, typ, w)) 328 | else: 329 | # We ignore self-loops: 330 | assert v == w 331 | 332 | tensorised_prop_schedules.append(self.__tensorise_edge_sequence(fwd_pass_edges)) 333 | tensorised_prop_schedules.append(self.__tensorise_edge_sequence(bwd_pass_edges)) 334 | 335 | return tensorised_prop_schedules 336 | 337 | def make_minibatch_iterator(self, data: Any, is_training: bool): 338 | """Create minibatches by flattening graphs into a single one with multiple disconnected components.""" 339 | if is_training: 340 | np.random.shuffle(data) 341 | dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 
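        # Batching overview (summary of the loop below): graphs are packed into one flattened batch
        # until the next graph would no longer fit under the 'num_nodes' node budget. Per-graph node
        # indices are shifted by node_offset so that all graphs in the batch share a single node-ID
        # space, turning the batch into one large graph with disconnected components.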
342 | 343 | # Pack until we cannot fit more graphs in the batch 344 | num_graphs = 0 345 | while num_graphs < len(data): 346 | num_graphs_in_batch = 0 347 | batch_node_features = [] 348 | batch_target_task_values = [] 349 | batch_target_task_mask = [] 350 | batch_graph_nodes_list = [] 351 | node_offset = 0 352 | 353 | # Collect all indices; we'll strip out the batch dimension with a np.concatenate along that axis at the end: 354 | batch_initial_nodes = [[] for _ in range(self.params['propagation_rounds']) 355 | ] # type: List[List[np.ndarray]] # (prop_round, batch, None) 356 | batch_sending_nodes = [[[[] for _ in range(self.num_edge_types)] 357 | for _ in range(self.params['propagation_substeps'])] 358 | for _ in range(self.params['propagation_rounds']) 359 | ] # type: List[List[List[List[np.ndarray]]]] # (prop_round, step, edge_typ, batch, None) 360 | batch_msg_targets = [[[[] for _ in range(self.num_edge_types)] 361 | for _ in range(self.params['propagation_substeps'])] 362 | for _ in range(self.params['propagation_rounds']) 363 | ] # type: List[List[List[List[np.ndarray]]]] # (prop_round, step, edge_typ, batch, None) 364 | batch_receiving_nodes = [[[] for _ in range(self.params['propagation_substeps'])] 365 | for _ in range(self.params['propagation_rounds']) 366 | ] # type: List[List[List[np.ndarray]]] # (prop_round, step, batch, None) 367 | batch_receiving_node_num = [[0 for _ in range(self.params['propagation_substeps'])] 368 | for _ in range(self.params['propagation_rounds']) 369 | ] # type: List[List[int]] # (prop_round, step) 370 | 371 | msg_target_offsets = [[[0 for _ in range(self.num_edge_types)] 372 | for _ in range(self.params['propagation_substeps'])] 373 | for _ in range(self.params['propagation_rounds'])] 374 | 375 | while num_graphs < len(data) and node_offset + len(data[num_graphs]['init']) < self.params['num_nodes']: 376 | cur_graph = data[num_graphs] 377 | num_nodes_in_graph = len(cur_graph['init']) 378 | padded_features = np.pad(cur_graph['init'], 379 | ((0, 0), (0, self.params['hidden_size'] - self.annotation_size)), 380 | 'constant') 381 | batch_node_features.extend(padded_features) 382 | batch_graph_nodes_list.append(np.full(shape=[num_nodes_in_graph], fill_value=num_graphs_in_batch, dtype=np.int32)) 383 | 384 | # Combine the different propagation schedules: 385 | for prop_round in range(self.params['propagation_rounds']): 386 | cur_prop_schedule = cur_graph['prop_schedules'][prop_round] 387 | (graph_initial_nodes, 388 | graph_sending_nodes, 389 | graph_msg_targets, 390 | graph_recv_nodes) = cur_prop_schedule 391 | batch_initial_nodes[prop_round].append(graph_initial_nodes + node_offset) 392 | for step in range(self.params['propagation_substeps']): 393 | # Stop if we don't have that many steps: 394 | if step >= len(graph_sending_nodes): 395 | break 396 | 397 | for e_typ in range(self.num_edge_types): 398 | batch_sending_nodes[prop_round][step][e_typ].append(graph_sending_nodes[step][e_typ] + node_offset) 399 | batch_msg_targets[prop_round][step][e_typ].append(graph_msg_targets[step][e_typ] + msg_target_offsets[prop_round][step][e_typ]) 400 | if len(graph_msg_targets[step][e_typ]) > 0: 401 | msg_target_offsets[prop_round][step][e_typ] += max(graph_msg_targets[step][e_typ]) + 1 # ... 0-based indexing! 
402 | batch_receiving_nodes[prop_round][step].append(graph_recv_nodes[step] + node_offset) 403 | batch_receiving_node_num[prop_round][step] += len(graph_recv_nodes[step]) 404 | 405 | target_task_values = [] 406 | target_task_mask = [] 407 | for target_val in cur_graph['target_values']: 408 | if target_val is None: # This is one of the examples we didn't sample... 409 | target_task_values.append(0.) 410 | target_task_mask.append(0.) 411 | else: 412 | target_task_values.append(target_val) 413 | target_task_mask.append(1.) 414 | batch_target_task_values.append(target_task_values) 415 | batch_target_task_mask.append(target_task_mask) 416 | num_graphs += 1 417 | num_graphs_in_batch += 1 418 | node_offset += num_nodes_in_graph 419 | 420 | batch_feed_dict = { 421 | self.placeholders['initial_node_representation']: np.array(batch_node_features), 422 | self.placeholders['graph_nodes_list']: np.concatenate(batch_graph_nodes_list, axis=0), 423 | self.placeholders['target_values']: np.transpose(batch_target_task_values, axes=[1,0]), 424 | self.placeholders['target_mask']: np.transpose(batch_target_task_mask, axes=[1, 0]), 425 | self.placeholders['num_graphs']: num_graphs_in_batch, 426 | self.placeholders['graph_state_keep_prob']: dropout_keep_prob, 427 | } 428 | 429 | for prop_round in range(self.params['propagation_rounds']): 430 | batch_feed_dict[self.placeholders['initial_nodes'][prop_round]] = \ 431 | np.concatenate(batch_initial_nodes[prop_round], axis=0) 432 | for step in range(self.params['propagation_substeps']): 433 | msg_targets = [] 434 | for edge_typ in range(self.num_edge_types): 435 | raw_senders = batch_sending_nodes[prop_round][step][edge_typ] 436 | batch_feed_dict[self.placeholders['sending_nodes'][prop_round][step][edge_typ]] = \ 437 | np.concatenate(raw_senders, axis=0) if len(raw_senders) > 0 else np.empty(shape=(0,), 438 | dtype=np.int32) 439 | raw_targets = batch_msg_targets[prop_round][step][edge_typ] 440 | msg_targets.extend(np.concatenate(raw_targets, axis=0) if len(raw_targets) > 0 else np.empty(shape=(0,), 441 | dtype=np.int32)) 442 | 443 | batch_feed_dict[self.placeholders['msg_targets'][prop_round][step]] = \ 444 | np.array(msg_targets, dtype=np.int32) 445 | raw_recvs = batch_receiving_nodes[prop_round][step] 446 | batch_feed_dict[self.placeholders['receiving_nodes'][prop_round][step]] = \ 447 | np.concatenate(raw_recvs, axis=0) if len(raw_recvs) > 0 else np.empty(shape=(0,), 448 | dtype=np.int32) 449 | batch_feed_dict[self.placeholders['receiving_node_num'][prop_round]] = \ 450 | np.array(batch_receiving_node_num[prop_round]) 451 | 452 | #self.check_batch_invariants(batch_feed_dict) 453 | yield batch_feed_dict 454 | 455 | 456 | def check_batch_invariants(self, batch_feed_dict): 457 | for prop_round in range(self.params['propagation_rounds']): 458 | initialised_nodes = set() 459 | initialised_nodes.update(batch_feed_dict[self.placeholders['initial_nodes'][prop_round]]) 460 | for step in range(self.params['propagation_substeps']): 461 | sending_nodes = set() 462 | for edge_typ in range(self.num_edge_types): 463 | sending_nodes.update(batch_feed_dict[self.placeholders['sending_nodes'][prop_round][step][edge_typ]]) 464 | for v in sending_nodes: 465 | assert v in initialised_nodes 466 | 467 | recv_nodes = batch_feed_dict[self.placeholders['receiving_nodes'][prop_round][step]] 468 | for v in recv_nodes: 469 | assert v not in initialised_nodes 470 | initialised_nodes.update(recv_nodes) 471 | 472 | def main(): 473 | args = docopt(__doc__) 474 | try: 475 | model = 
AsyncGGNNChemModel(args) 476 | model.train() 477 | except: 478 | typ, value, tb = sys.exc_info() 479 | traceback.print_exc() 480 | pdb.post_mortem(tb) 481 | 482 | 483 | if __name__ == "__main__": 484 | main() 485 | -------------------------------------------------------------------------------- /chem_tensorflow_dense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | """ 3 | Usage: 4 | chem_tensorflow_dense.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format) 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format) 10 | --log_dir NAME log dir name 11 | --data_dir NAME data dir name 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 14 | --evaluate example evaluation mode using a restored model 15 | """ 16 | 17 | from typing import Sequence, Any 18 | from docopt import docopt 19 | from collections import defaultdict 20 | import numpy as np 21 | import tensorflow as tf 22 | import sys, traceback 23 | import pdb 24 | import json 25 | 26 | from chem_tensorflow import ChemModel 27 | from utils import glorot_init 28 | 29 | 30 | def graph_to_adj_mat(graph, max_n_vertices, num_edge_types, tie_fwd_bkwd=True): 31 | bwd_edge_offset = 0 if tie_fwd_bkwd else (num_edge_types // 2) 32 | amat = np.zeros((num_edge_types, max_n_vertices, max_n_vertices)) 33 | for src, e, dest in graph: 34 | amat[e-1, dest, src] = 1 35 | amat[e-1 + bwd_edge_offset, src, dest] = 1 36 | return amat 37 | 38 | 39 | ''' 40 | Comments provide the expected tensor shapes where helpful. 41 | 42 | Key to symbols in comments: 43 | --------------------------- 44 | [...]: a tensor 45 | ; ; : a list 46 | b: batch size 47 | e: number of edge types (4) 48 | v: number of vertices per graph in this batch 49 | h: GNN hidden size 50 | ''' 51 | 52 | class DenseGGNNChemModel(ChemModel): 53 | def __init__(self, args): 54 | super().__init__(args) 55 | 56 | @classmethod 57 | def default_params(cls): 58 | params = dict(super().default_params()) 59 | params.update({ 60 | 'batch_size': 256, 61 | 'graph_state_dropout_keep_prob': 1., 62 | 'task_sample_ratios': {}, 63 | 'use_edge_bias': True, 64 | 'edge_weight_dropout_keep_prob': 1 65 | }) 66 | return params 67 | 68 | def prepare_specific_graph_model(self) -> None: 69 | h_dim = self.params['hidden_size'] 70 | # inputs 71 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 72 | self.placeholders['edge_weight_dropout_keep_prob'] = tf.placeholder(tf.float32, None, name='edge_weight_dropout_keep_prob') 73 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, 74 | [None, None, self.params['hidden_size']], 75 | name='node_features') 76 | self.placeholders['node_mask'] = tf.placeholder(tf.float32, [None, None], name='node_mask') 77 | self.placeholders['num_vertices'] = tf.placeholder(tf.int32, ()) 78 | self.placeholders['adjacency_matrix'] = tf.placeholder(tf.float32, 79 | [None, self.num_edge_types, None, None]) # [b, e, v, v] 80 | self.__adjacency_matrix = tf.transpose(self.placeholders['adjacency_matrix'], [1, 0, 2, 3]) # [e, b, v, v] 81 | 82 | 83 | # weights 84 | self.weights['edge_weights'] = tf.Variable(glorot_init([self.num_edge_types, h_dim, h_dim])) 85 | if self.params['use_edge_bias']: 86 | self.weights['edge_biases'] = tf.Variable(np.zeros([self.num_edge_types, 1, 
h_dim]).astype(np.float32)) 87 | with tf.variable_scope("gru_scope"): 88 | cell = tf.contrib.rnn.GRUCell(h_dim) 89 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 90 | state_keep_prob=self.placeholders['graph_state_keep_prob']) 91 | self.weights['node_gru'] = cell 92 | 93 | def compute_final_node_representations(self) -> tf.Tensor: 94 | v = self.placeholders['num_vertices'] 95 | h_dim = self.params['hidden_size'] 96 | h = self.placeholders['initial_node_representation'] # [b, v, h] 97 | h = tf.reshape(h, [-1, h_dim]) 98 | 99 | with tf.variable_scope("gru_scope") as scope: 100 | for i in range(self.params['num_timesteps']): 101 | if i > 0: 102 | tf.get_variable_scope().reuse_variables() 103 | for edge_type in range(self.num_edge_types): 104 | m = tf.matmul(h, tf.nn.dropout(self.weights['edge_weights'][edge_type], 105 | keep_prob=self.placeholders['edge_weight_dropout_keep_prob'])) # [b*v, h] 106 | m = tf.reshape(m, [-1, v, h_dim]) # [b, v, h] 107 | if self.params['use_edge_bias']: 108 | m += self.weights['edge_biases'][edge_type] # [b, v, h] 109 | if edge_type == 0: 110 | acts = tf.matmul(self.__adjacency_matrix[edge_type], m) 111 | else: 112 | acts += tf.matmul(self.__adjacency_matrix[edge_type], m) 113 | acts = tf.reshape(acts, [-1, h_dim]) # [b*v, h] 114 | 115 | h = self.weights['node_gru'](acts, h)[1] # [b*v, h] 116 | last_h = tf.reshape(h, [-1, v, h_dim]) 117 | return last_h 118 | 119 | def gated_regression(self, last_h, regression_gate, regression_transform): 120 | # last_h: [b x v x h] 121 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis = 2) # [b, v, 2h] 122 | gate_input = tf.reshape(gate_input, [-1, 2 * self.params["hidden_size"]]) # [b*v, 2h] 123 | last_h = tf.reshape(last_h, [-1, self.params["hidden_size"]]) # [b*v, h] 124 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [b*v, 1] 125 | gated_outputs = tf.reshape(gated_outputs, [-1, self.placeholders['num_vertices']]) # [b, v] 126 | masked_gated_outputs = gated_outputs * self.placeholders['node_mask'] # [b x v] 127 | output = tf.reduce_sum(masked_gated_outputs, axis = 1) # [b] 128 | self.output = output 129 | return output 130 | 131 | # ----- Data preprocessing and chunking into minibatches: 132 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool, bucket_sizes=None) -> Any: 133 | if bucket_sizes is None: 134 | bucket_sizes = np.array(list(range(4, 28, 2)) + [29]) 135 | bucketed = defaultdict(list) 136 | x_dim = len(raw_data[0]["node_features"][0]) 137 | for d in raw_data: 138 | chosen_bucket_idx = np.argmax(bucket_sizes > max([v for e in d['graph'] 139 | for v in [e[0], e[2]]])) 140 | chosen_bucket_size = bucket_sizes[chosen_bucket_idx] 141 | n_active_nodes = len(d["node_features"]) 142 | bucketed[chosen_bucket_idx].append({ 143 | 'adj_mat': graph_to_adj_mat(d['graph'], chosen_bucket_size, self.num_edge_types, self.params['tie_fwd_bkwd']), 144 | 'init': d["node_features"] + [[0 for _ in range(x_dim)] for __ in 145 | range(chosen_bucket_size - n_active_nodes)], 146 | 'labels': [d["targets"][task_id][0] for task_id in self.params['task_ids']], 147 | 'mask': [1. for _ in range(n_active_nodes) ] + [0. 
for _ in range(chosen_bucket_size - n_active_nodes)] 148 | }) 149 | 150 | if is_training_data: 151 | for (bucket_idx, bucket) in bucketed.items(): 152 | np.random.shuffle(bucket) 153 | for task_id in self.params['task_ids']: 154 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 155 | if task_sample_ratio is not None: 156 | ex_to_sample = int(len(bucket) * task_sample_ratio) 157 | for ex_id in range(ex_to_sample, len(bucket)): 158 | bucket[ex_id]['labels'][task_id] = None 159 | 160 | bucket_at_step = [[bucket_idx for _ in range(len(bucket_data) // self.params['batch_size'])] 161 | for bucket_idx, bucket_data in bucketed.items()] 162 | bucket_at_step = [x for y in bucket_at_step for x in y] 163 | 164 | return (bucketed, bucket_sizes, bucket_at_step) 165 | 166 | def pad_annotations(self, annotations): 167 | return np.pad(annotations, 168 | pad_width=[[0, 0], [0, 0], [0, self.params['hidden_size'] - self.annotation_size]], 169 | mode='constant') 170 | 171 | 172 | def make_batch(self, elements): 173 | batch_data = {'adj_mat': [], 'init': [], 'labels': [], 'node_mask': [], 'task_masks': []} 174 | for d in elements: 175 | batch_data['adj_mat'].append(d['adj_mat']) 176 | batch_data['init'].append(d['init']) 177 | batch_data['node_mask'].append(d['mask']) 178 | 179 | target_task_values = [] 180 | target_task_mask = [] 181 | for target_val in d['labels']: 182 | if target_val is None: # This is one of the examples we didn't sample... 183 | target_task_values.append(0.) 184 | target_task_mask.append(0.) 185 | else: 186 | target_task_values.append(target_val) 187 | target_task_mask.append(1.) 188 | batch_data['labels'].append(target_task_values) 189 | batch_data['task_masks'].append(target_task_mask) 190 | 191 | return batch_data 192 | 193 | 194 | def make_minibatch_iterator(self, data, is_training: bool): 195 | (bucketed, bucket_sizes, bucket_at_step) = data 196 | if is_training: 197 | np.random.shuffle(bucket_at_step) 198 | for _, bucketed_data in bucketed.items(): 199 | np.random.shuffle(bucketed_data) 200 | 201 | bucket_counters = defaultdict(int) 202 | dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 
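        # bucket_at_step names, for each minibatch, the bucket it is drawn from; bucket_counters
        # tracks how many batches have already been taken from each bucket, so each step below
        # serves the next non-overlapping slice of 'batch_size' graphs from that bucket.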
203 | for step in range(len(bucket_at_step)): 204 | bucket = bucket_at_step[step] 205 | start_idx = bucket_counters[bucket] * self.params['batch_size'] 206 | end_idx = (bucket_counters[bucket] + 1) * self.params['batch_size'] 207 | elements = bucketed[bucket][start_idx:end_idx] 208 | batch_data = self.make_batch(elements) 209 | 210 | num_graphs = len(batch_data['init']) 211 | initial_representations = batch_data['init'] 212 | initial_representations = self.pad_annotations(initial_representations) 213 | 214 | batch_feed_dict = { 215 | self.placeholders['initial_node_representation']: initial_representations, 216 | self.placeholders['target_values']: np.transpose(batch_data['labels'], axes=[1,0]), 217 | self.placeholders['target_mask']: np.transpose(batch_data['task_masks'], axes=[1, 0]), 218 | self.placeholders['num_graphs']: num_graphs, 219 | self.placeholders['num_vertices']: bucket_sizes[bucket], 220 | self.placeholders['adjacency_matrix']: batch_data['adj_mat'], 221 | self.placeholders['node_mask']: batch_data['node_mask'], 222 | self.placeholders['graph_state_keep_prob']: dropout_keep_prob, 223 | self.placeholders['edge_weight_dropout_keep_prob']: dropout_keep_prob 224 | } 225 | 226 | bucket_counters[bucket] += 1 227 | 228 | yield batch_feed_dict 229 | 230 | def evaluate_one_batch(self, initial_node_representations, adjacency_matrices, node_masks=None): 231 | num_vertices = len(initial_node_representations[0]) 232 | if node_masks is None: 233 | node_masks = [] 234 | for r in initial_node_representations: 235 | node_masks.append([1. for _ in r] + [0. for _ in range(num_vertices - len(r))]) 236 | batch_feed_dict = { 237 | self.placeholders['initial_node_representation']: self.pad_annotations(initial_node_representations), 238 | self.placeholders['num_graphs']: len(initial_node_representations), 239 | self.placeholders['num_vertices']: len(initial_node_representations[0]), 240 | self.placeholders['adjacency_matrix']: adjacency_matrices, 241 | self.placeholders['node_mask']: node_masks, 242 | self.placeholders['graph_state_keep_prob']: 1.0, 243 | self.placeholders['out_layer_dropout_keep_prob']: 1.0, 244 | self.placeholders['edge_weight_dropout_keep_prob']: 1.0 245 | } 246 | 247 | fetch_list = self.output 248 | result = self.sess.run(fetch_list, feed_dict=batch_feed_dict) 249 | return result 250 | 251 | def example_evaluation(self): 252 | ''' Demonstration of what test-time code would look like 253 | we query the model with the first n_example_molecules from the validation file 254 | ''' 255 | n_example_molecules = 10 256 | with open('molecules_valid.json', 'r') as valid_file: 257 | example_molecules = json.load(valid_file)[:n_example_molecules] 258 | 259 | for mol in example_molecules: 260 | print(mol['targets']) 261 | 262 | example_molecules, _, _ = self.process_raw_graphs(example_molecules, 263 | is_training_data=False, bucket_sizes=np.array([29])) 264 | batch_data = self.make_batch(example_molecules[0]) 265 | print(self.evaluate_one_batch(batch_data['init'], batch_data['adj_mat'])) 266 | 267 | 268 | 269 | 270 | 271 | def main(): 272 | args = docopt(__doc__) 273 | try: 274 | model = DenseGGNNChemModel(args) 275 | 276 | if args['--evaluate']: 277 | model.example_evaluation() 278 | else: 279 | model.train() 280 | except: 281 | typ, value, tb = sys.exc_info() 282 | traceback.print_exc() 283 | pdb.post_mortem(tb) 284 | 285 | 286 | if __name__ == "__main__": 287 | main() 288 | -------------------------------------------------------------------------------- /chem_tensorflow_gcn.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | ''' 3 | Usage: 4 | chem_tensorflow_gcn.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format) 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format) 10 | --log_dir NAME log dir name 11 | --data_dir NAME data dir name 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 14 | ''' 15 | from typing import Tuple, Sequence, Any 16 | 17 | from docopt import docopt 18 | import numpy as np 19 | import tensorflow as tf 20 | 21 | import sys, traceback 22 | import pdb 23 | 24 | from chem_tensorflow import ChemModel 25 | from utils import glorot_init 26 | 27 | 28 | class SparseGCNChemModel(ChemModel): 29 | def __init__(self, args): 30 | super().__init__(args) 31 | 32 | @classmethod 33 | def default_params(cls): 34 | params = dict(super().default_params()) 35 | params.update({'batch_size': 100000, 36 | 'task_sample_ratios': {}, 37 | 'gcn_use_bias': False, 38 | 'graph_state_dropout_keep_prob': 1.0, 39 | }) 40 | return params 41 | 42 | def prepare_specific_graph_model(self) -> None: 43 | h_dim = self.params['hidden_size'] 44 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, [None, h_dim], 45 | name='node_features') 46 | self.placeholders['adjacency_list'] = tf.placeholder(tf.int64, [None, 2], name='adjacency_list') 47 | self.placeholders['adjacency_weights'] = tf.placeholder(tf.float32, [None], name='adjacency_weights') 48 | self.placeholders['graph_nodes_list'] = tf.placeholder(tf.int32, [None], name='graph_nodes_list') 49 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 50 | 51 | with tf.variable_scope('gcn_scope'): 52 | self.weights['edge_weights'] = [tf.Variable(glorot_init((h_dim, h_dim)), name="gcn_weights_%i" % i) 53 | for i in range(self.params['num_timesteps'])] 54 | 55 | if self.params['gcn_use_bias']: 56 | self.weights['edge_biases'] = [tf.Variable(np.zeros([h_dim], dtype=np.float32), name="gcn_bias_%i" % i) 57 | for i in range(self.params['num_timesteps'])] 58 | 59 | def compute_final_node_representations(self): 60 | with tf.variable_scope('gcn_scope'): 61 | cur_node_states = self.placeholders['initial_node_representation'] # number of nodes in batch v x D 62 | num_nodes = tf.shape(self.placeholders['initial_node_representation'], out_type=tf.int64)[0] 63 | 64 | adjacency_matrix = tf.SparseTensor(indices=self.placeholders['adjacency_list'], 65 | values=self.placeholders['adjacency_weights'], 66 | dense_shape=[num_nodes, num_nodes]) 67 | 68 | for layer_idx in range(self.params['num_timesteps']): 69 | scaled_cur_node_states = tf.sparse_tensor_dense_matmul(adjacency_matrix, cur_node_states) # v x D 70 | new_node_states = tf.matmul(scaled_cur_node_states, self.weights['edge_weights'][layer_idx]) 71 | 72 | if self.params['gcn_use_bias']: 73 | new_node_states += self.weights['edge_biases'][layer_idx] # v x D 74 | 75 | # On all but final layer do ReLU and dropout: 76 | if layer_idx < self.params['num_timesteps'] - 1: 77 | new_node_states = tf.nn.relu(new_node_states) 78 | new_node_states = tf.nn.dropout(new_node_states, keep_prob=self.placeholders['graph_state_keep_prob']) 79 | 80 | cur_node_states = new_node_states 81 | 82 | return cur_node_states 83 | 84 | def gated_regression(self, last_h, regression_gate, regression_transform): 
85 | # last_h: [v x h] 86 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis=-1) # [v x 2h] 87 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [v x 1] 88 | 89 | # Sum up all nodes per-graph 90 | graph_representations = tf.unsorted_segment_sum(data=gated_outputs, 91 | segment_ids=self.placeholders['graph_nodes_list'], 92 | num_segments=self.placeholders['num_graphs']) # [g x 1] 93 | return tf.squeeze(graph_representations) # [g] 94 | 95 | # ----- Data preprocessing and chunking into minibatches: 96 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 97 | processed_graphs = [] 98 | for d in raw_data: 99 | (adjacency_list, adjacency_weights) = self.__graph_to_adjacency_list(d['graph'], len(d["node_features"])) 100 | processed_graphs.append({"adjacency_list": adjacency_list, 101 | "adjacency_weights": adjacency_weights, 102 | "init": d["node_features"], 103 | "labels": [d["targets"][task_id][0] for task_id in self.params['task_ids']]}) 104 | 105 | if is_training_data: 106 | np.random.shuffle(processed_graphs) 107 | for task_id in self.params['task_ids']: 108 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 109 | if task_sample_ratio is not None: 110 | ex_to_sample = int(len(processed_graphs) * task_sample_ratio) 111 | for ex_id in range(ex_to_sample, len(processed_graphs)): 112 | processed_graphs[ex_id]['labels'][task_id] = None 113 | 114 | return processed_graphs 115 | 116 | def __graph_to_adjacency_list(self, graph, num_nodes: int) -> Tuple[np.ndarray, np.ndarray]: 117 | # Step 1: Generate adjacency matrices: 118 | adj_matrix = np.zeros((num_nodes, num_nodes)) 119 | for src, _, dest in graph: 120 | adj_matrix[src, dest] = 1 121 | adj_matrix[dest, src] = 1 122 | 123 | # Step 2: Introduce self loops: 124 | self_loops = np.eye(num_nodes) 125 | adj_matrix += self_loops 126 | 127 | # Step 3: Normalize adj_matrices so that scale of vectors doesn't explode: 128 | row_sum = np.sum(adj_matrix, axis=-1) 129 | D_inv_sqrt = np.diag(np.power(row_sum, -0.5).flatten() + 1e-7) 130 | adj_matrix = D_inv_sqrt.dot(adj_matrix).dot(D_inv_sqrt) 131 | 132 | # Step 4: Turn into sorted adjacency lists: 133 | final_adj_list = [] 134 | final_adj_weights = [] 135 | for i in range(num_nodes): 136 | for j in range(num_nodes): 137 | w = adj_matrix[i, j] 138 | if w != 0: 139 | final_adj_list.append([i,j]) 140 | final_adj_weights.append(w) 141 | 142 | return np.array(final_adj_list), np.array(final_adj_weights) 143 | 144 | def make_minibatch_iterator(self, data: Any, is_training: bool): 145 | """Create minibatches by flattening adjacency matrices into a single adjacency matrix with 146 | multiple disconnected components.""" 147 | if is_training: 148 | np.random.shuffle(data) 149 | dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 
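        # Each minibatch is assembled as one large disconnected graph: the node indices in a
        # graph's adjacency list are shifted by node_offset before being appended, and
        # graph_nodes_list maps every node to its graph index so that gated_regression can
        # later sum node outputs per graph via unsorted_segment_sum.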
150 | # Pack until we cannot fit more graphs in the batch 151 | num_graphs = 0 152 | while num_graphs < len(data): 153 | num_graphs_in_batch = 0 154 | batch_node_features = [] 155 | batch_target_task_values = [] 156 | batch_target_task_mask = [] 157 | batch_adjacency_list = [] 158 | batch_adjacency_weights = [] 159 | batch_graph_nodes_list = [] 160 | node_offset = 0 161 | 162 | while num_graphs < len(data) and node_offset + len(data[num_graphs]['init']) < self.params['batch_size']: 163 | cur_graph = data[num_graphs] 164 | num_nodes_in_graph = len(cur_graph['init']) 165 | padded_features = np.pad(cur_graph['init'], 166 | ((0, 0), (0, self.params['hidden_size'] - self.annotation_size)), 167 | mode='constant') 168 | batch_node_features.extend(padded_features) 169 | batch_graph_nodes_list.append(np.full(shape=[num_nodes_in_graph], fill_value=num_graphs_in_batch, dtype=np.int32)) 170 | batch_adjacency_list.append(cur_graph['adjacency_list'] + node_offset) 171 | batch_adjacency_weights.append(cur_graph['adjacency_weights']) 172 | 173 | target_task_values = [] 174 | target_task_mask = [] 175 | for target_val in cur_graph['labels']: 176 | if target_val is None: # This is one of the examples we didn't sample... 177 | target_task_values.append(0.) 178 | target_task_mask.append(0.) 179 | else: 180 | target_task_values.append(target_val) 181 | target_task_mask.append(1.) 182 | batch_target_task_values.append(target_task_values) 183 | batch_target_task_mask.append(target_task_mask) 184 | num_graphs += 1 185 | num_graphs_in_batch += 1 186 | node_offset += num_nodes_in_graph 187 | 188 | batch_feed_dict = { 189 | self.placeholders['initial_node_representation']: np.array(batch_node_features), 190 | self.placeholders['adjacency_list']: np.concatenate(batch_adjacency_list, axis=0), 191 | self.placeholders['adjacency_weights']: np.concatenate(batch_adjacency_weights, axis=0), 192 | self.placeholders['graph_nodes_list']: np.concatenate(batch_graph_nodes_list, axis=0), 193 | self.placeholders['target_values']: np.transpose(batch_target_task_values, axes=[1,0]), 194 | self.placeholders['target_mask']: np.transpose(batch_target_task_mask, axes=[1, 0]), 195 | self.placeholders['num_graphs']: num_graphs_in_batch, 196 | self.placeholders['graph_state_keep_prob']: dropout_keep_prob, 197 | } 198 | 199 | yield batch_feed_dict 200 | 201 | 202 | def main(): 203 | args = docopt(__doc__) 204 | try: 205 | model = SparseGCNChemModel(args) 206 | model.train() 207 | except: 208 | typ, value, tb = sys.exc_info() 209 | traceback.print_exc() 210 | pdb.post_mortem(tb) 211 | 212 | if __name__ == "__main__": 213 | main() 214 | -------------------------------------------------------------------------------- /chem_tensorflow_sparse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | """ 3 | Usage: 4 | chem_tensorflow_sparse.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format). 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format). 10 | --log_dir DIR Log dir name. 11 | --data_dir DIR Data dir name. 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 
14 | --evaluate example evaluation mode using a restored model 15 | """ 16 | from typing import List, Tuple, Dict, Sequence, Any 17 | 18 | from docopt import docopt 19 | from collections import defaultdict, namedtuple 20 | import numpy as np 21 | import tensorflow as tf 22 | import sys, traceback 23 | import pdb 24 | import json 25 | 26 | from chem_tensorflow import ChemModel 27 | from utils import glorot_init, SMALL_NUMBER 28 | 29 | 30 | GGNNWeights = namedtuple('GGNNWeights', ['edge_weights', 31 | 'edge_biases', 32 | 'edge_type_attention_weights', 33 | 'rnn_cells',]) 34 | 35 | 36 | class SparseGGNNChemModel(ChemModel): 37 | def __init__(self, args): 38 | super().__init__(args) 39 | 40 | @classmethod 41 | def default_params(cls): 42 | params = dict(super().default_params()) 43 | params.update({ 44 | 'batch_size': 100000, 45 | 'use_edge_bias': False, 46 | 'use_propagation_attention': False, 47 | 'use_edge_msg_avg_aggregation': True, 48 | 'residual_connections': { # For layer i, specify list of layers whose output is added as an input 49 | "2": [0], 50 | "4": [0, 2] 51 | }, 52 | 53 | 'layer_timesteps': [2, 2, 1, 2, 1], # number of layers & propagation steps per layer 54 | 55 | 'graph_rnn_cell': 'GRU', # GRU, CudnnCompatibleGRUCell, or RNN 56 | 'graph_rnn_activation': 'tanh', # tanh, ReLU 57 | 'graph_state_dropout_keep_prob': 1., 58 | 'task_sample_ratios': {}, 59 | 'edge_weight_dropout_keep_prob': .8 60 | }) 61 | return params 62 | 63 | def prepare_specific_graph_model(self) -> None: 64 | h_dim = self.params['hidden_size'] 65 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, [None, h_dim], 66 | name='node_features') 67 | self.placeholders['adjacency_lists'] = [tf.placeholder(tf.int32, [None, 2], name='adjacency_e%s' % e) 68 | for e in range(self.num_edge_types)] 69 | self.placeholders['num_incoming_edges_per_type'] = tf.placeholder(tf.float32, [None, self.num_edge_types], 70 | name='num_incoming_edges_per_type') 71 | self.placeholders['graph_nodes_list'] = tf.placeholder(tf.int32, [None], name='graph_nodes_list') 72 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 73 | self.placeholders['edge_weight_dropout_keep_prob'] = tf.placeholder(tf.float32, None, name='edge_weight_dropout_keep_prob') 74 | 75 | activation_name = self.params['graph_rnn_activation'].lower() 76 | if activation_name == 'tanh': 77 | activation_fun = tf.nn.tanh 78 | elif activation_name == 'relu': 79 | activation_fun = tf.nn.relu 80 | else: 81 | raise Exception("Unknown activation function type '%s'." 
% activation_name) 82 | 83 | # Generate per-layer values for edge weights, biases and gated units: 84 | self.weights = {} # Used by super-class to place generic things 85 | self.gnn_weights = GGNNWeights([], [], [], []) 86 | for layer_idx in range(len(self.params['layer_timesteps'])): 87 | with tf.variable_scope('gnn_layer_%i' % layer_idx): 88 | edge_weights = tf.Variable(glorot_init([self.num_edge_types * h_dim, h_dim]), 89 | name='gnn_edge_weights_%i' % layer_idx) 90 | edge_weights = tf.reshape(edge_weights, [self.num_edge_types, h_dim, h_dim]) 91 | edge_weights = tf.nn.dropout(edge_weights, keep_prob=self.placeholders['edge_weight_dropout_keep_prob']) 92 | self.gnn_weights.edge_weights.append(edge_weights) 93 | 94 | if self.params['use_propagation_attention']: 95 | self.gnn_weights.edge_type_attention_weights.append(tf.Variable(np.ones([self.num_edge_types], dtype=np.float32), 96 | name='edge_type_attention_weights_%i' % layer_idx)) 97 | 98 | if self.params['use_edge_bias']: 99 | self.gnn_weights.edge_biases.append(tf.Variable(np.zeros([self.num_edge_types, h_dim], dtype=np.float32), 100 | name='gnn_edge_biases_%i' % layer_idx)) 101 | 102 | cell_type = self.params['graph_rnn_cell'].lower() 103 | if cell_type == 'gru': 104 | cell = tf.nn.rnn_cell.GRUCell(h_dim, activation=activation_fun) 105 | elif cell_type == 'cudnncompatiblegrucell': 106 | assert(activation_name == 'tanh') 107 | import tensorflow.contrib.cudnn_rnn as cudnn_rnn 108 | cell = cudnn_rnn.CudnnCompatibleGRUCell(h_dim) 109 | elif cell_type == 'rnn': 110 | cell = tf.nn.rnn_cell.BasicRNNCell(h_dim, activation=activation_fun) 111 | else: 112 | raise Exception("Unknown RNN cell type '%s'." % cell_type) 113 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 114 | state_keep_prob=self.placeholders['graph_state_keep_prob']) 115 | self.gnn_weights.rnn_cells.append(cell) 116 | 117 | def compute_final_node_representations(self) -> tf.Tensor: 118 | node_states_per_layer = [] # one entry per layer (final state of that layer), shape: number of nodes in batch v x D 119 | node_states_per_layer.append(self.placeholders['initial_node_representation']) 120 | num_nodes = tf.shape(self.placeholders['initial_node_representation'], out_type=tf.int32)[0] 121 | 122 | message_targets = [] # list of tensors of message targets of shape [E] 123 | message_edge_types = [] # list of tensors of edge type of shape [E] 124 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(self.placeholders['adjacency_lists']): 125 | edge_targets = adjacency_list_for_edge_type[:, 1] 126 | message_targets.append(edge_targets) 127 | message_edge_types.append(tf.ones_like(edge_targets, dtype=tf.int32) * edge_type_idx) 128 | message_targets = tf.concat(message_targets, axis=0) # Shape [M] 129 | message_edge_types = tf.concat(message_edge_types, axis=0) # Shape [M] 130 | 131 | for (layer_idx, num_timesteps) in enumerate(self.params['layer_timesteps']): 132 | with tf.variable_scope('gnn_layer_%i' % layer_idx): 133 | # Used shape abbreviations: 134 | # V ~ number of nodes 135 | # D ~ state dimension 136 | # E ~ number of edges of current type 137 | # M ~ number of messages (sum of all E) 138 | 139 | # Extract residual messages, if any: 140 | layer_residual_connections = self.params['residual_connections'].get(str(layer_idx)) 141 | if layer_residual_connections is None: 142 | layer_residual_states = [] 143 | else: 144 | layer_residual_states = [node_states_per_layer[residual_layer_idx] 145 | for residual_layer_idx in layer_residual_connections] 146 | 147 | if 
self.params['use_propagation_attention']: 148 | message_edge_type_factors = tf.nn.embedding_lookup(params=self.gnn_weights.edge_type_attention_weights[layer_idx], 149 | ids=message_edge_types) # Shape [M] 150 | 151 | # Record new states for this layer. Initialised to last state, but will be updated below: 152 | node_states_per_layer.append(node_states_per_layer[-1]) 153 | for step in range(num_timesteps): 154 | with tf.variable_scope('timestep_%i' % step): 155 | messages = [] # list of tensors of messages of shape [E, D] 156 | message_source_states = [] # list of tensors of edge source states of shape [E, D] 157 | 158 | # Collect incoming messages per edge type 159 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(self.placeholders['adjacency_lists']): 160 | edge_sources = adjacency_list_for_edge_type[:, 0] 161 | edge_source_states = tf.nn.embedding_lookup(params=node_states_per_layer[-1], 162 | ids=edge_sources) # Shape [E, D] 163 | all_messages_for_edge_type = tf.matmul(edge_source_states, 164 | self.gnn_weights.edge_weights[layer_idx][edge_type_idx]) # Shape [E, D] 165 | messages.append(all_messages_for_edge_type) 166 | message_source_states.append(edge_source_states) 167 | 168 | messages = tf.concat(messages, axis=0) # Shape [M, D] 169 | 170 | if self.params['use_propagation_attention']: 171 | message_source_states = tf.concat(message_source_states, axis=0) # Shape [M, D] 172 | message_target_states = tf.nn.embedding_lookup(params=node_states_per_layer[-1], 173 | ids=message_targets) # Shape [M, D] 174 | message_attention_scores = tf.einsum('mi,mi->m', message_source_states, message_target_states) # Shape [M] 175 | message_attention_scores = message_attention_scores * message_edge_type_factors 176 | 177 | # The following is softmax-ing over the incoming messages per node. 178 | # As the number of incoming varies, we can't just use tf.softmax. 
Reimplement with logsumexp trick: 179 | # Step (1): Obtain shift constant as max of messages going into a node 180 | message_attention_score_max_per_target = tf.unsorted_segment_max(data=message_attention_scores, 181 | segment_ids=message_targets, 182 | num_segments=num_nodes) # Shape [V] 183 | # Step (2): Distribute max out to the corresponding messages again, and shift scores: 184 | message_attention_score_max_per_message = tf.gather(params=message_attention_score_max_per_target, 185 | indices=message_targets) # Shape [M] 186 | message_attention_scores -= message_attention_score_max_per_message 187 | # Step (3): Exp, sum up per target, compute exp(score) / exp(sum) as attention prob: 188 | message_attention_scores_exped = tf.exp(message_attention_scores) # Shape [M] 189 | message_attention_score_sum_per_target = tf.unsorted_segment_sum(data=message_attention_scores_exped, 190 | segment_ids=message_targets, 191 | num_segments=num_nodes) # Shape [V] 192 | message_attention_normalisation_sum_per_message = tf.gather(params=message_attention_score_sum_per_target, 193 | indices=message_targets) # Shape [M] 194 | message_attention = message_attention_scores_exped / (message_attention_normalisation_sum_per_message + SMALL_NUMBER) # Shape [M] 195 | # Step (4): Weigh messages using the attention prob: 196 | messages = messages * tf.expand_dims(message_attention, -1) 197 | 198 | incoming_messages = tf.unsorted_segment_sum(data=messages, 199 | segment_ids=message_targets, 200 | num_segments=num_nodes) # Shape [V, D] 201 | 202 | if self.params['use_edge_bias']: 203 | incoming_messages += tf.matmul(self.placeholders['num_incoming_edges_per_type'], 204 | self.gnn_weights.edge_biases[layer_idx]) # Shape [V, D] 205 | 206 | if self.params['use_edge_msg_avg_aggregation']: 207 | num_incoming_edges = tf.reduce_sum(self.placeholders['num_incoming_edges_per_type'], 208 | keep_dims=True, axis=-1) # Shape [V, 1] 209 | incoming_messages /= num_incoming_edges + SMALL_NUMBER 210 | 211 | incoming_information = tf.concat(layer_residual_states + [incoming_messages], 212 | axis=-1) # Shape [V, D*(1 + num of residual connections)] 213 | 214 | # pass updated vertex features into RNN cell 215 | node_states_per_layer[-1] = self.gnn_weights.rnn_cells[layer_idx](incoming_information, 216 | node_states_per_layer[-1])[1] # Shape [V, D] 217 | 218 | return node_states_per_layer[-1] 219 | 220 | def gated_regression(self, last_h, regression_gate, regression_transform): 221 | # last_h: [v x h] 222 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis=-1) # [v x 2h] 223 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [v x 1] 224 | 225 | # Sum up all nodes per-graph 226 | graph_representations = tf.unsorted_segment_sum(data=gated_outputs, 227 | segment_ids=self.placeholders['graph_nodes_list'], 228 | num_segments=self.placeholders['num_graphs']) # [g x 1] 229 | output = tf.squeeze(graph_representations) # [g] 230 | self.output = output 231 | return output 232 | 233 | # ----- Data preprocessing and chunking into minibatches: 234 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 235 | processed_graphs = [] 236 | for d in raw_data: 237 | (adjacency_lists, num_incoming_edge_per_type) = self.__graph_to_adjacency_lists(d['graph']) 238 | processed_graphs.append({"adjacency_lists": adjacency_lists, 239 | "num_incoming_edge_per_type": num_incoming_edge_per_type, 240 | "init": d["node_features"], 241 | "labels": 
[d["targets"][task_id][0] for task_id in self.params['task_ids']]}) 242 | 243 | if is_training_data: 244 | np.random.shuffle(processed_graphs) 245 | for task_id in self.params['task_ids']: 246 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 247 | if task_sample_ratio is not None: 248 | ex_to_sample = int(len(processed_graphs) * task_sample_ratio) 249 | for ex_id in range(ex_to_sample, len(processed_graphs)): 250 | processed_graphs[ex_id]['labels'][task_id] = None 251 | 252 | return processed_graphs 253 | 254 | def __graph_to_adjacency_lists(self, graph) -> Tuple[Dict[int, np.ndarray], Dict[int, Dict[int, int]]]: 255 | adj_lists = defaultdict(list) 256 | num_incoming_edges_dicts_per_type = defaultdict(lambda: defaultdict(lambda: 0)) 257 | for src, e, dest in graph: 258 | fwd_edge_type = e - 1 # Make edges start from 0 259 | adj_lists[fwd_edge_type].append((src, dest)) 260 | num_incoming_edges_dicts_per_type[fwd_edge_type][dest] += 1 261 | if self.params['tie_fwd_bkwd']: 262 | adj_lists[fwd_edge_type].append((dest, src)) 263 | num_incoming_edges_dicts_per_type[fwd_edge_type][src] += 1 264 | 265 | final_adj_lists = {e: np.array(sorted(lm), dtype=np.int32) 266 | for e, lm in adj_lists.items()} 267 | 268 | # Add backward edges as an additional edge type that goes backwards: 269 | if not (self.params['tie_fwd_bkwd']): 270 | for (edge_type, edges) in adj_lists.items(): 271 | bwd_edge_type = self.num_edge_types + edge_type 272 | final_adj_lists[bwd_edge_type] = np.array(sorted((y, x) for (x, y) in edges), dtype=np.int32) 273 | for (x, y) in edges: 274 | num_incoming_edges_dicts_per_type[bwd_edge_type][y] += 1 275 | 276 | return final_adj_lists, num_incoming_edges_dicts_per_type 277 | 278 | def make_minibatch_iterator(self, data: Any, is_training: bool): 279 | """Create minibatches by flattening adjacency matrices into a single adjacency matrix with 280 | multiple disconnected components.""" 281 | if is_training: 282 | np.random.shuffle(data) 283 | # Pack until we cannot fit more graphs in the batch 284 | state_dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 285 | edge_weights_dropout_keep_prob = self.params['edge_weight_dropout_keep_prob'] if is_training else 1. 
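        # As in the GCN variant, graphs are packed into one large disconnected graph until the
        # next graph would push the node count past batch_size. Adjacency lists are kept
        # separately per edge type (shifted by node_offset), and num_incoming_edges_per_type is
        # assembled per node so that edge biases and average message aggregation can use it.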
286 | num_graphs = 0 287 | while num_graphs < len(data): 288 | num_graphs_in_batch = 0 289 | batch_node_features = [] 290 | batch_target_task_values = [] 291 | batch_target_task_mask = [] 292 | batch_adjacency_lists = [[] for _ in range(self.num_edge_types)] 293 | batch_num_incoming_edges_per_type = [] 294 | batch_graph_nodes_list = [] 295 | node_offset = 0 296 | 297 | while num_graphs < len(data) and node_offset + len(data[num_graphs]['init']) < self.params['batch_size']: 298 | cur_graph = data[num_graphs] 299 | num_nodes_in_graph = len(cur_graph['init']) 300 | padded_features = np.pad(cur_graph['init'], 301 | ((0, 0), (0, self.params['hidden_size'] - self.annotation_size)), 302 | 'constant') 303 | batch_node_features.extend(padded_features) 304 | batch_graph_nodes_list.append(np.full(shape=[num_nodes_in_graph], fill_value=num_graphs_in_batch, dtype=np.int32)) 305 | for i in range(self.num_edge_types): 306 | if i in cur_graph['adjacency_lists']: 307 | batch_adjacency_lists[i].append(cur_graph['adjacency_lists'][i] + node_offset) 308 | 309 | # Turn counters for incoming edges into np array: 310 | num_incoming_edges_per_type = np.zeros((num_nodes_in_graph, self.num_edge_types)) 311 | for (e_type, num_incoming_edges_per_type_dict) in cur_graph['num_incoming_edge_per_type'].items(): 312 | for (node_id, edge_count) in num_incoming_edges_per_type_dict.items(): 313 | num_incoming_edges_per_type[node_id, e_type] = edge_count 314 | batch_num_incoming_edges_per_type.append(num_incoming_edges_per_type) 315 | 316 | target_task_values = [] 317 | target_task_mask = [] 318 | for target_val in cur_graph['labels']: 319 | if target_val is None: # This is one of the examples we didn't sample... 320 | target_task_values.append(0.) 321 | target_task_mask.append(0.) 322 | else: 323 | target_task_values.append(target_val) 324 | target_task_mask.append(1.) 
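                    # A graph whose label for a task was removed by task_sample_ratios is fed
                    # with target value 0. and a 0. entry in target_mask for that task.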
325 | batch_target_task_values.append(target_task_values) 326 | batch_target_task_mask.append(target_task_mask) 327 | num_graphs += 1 328 | num_graphs_in_batch += 1 329 | node_offset += num_nodes_in_graph 330 | 331 | batch_feed_dict = { 332 | self.placeholders['initial_node_representation']: np.array(batch_node_features), 333 | self.placeholders['num_incoming_edges_per_type']: np.concatenate(batch_num_incoming_edges_per_type, axis=0), 334 | self.placeholders['graph_nodes_list']: np.concatenate(batch_graph_nodes_list), 335 | self.placeholders['target_values']: np.transpose(batch_target_task_values, axes=[1,0]), 336 | self.placeholders['target_mask']: np.transpose(batch_target_task_mask, axes=[1, 0]), 337 | self.placeholders['num_graphs']: num_graphs_in_batch, 338 | self.placeholders['graph_state_keep_prob']: state_dropout_keep_prob, 339 | self.placeholders['edge_weight_dropout_keep_prob']: edge_weights_dropout_keep_prob 340 | } 341 | 342 | # Merge adjacency lists and information about incoming nodes: 343 | for i in range(self.num_edge_types): 344 | if len(batch_adjacency_lists[i]) > 0: 345 | adj_list = np.concatenate(batch_adjacency_lists[i]) 346 | else: 347 | adj_list = np.zeros((0, 2), dtype=np.int32) 348 | batch_feed_dict[self.placeholders['adjacency_lists'][i]] = adj_list 349 | 350 | yield batch_feed_dict 351 | 352 | def evaluate_one_batch(self, data): 353 | fetch_list = self.output 354 | batch_feed_dict = self.make_minibatch_iterator(data, is_training=False) 355 | 356 | for item in batch_feed_dict: 357 | item[self.placeholders['graph_state_keep_prob']] = 1.0 358 | item[self.placeholders['edge_weight_dropout_keep_prob']] = 1.0 359 | item[self.placeholders['out_layer_dropout_keep_prob']] = 1.0 360 | item[self.placeholders['target_values']] = [[]] 361 | item[self.placeholders['target_mask']] = [[]] 362 | print(self.sess.run(fetch_list, feed_dict=item)) 363 | 364 | def example_evaluation(self): 365 | ''' Demonstration of what test-time code would look like 366 | we query the model with the first n_example_molecules from the validation file 367 | ''' 368 | n_example_molecules = 10 369 | with open('molecules_valid.json', 'r') as valid_file: 370 | example_molecules = json.load(valid_file)[:n_example_molecules] 371 | 372 | for mol in example_molecules: 373 | print(mol['targets']) 374 | 375 | example_molecules = self.process_raw_graphs(example_molecules, is_training_data=False) 376 | self.evaluate_one_batch(example_molecules) 377 | 378 | def main(): 379 | args = docopt(__doc__) 380 | try: 381 | model = SparseGGNNChemModel(args) 382 | if args['--evaluate']: 383 | model.example_evaluation() 384 | else: 385 | model.train() 386 | except: 387 | typ, value, tb = sys.exc_info() 388 | traceback.print_exc() 389 | pdb.post_mortem(tb) 390 | 391 | 392 | if __name__ == "__main__": 393 | main() 394 | -------------------------------------------------------------------------------- /get_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rdkit import Chem 3 | import glob 4 | import json 5 | import numpy as np 6 | 7 | if not os.path.exists('data'): 8 | os.mkdir('data') 9 | print('made directory ./data/') 10 | 11 | download_path = os.path.join('data', 'dsgdb9nsd.xyz.tar.bz2') 12 | if not os.path.exists(download_path): 13 | print('downloading data to %s ...' 
% download_path) 14 | source = 'https://ndownloader.figshare.com/files/3195389' 15 | os.system('wget -O %s %s' % (download_path, source)) 16 | print('finished downloading') 17 | 18 | unzip_path = os.path.join('data', 'qm9_raw') 19 | if not os.path.exists(unzip_path): 20 | print('extracting data to %s ...' % unzip_path) 21 | os.mkdir(unzip_path) 22 | os.system('tar xvjf %s -C %s' % (download_path, unzip_path)) 23 | print('finished extracting') 24 | 25 | def preprocess(): 26 | index_of_mu = 4 27 | 28 | def read_xyz(file_path): 29 | with open(file_path, 'r') as f: 30 | lines = f.readlines() 31 | smiles = lines[-2].split('\t')[0] 32 | properties = lines[1].split('\t') 33 | mu = float(properties[index_of_mu]) 34 | return {'smiles': smiles, 'mu': mu} 35 | 36 | print('loading train/validation split') 37 | with open('valid_idx.json', 'r') as f: 38 | valid_idx = json.load(f)['valid_idxs'] 39 | valid_files = [os.path.join(unzip_path, 'dsgdb9nsd_%s.xyz' % i) for i in valid_idx] 40 | 41 | print('reading data...') 42 | raw_data = {'train': [], 'valid': []} 43 | all_files = glob.glob(os.path.join(unzip_path, '*.xyz')) 44 | for file_idx, file_path in enumerate(all_files): 45 | if file_idx % 100 == 0: 46 | print('%.1f %% \r' % (file_idx / float(len(all_files)) * 100), end=""), 47 | if file_path not in valid_files: 48 | raw_data['train'].append(read_xyz(file_path)) 49 | else: 50 | raw_data['valid'].append(read_xyz(file_path)) 51 | all_mu = [mol['mu'] for mol in raw_data['train']] 52 | mean_mu = np.mean(all_mu) 53 | std_mu = np.std(all_mu) 54 | 55 | def normalize_mu(mu): 56 | return (mu - mean_mu) / std_mu 57 | 58 | def onehot(idx, len): 59 | z = [0 for _ in range(len)] 60 | z[idx] = 1 61 | return z 62 | 63 | bond_dict = {'SINGLE': 1, 'DOUBLE': 2, 'TRIPLE': 3, "AROMATIC": 4} 64 | def to_graph(smiles): 65 | mol = Chem.MolFromSmiles(smiles) 66 | mol = Chem.AddHs(mol) 67 | edges = [] 68 | nodes = [] 69 | for bond in mol.GetBonds(): 70 | edges.append((bond.GetBeginAtomIdx(), bond_dict[str(bond.GetBondType())], bond.GetEndAtomIdx())) 71 | for atom in mol.GetAtoms(): 72 | nodes.append(onehot(["H", "C", "N", "O", "F"].index(atom.GetSymbol()), 5)) 73 | return nodes, edges 74 | 75 | print('parsing smiles as graphs...') 76 | processed_data = {'train': [], 'valid': []} 77 | for section in ['train', 'valid']: 78 | for i,(smiles, mu) in enumerate([(mol['smiles'], mol['mu']) for mol in raw_data[section]]): 79 | if i % 100 == 0: 80 | print('%s: %.1f %% \r' % (section, 100*i/float(len(raw_data[section]))), end="") 81 | nodes, edges = to_graph(smiles) 82 | processed_data[section].append({ 83 | 'targets': [[normalize_mu(mu)]], 84 | 'graph': edges, 85 | 'node_features': nodes 86 | }) 87 | print('%s: 100 %% ' % (section)) 88 | with open('molecules_%s.json' % section, 'w') as f: 89 | json.dump(processed_data[section], f) 90 | 91 | preprocess() 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docopt==0.6.2 2 | tensorflow==1.3.0 3 | numpy==1.13.1 4 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import queue 6 | import threading 7 | 8 | SMALL_NUMBER = 1e-7 9 | 10 | 11 | def glorot_init(shape): 12 | initialization_range = np.sqrt(6.0 / (shape[-2] + shape[-1])) 13 | 
return np.random.uniform(low=-initialization_range, high=initialization_range, size=shape).astype(np.float32) 14 | 15 | 16 | class ThreadedIterator: 17 | """An iterator object that computes its elements in a parallel thread to be ready to be consumed. 18 | The iterator should *not* return None""" 19 | 20 | def __init__(self, original_iterator, max_queue_size: int=2): 21 | self.__queue = queue.Queue(maxsize=max_queue_size) 22 | self.__thread = threading.Thread(target=lambda: self.worker(original_iterator)) 23 | self.__thread.start() 24 | 25 | def worker(self, original_iterator): 26 | for element in original_iterator: 27 | assert element is not None, 'By convention, iterator elements must not be None' 28 | self.__queue.put(element, block=True) 29 | self.__queue.put(None, block=True) 30 | 31 | def __iter__(self): 32 | next_element = self.__queue.get(block=True) 33 | while next_element is not None: 34 | yield next_element 35 | next_element = self.__queue.get(block=True) 36 | self.__thread.join() 37 | 38 | 39 | class MLP(object): 40 | def __init__(self, in_size, out_size, hid_sizes, dropout_keep_prob): 41 | self.in_size = in_size 42 | self.out_size = out_size 43 | self.hid_sizes = hid_sizes 44 | self.dropout_keep_prob = dropout_keep_prob 45 | self.params = self.make_network_params() 46 | 47 | def make_network_params(self): 48 | dims = [self.in_size] + self.hid_sizes + [self.out_size] 49 | weight_sizes = list(zip(dims[:-1], dims[1:])) 50 | weights = [tf.Variable(self.init_weights(s), name='MLP_W_layer%i' % i) 51 | for (i, s) in enumerate(weight_sizes)] 52 | biases = [tf.Variable(np.zeros(s[-1]).astype(np.float32), name='MLP_b_layer%i' % i) 53 | for (i, s) in enumerate(weight_sizes)] 54 | 55 | network_params = { 56 | "weights": weights, 57 | "biases": biases, 58 | } 59 | 60 | return network_params 61 | 62 | def init_weights(self, shape): 63 | return np.sqrt(6.0 / (shape[-2] + shape[-1])) * (2 * np.random.rand(*shape).astype(np.float32) - 1) 64 | 65 | def __call__(self, inputs): 66 | acts = inputs 67 | for W, b in zip(self.params["weights"], self.params["biases"]): 68 | hid = tf.matmul(acts, tf.nn.dropout(W, self.dropout_keep_prob)) + b 69 | acts = tf.nn.relu(hid) 70 | last_hidden = hid 71 | return last_hidden --------------------------------------------------------------------------------
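The helpers in `utils.py` are shared by all model variants: `ThreadedIterator` prefetches iterator elements (e.g. minibatch feed dicts) in a background thread, and `MLP` is a small callable feed-forward network of the kind that `gated_regression` accepts for its `regression_gate` and `regression_transform` arguments. As a quick illustration, here is a minimal, hypothetical sketch (not part of the repository) of wrapping a generator with `ThreadedIterator`; the `toy_minibatches` generator merely stands in for `make_minibatch_iterator`:

```python
# Hypothetical usage sketch for utils.ThreadedIterator (not part of the repository).
from utils import ThreadedIterator

def toy_minibatches(num_batches=5):
    # Stand-in for make_minibatch_iterator: yields one feed-dict-like object per batch.
    for i in range(num_batches):
        yield {'batch_id': i}

# The background thread fills a bounded queue while the main thread consumes batches.
for batch in ThreadedIterator(toy_minibatches(), max_queue_size=2):
    print('consumed batch', batch['batch_id'])
```

Because the queue is bounded by `max_queue_size`, the producer thread blocks once it is two batches ahead, which keeps memory use flat while still overlapping batch construction with TensorFlow session runs.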