├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── chem_tensorflow.py ├── chem_tensorflow_async.py ├── chem_tensorflow_dense.py ├── chem_tensorflow_gcn.py ├── chem_tensorflow_sparse.py ├── get_data.py ├── requirements.txt ├── utils.py └── valid_idx.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | molecules_*.json 104 | data/ 105 | logs/ 106 | .idea/ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## CONTRIBUTING 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 6 | 7 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gated Graph Neural Networks 2 | 3 | > ## This repository is not maintained anymore. An updated version of the _sparse_ codebase in this repo, together with many more GNN implementations, is available on https://github.com/microsoft/tf-gnn-samples. 4 | 5 | This repository contains two implementations of the Gated Graph Neural Networks 6 | of [Li et al. 2015](https://arxiv.org/abs/1511.05493) for learning properties of chemical molecules. 7 | The inspiration for this application comes from [Gilmer et al. 2017](https://arxiv.org/abs/1704.01212). 8 | 9 | This code was tested in Python 3.5 with TensorFlow 1.3. To run the code `docopt` is also necessary. 10 | 11 | This code was maintained by the [Deep Program Understanding](https://www.microsoft.com/en-us/research/project/program/) project at Microsoft Research, Cambridge, UK. 12 | 13 | ## Data Extraction 14 | To download the related data run `get_data.py`. It requires the python package `rdkit` within the Python package 15 | environment. For example, this can be obtained by 16 | ``` 17 | conda install -c rdkit rdkit 18 | ``` 19 | 20 | ## Running Graph Neural Network Training 21 | We provide four versions of Graph Neural Networks: Gated Graph Neural Networks (one implementation using dense 22 | adjacency matrices and a sparse variant), Asynchronous Gated Graph Neural Networks, and Graph Convolutional 23 | Networks (sparse). 24 | The dense version is faster for small or dense graphs, including the molecules dataset (though the difference is 25 | small for it). In contrast, the sparse version is faster for large and sparse graphs, especially in cases where 26 | representing a dense representation of the adjacency matrix would result in prohibitively large memory usage. 27 | Asynchronous GNNs do not propagate information from all nodes to all neighbouring nodes at each timestep; 28 | instead, they follow an update schedule such that messages are propagated in sequence. Their implementation 29 | is far more inefficient (due to the small number of updates at each step), but a single propagation round 30 | (i.e., performing each propagation step along a few edges once) can suffice to propagate messages across a 31 | large graph. 
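As a rough sketch, one GGNN propagation step computes the following (an illustrative simplification, not the code in this repository; the actual models use a GRU cell rather than the plain `tanh` update, and the asynchronous variant applies the update only along the currently scheduled edges):
```
import numpy as np

def ggnn_step(h, adj, edge_weights):
    """One dense propagation step: h is [v, d], adj is [e, v, v], edge_weights is [e, d, d]."""
    # Each edge type transforms the sender states with its own weight matrix,
    # and the corresponding adjacency matrix routes the messages to their receivers.
    incoming = sum(a @ (h @ w) for a, w in zip(adj, edge_weights))  # [v, d]
    # The node update combines the aggregated messages with the previous node state
    # (the actual implementations use a GRU cell here).
    return np.tanh(incoming + h)
```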
32 | 33 | To run dense Gated Graph Neural Networks, use 34 | ``` 35 | python3 ./chem_tensorflow_dense.py 36 | ``` 37 | 38 | To run sparse Gated Graph Neural Networks, use 39 | ``` 40 | python3 ./chem_tensorflow_sparse.py 41 | ``` 42 | 43 | To run sparse Graph Convolutional Networks (as in [Kipf et al. 2016](https://arxiv.org/abs/1609.02907)), use 44 | ``` 45 | python3 ./chem_tensorflow_gcn.py 46 | ``` 47 | 48 | Finally, it turns out that the extension of GCN to different edge types is a variant of GGNN, and you can run 49 | GCN (as in [Schlichtkrull et al. 2017](https://arxiv.org/abs/1703.06103)) by calling 50 | ``` 51 | python3 ./chem_tensorflow_sparse.py --config '{"use_edge_bias": false, "use_edge_msg_avg_aggregation": true, "residual_connections": {}, "layer_timesteps": [1,1,1,1,1,1,1,1], "graph_rnn_cell": "RNN", "graph_rnn_activation": "ReLU"}' 52 | ``` 53 | 54 | To run asynchronous Gated Graph Neural Networks, use 55 | ``` 56 | python3 ./chem_tensorflow_async.py 57 | ``` 58 | 59 | ## Restoring models 60 | 61 | Suppose you have trained a model e.g. the following trains for a single epoch: 62 | 63 | ``` 64 | python3 ./chem_tensorflow_dense.py --config '{"num_epochs": 1}' 65 | == Epoch 1 66 | Train: loss: 0.52315 | acc: 0:0.64241 | error_ratio: 0:9.65831 | instances/sec: 6758.04 67 | Valid: loss: 0.26930 | acc: 0:0.55949 | error_ratio: 0:8.41163 | instances/sec: 9902.71 68 | (Best epoch so far, cum. val. acc decreased to 0.55949 from inf. Saving to './2018-02-01-11-30-05_16306_model_best.pickle') 69 | ``` 70 | 71 | Note that a checkpoint was stored to './2018-02-01-11-30-05_16306_model_best.pickle'. To restore this model and continue training, use: 72 | ``` 73 | python3 ./chem_tensorflow_dense.py --restore ./2018-02-01-11-30-05_16306_model_best.pickle 74 | ``` 75 | 76 | 77 | 78 | 79 | ## Contributing 80 | 81 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 82 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 83 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 84 | 85 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 86 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 87 | provided by the bot. You will only need to do this once across all repos using our CLA. 88 | 89 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 90 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 91 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
92 | -------------------------------------------------------------------------------- /chem_tensorflow.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | 3 | import json 4 | import os 5 | import pickle 6 | import random 7 | import time 8 | from typing import List, Any, Sequence 9 | 10 | import numpy as np 11 | import tensorflow as tf 12 | 13 | from utils import MLP, ThreadedIterator, SMALL_NUMBER 14 | 15 | 16 | class ChemModel(object): 17 | @classmethod 18 | def default_params(cls): 19 | return { 20 | 'num_epochs': 3000, 21 | 'patience': 25, 22 | 'learning_rate': 0.001, 23 | 'clamp_gradient_norm': 1.0, 24 | 'out_layer_dropout_keep_prob': 1.0, 25 | 26 | 'hidden_size': 100, 27 | 'num_timesteps': 4, 28 | 'use_graph': True, 29 | 30 | 'tie_fwd_bkwd': True, 31 | 'task_ids': [0], 32 | 33 | 'random_seed': 0, 34 | 35 | 'train_file': 'molecules_train.json', 36 | 'valid_file': 'molecules_valid.json' 37 | } 38 | 39 | def __init__(self, args): 40 | self.args = args 41 | 42 | # Collect argument things: 43 | data_dir = '' 44 | if '--data_dir' in args and args['--data_dir'] is not None: 45 | data_dir = args['--data_dir'] 46 | self.data_dir = data_dir 47 | 48 | self.run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())]) 49 | log_dir = args.get('--log_dir') or '.' 50 | os.makedirs(log_dir, exist_ok=True) 51 | self.log_file = os.path.join(log_dir, "%s_log.json" % self.run_id) 52 | self.best_model_file = os.path.join(log_dir, "%s_model_best.pickle" % self.run_id) 53 | tb_log_dir = os.path.join(log_dir, "tb", self.run_id) 54 | os.makedirs(tb_log_dir, exist_ok=True) 55 | 56 | # Collect parameters: 57 | params = self.default_params() 58 | config_file = args.get('--config-file') 59 | if config_file is not None: 60 | with open(config_file, 'r') as f: 61 | params.update(json.load(f)) 62 | config = args.get('--config') 63 | if config is not None: 64 | params.update(json.loads(config)) 65 | self.params = params 66 | with open(os.path.join(log_dir, "%s_params.json" % self.run_id), "w") as f: 67 | json.dump(params, f) 68 | print("Run %s starting with following parameters:\n%s" % (self.run_id, json.dumps(self.params))) 69 | random.seed(params['random_seed']) 70 | np.random.seed(params['random_seed']) 71 | 72 | # Load data: 73 | self.max_num_vertices = 0 74 | self.num_edge_types = 0 75 | self.annotation_size = 0 76 | self.train_data = self.load_data(params['train_file'], is_training_data=True) 77 | self.valid_data = self.load_data(params['valid_file'], is_training_data=False) 78 | 79 | # Build the actual model 80 | config = tf.ConfigProto() 81 | config.gpu_options.allow_growth = True 82 | self.graph = tf.Graph() 83 | self.sess = tf.Session(graph=self.graph, config=config) 84 | with self.graph.as_default(): 85 | tf.set_random_seed(params['random_seed']) 86 | self.placeholders = {} 87 | self.weights = {} 88 | self.ops = {} 89 | self.make_model() 90 | self.make_train_step() 91 | self.make_summaries() 92 | 93 | # Restore/initialize variables: 94 | restore_file = args.get('--restore') 95 | if restore_file is not None: 96 | self.train_step_id, self.valid_step_id = self.restore_progress(restore_file) 97 | else: 98 | self.initialize_model() 99 | self.train_step_id = 0 100 | self.valid_step_id = 0 101 | self.train_writer = tf.summary.FileWriter(os.path.join(tb_log_dir, 'train'), graph=self.graph) 102 | self.valid_writer = tf.summary.FileWriter(os.path.join(tb_log_dir, 'validation'), graph=self.graph) 103 | 104 | def load_data(self, file_name, 
is_training_data: bool): 105 | full_path = os.path.join(self.data_dir, file_name) 106 | 107 | print("Loading data from %s" % full_path) 108 | with open(full_path, 'r') as f: 109 | data = json.load(f) 110 | 111 | restrict = self.args.get("--restrict_data") 112 | if restrict is not None and restrict > 0: 113 | data = data[:restrict] 114 | 115 | # Get some common data out: 116 | num_fwd_edge_types = 0 117 | for g in data: 118 | self.max_num_vertices = max(self.max_num_vertices, max([v for e in g['graph'] for v in [e[0], e[2]]])) 119 | num_fwd_edge_types = max(num_fwd_edge_types, max([e[1] for e in g['graph']])) 120 | self.num_edge_types = max(self.num_edge_types, num_fwd_edge_types * (1 if self.params['tie_fwd_bkwd'] else 2)) 121 | self.annotation_size = max(self.annotation_size, len(data[0]["node_features"][0])) 122 | 123 | return self.process_raw_graphs(data, is_training_data) 124 | 125 | @staticmethod 126 | def graph_string_to_array(graph_string: str) -> List[List[int]]: 127 | return [[int(v) for v in s.split(' ')] 128 | for s in graph_string.split('\n')] 129 | 130 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 131 | raise Exception("Models have to implement process_raw_graphs!") 132 | 133 | def make_model(self): 134 | self.placeholders['target_values'] = tf.placeholder(tf.float32, [len(self.params['task_ids']), None], 135 | name='target_values') 136 | self.placeholders['target_mask'] = tf.placeholder(tf.float32, [len(self.params['task_ids']), None], 137 | name='target_mask') 138 | self.placeholders['num_graphs'] = tf.placeholder(tf.int32, [], name='num_graphs') 139 | self.placeholders['out_layer_dropout_keep_prob'] = tf.placeholder(tf.float32, [], name='out_layer_dropout_keep_prob') 140 | 141 | with tf.variable_scope("graph_model"): 142 | self.prepare_specific_graph_model() 143 | # This does the actual graph work: 144 | if self.params['use_graph']: 145 | self.ops['final_node_representations'] = self.compute_final_node_representations() 146 | else: 147 | self.ops['final_node_representations'] = tf.zeros_like(self.placeholders['initial_node_representation']) 148 | 149 | self.ops['losses'] = [] 150 | for (internal_id, task_id) in enumerate(self.params['task_ids']): 151 | with tf.variable_scope("out_layer_task%i" % task_id): 152 | with tf.variable_scope("regression_gate"): 153 | self.weights['regression_gate_task%i' % task_id] = MLP(2 * self.params['hidden_size'], 1, [], 154 | self.placeholders['out_layer_dropout_keep_prob']) 155 | with tf.variable_scope("regression"): 156 | self.weights['regression_transform_task%i' % task_id] = MLP(self.params['hidden_size'], 1, [], 157 | self.placeholders['out_layer_dropout_keep_prob']) 158 | computed_values = self.gated_regression(self.ops['final_node_representations'], 159 | self.weights['regression_gate_task%i' % task_id], 160 | self.weights['regression_transform_task%i' % task_id]) 161 | diff = computed_values - self.placeholders['target_values'][internal_id, :] 162 | task_target_mask = self.placeholders['target_mask'][internal_id, :] 163 | task_target_num = tf.reduce_sum(task_target_mask) + SMALL_NUMBER 164 | diff = diff * task_target_mask # Mask out unused values 165 | self.ops['accuracy_task%i' % task_id] = tf.reduce_sum(tf.abs(diff)) / task_target_num 166 | task_loss = tf.reduce_sum(0.5 * tf.square(diff)) / task_target_num 167 | # Normalise loss to account for fewer task-specific examples in batch: 168 | task_loss = task_loss * (1.0 / (self.params['task_sample_ratios'].get(task_id) or 1.0)) 169 | 
self.ops['losses'].append(task_loss) 170 | self.ops['loss'] = tf.reduce_sum(self.ops['losses']) 171 | 172 | def make_train_step(self): 173 | trainable_vars = self.sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 174 | if self.args.get('--freeze-graph-model'): 175 | graph_vars = set(self.sess.graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="graph_model")) 176 | filtered_vars = [] 177 | for var in trainable_vars: 178 | if var not in graph_vars: 179 | filtered_vars.append(var) 180 | else: 181 | print("Freezing weights of variable %s." % var.name) 182 | trainable_vars = filtered_vars 183 | optimizer = tf.train.AdamOptimizer(self.params['learning_rate']) 184 | grads_and_vars = optimizer.compute_gradients(self.ops['loss'], var_list=trainable_vars) 185 | clipped_grads = [] 186 | for grad, var in grads_and_vars: 187 | if grad is not None: 188 | clipped_grads.append((tf.clip_by_norm(grad, self.params['clamp_gradient_norm']), var)) 189 | else: 190 | clipped_grads.append((grad, var)) 191 | self.ops['train_step'] = optimizer.apply_gradients(clipped_grads) 192 | # Initialize newly-introduced variables: 193 | self.sess.run(tf.local_variables_initializer()) 194 | 195 | def make_summaries(self): 196 | with tf.name_scope('summary'): 197 | tf.summary.scalar('loss', self.ops['loss']) 198 | for task_id in self.params['task_ids']: 199 | tf.summary.scalar('accuracy%i' % task_id, self.ops['accuracy_task%i' % task_id]) 200 | self.ops['summary'] = tf.summary.merge_all() 201 | 202 | def gated_regression(self, last_h, regression_gate, regression_transform): 203 | raise Exception("Models have to implement gated_regression!") 204 | 205 | def prepare_specific_graph_model(self) -> None: 206 | raise Exception("Models have to implement prepare_specific_graph_model!") 207 | 208 | def compute_final_node_representations(self) -> tf.Tensor: 209 | raise Exception("Models have to implement compute_final_node_representations!") 210 | 211 | def make_minibatch_iterator(self, data: Any, is_training: bool): 212 | raise Exception("Models have to implement make_minibatch_iterator!") 213 | 214 | def run_epoch(self, epoch_name: str, data, is_training: bool, start_step: int = 0): 215 | chemical_accuracies = np.array([0.066513725, 0.012235489, 0.071939046, 0.033730778, 0.033486113, 0.004278493, 216 | 0.001330901, 0.004165489, 0.004128926, 0.00409976, 0.004527465, 0.012292586, 217 | 0.037467458]) 218 | 219 | loss = 0 220 | accuracies = [] 221 | accuracy_ops = [self.ops['accuracy_task%i' % task_id] for task_id in self.params['task_ids']] 222 | start_time = time.time() 223 | processed_graphs = 0 224 | steps = 0 225 | batch_iterator = ThreadedIterator(self.make_minibatch_iterator(data, is_training), max_queue_size=5) 226 | for step, batch_data in enumerate(batch_iterator): 227 | num_graphs = batch_data[self.placeholders['num_graphs']] 228 | processed_graphs += num_graphs 229 | if is_training: 230 | batch_data[self.placeholders['out_layer_dropout_keep_prob']] = self.params['out_layer_dropout_keep_prob'] 231 | fetch_list = [self.ops['loss'], accuracy_ops, self.ops['summary'], self.ops['train_step']] 232 | else: 233 | batch_data[self.placeholders['out_layer_dropout_keep_prob']] = 1.0 234 | fetch_list = [self.ops['loss'], accuracy_ops, self.ops['summary']] 235 | result = self.sess.run(fetch_list, feed_dict=batch_data) 236 | (batch_loss, batch_accuracies, batch_summary) = (result[0], result[1], result[2]) 237 | writer = self.train_writer if is_training else self.valid_writer 238 | writer.add_summary(batch_summary, 
start_step + step) 239 | loss += batch_loss * num_graphs 240 | accuracies.append(np.array(batch_accuracies) * num_graphs) 241 | 242 | print("Running %s, batch %i (has %i graphs). Loss so far: %.4f" % (epoch_name, 243 | step, 244 | num_graphs, 245 | loss / processed_graphs), 246 | end='\r') 247 | steps += 1 248 | 249 | accuracies = np.sum(accuracies, axis=0) / processed_graphs 250 | loss = loss / processed_graphs 251 | error_ratios = accuracies / chemical_accuracies[self.params["task_ids"]] 252 | instance_per_sec = processed_graphs / (time.time() - start_time) 253 | return loss, accuracies, error_ratios, instance_per_sec, steps 254 | 255 | def train(self): 256 | log_to_save = [] 257 | total_time_start = time.time() 258 | with self.graph.as_default(): 259 | if self.args.get('--restore') is not None: 260 | _, valid_accs, _, _, steps = self.run_epoch("Resumed (validation)", self.valid_data, False) 261 | best_val_acc = np.sum(valid_accs) 262 | best_val_acc_epoch = 0 263 | print("\r\x1b[KResumed operation, initial cum. val. acc: %.5f" % best_val_acc) 264 | else: 265 | (best_val_acc, best_val_acc_epoch) = (float("+inf"), 0) 266 | for epoch in range(1, self.params['num_epochs'] + 1): 267 | print("== Epoch %i" % epoch) 268 | train_loss, train_accs, train_errs, train_speed, train_steps = self.run_epoch("epoch %i (training)" % epoch, 269 | self.train_data, True, self.train_step_id) 270 | self.train_step_id += train_steps 271 | accs_str = " ".join(["%i:%.5f" % (id, acc) for (id, acc) in zip(self.params['task_ids'], train_accs)]) 272 | errs_str = " ".join(["%i:%.5f" % (id, err) for (id, err) in zip(self.params['task_ids'], train_errs)]) 273 | print("\r\x1b[K Train: loss: %.5f | acc: %s | error_ratio: %s | instances/sec: %.2f" % (train_loss, 274 | accs_str, 275 | errs_str, 276 | train_speed)) 277 | valid_loss, valid_accs, valid_errs, valid_speed, valid_steps = self.run_epoch("epoch %i (validation)" % epoch, 278 | self.valid_data, False, self.valid_step_id) 279 | self.valid_step_id += valid_steps 280 | accs_str = " ".join(["%i:%.5f" % (id, acc) for (id, acc) in zip(self.params['task_ids'], valid_accs)]) 281 | errs_str = " ".join(["%i:%.5f" % (id, err) for (id, err) in zip(self.params['task_ids'], valid_errs)]) 282 | print("\r\x1b[K Valid: loss: %.5f | acc: %s | error_ratio: %s | instances/sec: %.2f" % (valid_loss, 283 | accs_str, 284 | errs_str, 285 | valid_speed)) 286 | 287 | epoch_time = time.time() - total_time_start 288 | log_entry = { 289 | 'epoch': epoch, 290 | 'time': epoch_time, 291 | 'train_results': (train_loss, train_accs.tolist(), train_errs.tolist(), train_speed), 292 | 'valid_results': (valid_loss, valid_accs.tolist(), valid_errs.tolist(), valid_speed), 293 | } 294 | log_to_save.append(log_entry) 295 | with open(self.log_file, 'w') as f: 296 | json.dump(log_to_save, f, indent=4) 297 | 298 | val_acc = np.sum(valid_accs) # type: float 299 | if val_acc < best_val_acc: 300 | self.save_progress(self.best_model_file, self.train_step_id, self.valid_step_id) 301 | print(" (Best epoch so far, cum. val. acc decreased to %.5f from %.5f. Saving to '%s')" % ( 302 | val_acc, best_val_acc, self.best_model_file)) 303 | best_val_acc = val_acc 304 | best_val_acc_epoch = epoch 305 | elif epoch - best_val_acc_epoch >= self.params['patience']: 306 | print("Stopping training after %i epochs without improvement on validation accuracy." 
% self.params['patience']) 307 | break 308 | 309 | def save_progress(self, model_path: str, train_step: int, valid_step: int) -> None: 310 | weights_to_save = {} 311 | for variable in self.sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES): 312 | assert variable.name not in weights_to_save 313 | weights_to_save[variable.name] = self.sess.run(variable) 314 | 315 | data_to_save = { 316 | "params": self.params, 317 | "weights": weights_to_save, 318 | "train_step": train_step, 319 | "valid_step": valid_step, 320 | } 321 | 322 | with open(model_path, 'wb') as out_file: 323 | pickle.dump(data_to_save, out_file, pickle.HIGHEST_PROTOCOL) 324 | 325 | def initialize_model(self) -> None: 326 | init_op = tf.group(tf.global_variables_initializer(), 327 | tf.local_variables_initializer()) 328 | self.sess.run(init_op) 329 | 330 | def restore_progress(self, model_path: str) -> (int, int): 331 | print("Restoring weights from file %s." % model_path) 332 | with open(model_path, 'rb') as in_file: 333 | data_to_load = pickle.load(in_file) 334 | 335 | # Assert that we got the same model configuration 336 | assert len(self.params) == len(data_to_load['params']) 337 | for (par, par_value) in self.params.items(): 338 | # Fine to have different task_ids: 339 | if par not in ['task_ids', 'num_epochs']: 340 | assert par_value == data_to_load['params'][par] 341 | 342 | variables_to_initialize = [] 343 | with tf.name_scope("restore"): 344 | restore_ops = [] 345 | used_vars = set() 346 | for variable in self.sess.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES): 347 | used_vars.add(variable.name) 348 | if variable.name in data_to_load['weights']: 349 | restore_ops.append(variable.assign(data_to_load['weights'][variable.name])) 350 | else: 351 | print('Freshly initializing %s since no saved value was found.' % variable.name) 352 | variables_to_initialize.append(variable) 353 | for var_name in data_to_load['weights']: 354 | if var_name not in used_vars: 355 | print('Saved weights for %s not used by model.' % var_name) 356 | restore_ops.append(tf.variables_initializer(variables_to_initialize)) 357 | self.sess.run(restore_ops) 358 | 359 | return data_to_load['train_step'], data_to_load['valid_step'] 360 | -------------------------------------------------------------------------------- /chem_tensorflow_async.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | """ 3 | Usage: 4 | chem_tensorflow_async.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format). 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format). 10 | --log_dir DIR Log dir name. 11 | --data_dir DIR Data dir name. 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 
14 | """ 15 | from typing import List, Tuple, Dict, Sequence, Any 16 | 17 | from docopt import docopt 18 | from collections import defaultdict 19 | import numpy as np 20 | import tensorflow as tf 21 | import sys, traceback 22 | import pdb 23 | 24 | from chem_tensorflow import ChemModel 25 | from utils import glorot_init, SMALL_NUMBER 26 | 27 | 28 | def bfs_visit(outgoing_edges: Dict[int, Sequence[int]], node_depths: Dict[int, int], v: int, depth: int): 29 | # Already seen, skip: 30 | if v in node_depths: 31 | return 32 | node_depths[v] = depth 33 | for (_, __, w) in outgoing_edges[v]: 34 | bfs_visit(outgoing_edges, node_depths, w, depth + 1) 35 | 36 | 37 | class AsyncGGNNChemModel(ChemModel): 38 | def __init__(self, args): 39 | super().__init__(args) 40 | 41 | @classmethod 42 | def default_params(cls): 43 | params = dict(super().default_params()) 44 | params.update({ 45 | 'num_nodes': 100000, 46 | 'use_edge_bias': False, 47 | 48 | 'propagation_rounds': 4, # Has to be an even number 49 | 'propagation_substeps': 15, 50 | 51 | 'graph_rnn_cell': 'GRU', # GRU or RNN 52 | 'graph_rnn_activation': 'tanh', # tanh, ReLU 53 | 'graph_state_dropout_keep_prob': 1., 54 | 55 | 'task_sample_ratios': {}, 56 | }) 57 | return params 58 | 59 | def prepare_specific_graph_model(self) -> None: 60 | h_dim = self.params['hidden_size'] 61 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, [None, h_dim], 62 | name='node_features') 63 | 64 | # Initial nodes I_{r}: Node IDs that will have no incoming edges in round r. 65 | self.placeholders['initial_nodes'] = [tf.placeholder(tf.int32, [None], name="initial_nodes_round%i" % prop_round) 66 | for prop_round in range(self.params['propagation_rounds'])] 67 | 68 | # Sending nodes S_{r,s,e}: Source node ids of edges propagating in step s of round r. 69 | # Restrictions: If v in S_{r,s,e}, then v in R_{r,s'} for s' < s or v in I_{r} 70 | self.placeholders['sending_nodes'] = [[[tf.placeholder(tf.int32, 71 | [None], 72 | name="sending_nodes_round%i_step%i_edgetyp%i" % (prop_round, step, edge_typ)) 73 | for edge_typ in range(self.num_edge_types)] 74 | for step in range(self.params['propagation_substeps'])] 75 | for prop_round in range(self.params['propagation_rounds'])] 76 | 77 | # Normalised edge target nodes T_{r,s}: Targets of edges propagating in step s of round r, normalised to a 78 | # continuous range starting from 0. This is used for aggregating messages from the sending nodes. 79 | self.placeholders['msg_targets'] = [[tf.placeholder(tf.int32, 80 | [None], 81 | name="msg_targets_nodes_round%i_step%i" % (prop_round, step)) 82 | for step in range(self.params['propagation_substeps'])] 83 | for prop_round in range(self.params['propagation_rounds'])] 84 | 85 | 86 | # Receiving nodes R_{r,s}: Target node ids of aggregated messages in propagation step s of round r. 
87 | # Restrictions: If v in R_{r,s}, v not in R_{r,s'} for all s' != s and v not in I_{r} 88 | self.placeholders['receiving_nodes'] = [[tf.placeholder(tf.int32, 89 | [None], 90 | name="receiving_nodes_round%i_step%i" % (prop_round, step)) 91 | for step in range(self.params['propagation_substeps'])] 92 | for prop_round in range(self.params['propagation_rounds'])] 93 | 94 | # Number of receiving nodes N_{r,s} 95 | # Restrictions: N_{r,s} = len(R_{r,s}) 96 | self.placeholders['receiving_node_num'] = [tf.placeholder(tf.int32, 97 | [self.params['propagation_substeps']], 98 | name="receiving_nodes_num_round%i" % (prop_round,)) 99 | for prop_round in range(self.params['propagation_rounds'])] 100 | 101 | self.placeholders['graph_nodes_list'] = tf.placeholder(tf.int32, [None], name='graph_nodes_list') 102 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 103 | 104 | activation_name = self.params['graph_rnn_activation'].lower() 105 | if activation_name == 'tanh': 106 | activation_fun = tf.nn.tanh 107 | elif activation_name == 'relu': 108 | activation_fun = tf.nn.relu 109 | else: 110 | raise Exception("Unknown activation function type '%s'." % activation_name) 111 | 112 | # Generate per-layer values for edge weights, biases and gated units. If we tie them, they are just copies: 113 | self.weights['edge_weights'] = [tf.Variable(glorot_init([h_dim, h_dim]), name='gnn_edge_weights_typ%i' % e_typ) 114 | for e_typ in range(self.num_edge_types)] 115 | 116 | if self.params['use_edge_bias']: 117 | self.weights['edge_biases'] = [tf.Variable(np.zeros([h_dim], dtype=np.float32), name='gnn_edge_biases_typ%i' % e_typ) 118 | for e_typ in range(self.num_edge_types)] 119 | 120 | cell_type = self.params['graph_rnn_cell'].lower() 121 | if cell_type == 'gru': 122 | cell = tf.nn.rnn_cell.GRUCell(h_dim, activation=activation_fun) 123 | elif cell_type == 'rnn': 124 | cell = tf.nn.rnn_cell.BasicRNNCell(h_dim, activation=activation_fun) 125 | else: 126 | raise Exception("Unknown RNN cell type '%s'." 
% cell_type) 127 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 128 | state_keep_prob=self.placeholders['graph_state_keep_prob']) 129 | self.weights['rnn_cells'] = cell 130 | 131 | def compute_final_node_representations(self) -> tf.Tensor: 132 | cur_node_states = self.placeholders['initial_node_representation'] 133 | 134 | for prop_round in range(self.params['propagation_rounds']): 135 | with tf.variable_scope('prop_round%i' % (prop_round,)): 136 | # ---- Declare and fill tensor arrays used in tf.while_loop: 137 | sending_nodes_ta = tf.TensorArray(tf.int32, 138 | infer_shape=False, 139 | element_shape=[None], 140 | size=self.params['propagation_substeps'] * self.num_edge_types, 141 | name='sending_nodes') 142 | msg_targets_ta = tf.TensorArray(tf.int32, 143 | infer_shape=False, 144 | element_shape=[None], 145 | size=self.params['propagation_substeps'], 146 | name='msg_targets') 147 | receiving_nodes_ta = tf.TensorArray(tf.int32, 148 | infer_shape=False, 149 | element_shape=[None], 150 | size=self.params['propagation_substeps'], 151 | clear_after_read=False, 152 | name='receiving_nodes') 153 | receiving_node_num_ta = tf.TensorArray(tf.int32, 154 | infer_shape=False, 155 | element_shape=[], 156 | size=self.params['propagation_substeps'], 157 | name='receiving_nodes_num') 158 | 159 | for step in range(self.params['propagation_substeps']): 160 | for edge_typ in range(self.num_edge_types): 161 | sending_nodes_ta = sending_nodes_ta.write(step * self.num_edge_types + edge_typ, 162 | self.placeholders['sending_nodes'][prop_round][step][edge_typ]) 163 | msg_targets_ta = msg_targets_ta.write(step, self.placeholders['msg_targets'][prop_round][step]) 164 | receiving_nodes_ta = receiving_nodes_ta.write(step, self.placeholders['receiving_nodes'][prop_round][step]) 165 | receiving_node_num_ta = receiving_node_num_ta.unstack(self.placeholders['receiving_node_num'][prop_round]) 166 | 167 | new_node_states_ta = tf.TensorArray(tf.float32, 168 | infer_shape=False, 169 | element_shape=[self.params['hidden_size']], 170 | size=tf.shape(cur_node_states)[0], 171 | clear_after_read=False, 172 | name='new_node_states') 173 | 174 | # ---- Actual propagation schedule implementation: 175 | # Initialize the initial nodes with their state from last round: 176 | new_node_states_ta = new_node_states_ta.scatter(self.placeholders['initial_nodes'][prop_round], 177 | tf.gather(cur_node_states, self.placeholders['initial_nodes'][prop_round])) 178 | 179 | def do_substep(substep_id, new_node_states_ta): 180 | # For each edge active in this substep, pull source state and transform: 181 | sent_messages = [] 182 | for edge_typ in range(self.num_edge_types): 183 | sending_states = new_node_states_ta.gather(sending_nodes_ta.read(substep_id * self.num_edge_types + edge_typ)) 184 | messages = tf.matmul(sending_states, self.weights['edge_weights'][edge_typ]) 185 | if self.params['use_edge_bias']: 186 | messages += self.weights['edge_biases'][edge_typ] 187 | sent_messages.append(messages) 188 | 189 | # Stack all edge messages and aggregate as sum for each receiving node: 190 | sent_messages = tf.concat(sent_messages, axis=0) 191 | aggregated_received_messages = tf.unsorted_segment_sum(sent_messages, 192 | msg_targets_ta.read(substep_id), 193 | receiving_node_num_ta.read(substep_id)) 194 | 195 | # Collect old states for receiving nodes, and combine in RNN cell with incoming message 196 | substep_receiving_nodes = receiving_nodes_ta.read(substep_id) 197 | old_receiving_node_states = tf.gather(cur_node_states, substep_receiving_nodes) 198 | 
aggregated_received_messages.set_shape([None, self.params['hidden_size']]) 199 | old_receiving_node_states.set_shape([None, self.params['hidden_size']]) 200 | substep_new_node_states = self.weights['rnn_cells'](aggregated_received_messages, 201 | old_receiving_node_states)[1] 202 | 203 | # Write updated states back: 204 | new_node_states_ta = new_node_states_ta.scatter(substep_receiving_nodes, substep_new_node_states) 205 | return (substep_id + 1, new_node_states_ta) 206 | 207 | def is_done(substep_id, new_node_states_ta_unused): 208 | return tf.logical_and(substep_id < self.params['propagation_substeps'], 209 | tf.greater(tf.shape(receiving_nodes_ta.read(substep_id))[0], 0)) 210 | 211 | _, new_node_states_ta = tf.while_loop(cond=is_done, 212 | body=do_substep, 213 | loop_vars=[tf.constant(0), new_node_states_ta] 214 | ) 215 | 216 | cur_node_states = new_node_states_ta.stack(name="state_stack_round%i" % (prop_round,)) 217 | 218 | return cur_node_states 219 | 220 | def gated_regression(self, last_h, regression_gate, regression_transform): 221 | # last_h: [v x h] 222 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis=-1) # [v x 2h] 223 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [v x 1] 224 | 225 | # Sum up all nodes per graph 226 | graph_representations = tf.unsorted_segment_sum(data=gated_outputs, 227 | segment_ids=self.placeholders['graph_nodes_list'], 228 | num_segments=self.placeholders['num_graphs']) # [g x 1] 229 | return tf.squeeze(graph_representations) # [g] 230 | 231 | # ----- Data preprocessing and chunking into minibatches: 232 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 233 | processed_graphs = [] 234 | for d in raw_data: 235 | prop_schedules = self.__graph_to_propagation_schedules(d['graph']) 236 | processed_graphs.append({"init": d["node_features"], 237 | "prop_schedules": prop_schedules, 238 | "target_values": [d["targets"][task_id][0] for task_id in self.params['task_ids']]}) 239 | 240 | if is_training_data: 241 | np.random.shuffle(processed_graphs) 242 | for task_id in self.params['task_ids']: 243 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 244 | if task_sample_ratio is not None: 245 | ex_to_sample = int(len(processed_graphs) * task_sample_ratio) 246 | for ex_id in range(ex_to_sample, len(processed_graphs)): 247 | processed_graphs[ex_id]['target_values'][task_id] = None 248 | 249 | return processed_graphs 250 | 251 | def __tensorise_edge_sequence(self, edges)\ 252 | -> Tuple[np.ndarray, List[List[np.ndarray]], List[List[np.ndarray]], List[np.ndarray]]: 253 | sending_nodes = [] # type: List[List[np.ndarray]] 254 | msg_targets = [] # type: List[List[np.ndarray]] 255 | receiving_nodes = [] # type: List[np.ndarray] 256 | all_nodes = set() 257 | for step_edges in edges: 258 | msg_targets_uniq = set(w for (_, __, w) in step_edges) 259 | recv_nodes = list(sorted(msg_targets_uniq)) 260 | recv_nodes_to_uniq_id = {v: i for (i, v) in enumerate(recv_nodes)} 261 | 262 | sending_nodes_in_step = [] 263 | msg_targets_in_step = [] 264 | for target_e_typ in range(self.num_edge_types): 265 | sending_nodes_in_step.append(np.array([v for (v, e_typ, _) in step_edges if e_typ == target_e_typ], dtype=np.int32)) 266 | msg_targets_in_step.append(np.array([recv_nodes_to_uniq_id[w] for (_, e_typ, w) in step_edges if e_typ == target_e_typ], dtype=np.int32)) 267 | msg_targets.append(msg_targets_in_step) 268 | 
sending_nodes.append(sending_nodes_in_step) 269 | receiving_nodes.append(np.array(recv_nodes, dtype=np.int32)) 270 | all_nodes.update(v for (v, _, __) in step_edges) 271 | all_nodes.update(w for (_, __, w) in step_edges) 272 | 273 | all_updated_nodes = set() 274 | all_updated_nodes.update(v for step_receiving_nodes in receiving_nodes 275 | for v in step_receiving_nodes) 276 | initial_nodes = list(sorted(all_nodes - all_updated_nodes)) 277 | 278 | #initialised_nodes = set() 279 | #initialised_nodes.update(initial_nodes) 280 | #for step in range(len(receiving_nodes)): 281 | # sent_nodes = set() 282 | # for edge_typ in range(self.num_edge_types): 283 | # sent_nodes.update(sending_nodes[step][edge_typ]) 284 | # for v in sent_nodes: 285 | # assert v in initialised_nodes 286 | # 287 | # for v in receiving_nodes[step]: 288 | # assert v not in initialised_nodes 289 | # initialised_nodes.update(receiving_nodes[step]) 290 | 291 | return (np.array(initial_nodes, dtype=np.int32), sending_nodes, msg_targets, receiving_nodes) 292 | 293 | def __graph_to_propagation_schedules(self, graph)\ 294 | -> List[Tuple[np.ndarray, List[List[np.ndarray]], List[List[np.ndarray]], List[np.ndarray]]]: 295 | num_incoming_edges = defaultdict(lambda: 0) 296 | outgoing_edges = defaultdict(lambda: []) 297 | # Compute number of incoming edges per node, and build adjacency lists: 298 | for (v, typ, w) in graph: 299 | num_incoming_edges[v] += 1 300 | num_incoming_edges[w] += 1 301 | edge_bwd_typ = typ if self.params['tie_fwd_bkwd'] else self.num_edge_types + typ 302 | outgoing_edges[v].append((v, typ, w)) 303 | outgoing_edges[w].append((w, edge_bwd_typ, v)) 304 | 305 | # Sort them, pick node with lowest number of incoming edges: 306 | tensorised_prop_schedules = [] 307 | for prop_round in range(int(self.params['propagation_rounds'] / 2)): 308 | dag_seed = min(num_incoming_edges.items(), key=lambda t: t[1])[prop_round] 309 | node_depths = {} 310 | bfs_visit(outgoing_edges, node_depths, dag_seed, 0) 311 | 312 | # Now split edges into forward/backward sets, by using their depths. 313 | # Intuitively, a node with depth h will get updated in step h. 314 | max_depth = max(node_depths.values()) 315 | assert(max_depth <= self.params['propagation_substeps']) 316 | fwd_pass_edges = [[] for _ in range(max_depth)] 317 | bwd_pass_edges = [[] for _ in range(max_depth)] 318 | for (v, typ, w) in graph: 319 | edge_bwd_type = typ if self.params['tie_fwd_bkwd'] else self.num_edge_types + typ 320 | v_depth = node_depths[v] 321 | w_depth = node_depths[w] 322 | if v_depth < w_depth: # "Forward": We are going up in depth: 323 | fwd_pass_edges[w_depth - 1].append((v, typ, w)) 324 | bwd_pass_edges[-v_depth - 1].append((w, edge_bwd_type, v)) 325 | elif w_depth < v_depth: # "Backward": We are going down in depth 326 | fwd_pass_edges[v_depth - 1].append((w, edge_bwd_type, v)) 327 | bwd_pass_edges[-w_depth - 1].append((v, typ, w)) 328 | else: 329 | # We ignore self-loops: 330 | assert v == w 331 | 332 | tensorised_prop_schedules.append(self.__tensorise_edge_sequence(fwd_pass_edges)) 333 | tensorised_prop_schedules.append(self.__tensorise_edge_sequence(bwd_pass_edges)) 334 | 335 | return tensorised_prop_schedules 336 | 337 | def make_minibatch_iterator(self, data: Any, is_training: bool): 338 | """Create minibatches by flattening graphs into a single one with multiple disconnected components.""" 339 | if is_training: 340 | np.random.shuffle(data) 341 | dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 
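        # Batching overview (summary of the loop below): graphs are packed into one flattened batch
        # until the next graph would no longer fit under the 'num_nodes' node budget. Per-graph node
        # indices are shifted by node_offset so that all graphs in the batch share a single node-ID
        # space, turning the batch into one large graph with disconnected components.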
342 | 343 | # Pack until we cannot fit more graphs in the batch 344 | num_graphs = 0 345 | while num_graphs < len(data): 346 | num_graphs_in_batch = 0 347 | batch_node_features = [] 348 | batch_target_task_values = [] 349 | batch_target_task_mask = [] 350 | batch_graph_nodes_list = [] 351 | node_offset = 0 352 | 353 | # Collect all indices; we'll strip out the batch dimension with a np.concatenate along that axis at the end: 354 | batch_initial_nodes = [[] for _ in range(self.params['propagation_rounds']) 355 | ] # type: List[List[np.ndarray]] # (prop_round, batch, None) 356 | batch_sending_nodes = [[[[] for _ in range(self.num_edge_types)] 357 | for _ in range(self.params['propagation_substeps'])] 358 | for _ in range(self.params['propagation_rounds']) 359 | ] # type: List[List[List[List[np.ndarray]]]] # (prop_round, step, edge_typ, batch, None) 360 | batch_msg_targets = [[[[] for _ in range(self.num_edge_types)] 361 | for _ in range(self.params['propagation_substeps'])] 362 | for _ in range(self.params['propagation_rounds']) 363 | ] # type: List[List[List[List[np.ndarray]]]] # (prop_round, step, edge_typ, batch, None) 364 | batch_receiving_nodes = [[[] for _ in range(self.params['propagation_substeps'])] 365 | for _ in range(self.params['propagation_rounds']) 366 | ] # type: List[List[List[np.ndarray]]] # (prop_round, step, batch, None) 367 | batch_receiving_node_num = [[0 for _ in range(self.params['propagation_substeps'])] 368 | for _ in range(self.params['propagation_rounds']) 369 | ] # type: List[List[int]] # (prop_round, step) 370 | 371 | msg_target_offsets = [[[0 for _ in range(self.num_edge_types)] 372 | for _ in range(self.params['propagation_substeps'])] 373 | for _ in range(self.params['propagation_rounds'])] 374 | 375 | while num_graphs < len(data) and node_offset + len(data[num_graphs]['init']) < self.params['num_nodes']: 376 | cur_graph = data[num_graphs] 377 | num_nodes_in_graph = len(cur_graph['init']) 378 | padded_features = np.pad(cur_graph['init'], 379 | ((0, 0), (0, self.params['hidden_size'] - self.annotation_size)), 380 | 'constant') 381 | batch_node_features.extend(padded_features) 382 | batch_graph_nodes_list.append(np.full(shape=[num_nodes_in_graph], fill_value=num_graphs_in_batch, dtype=np.int32)) 383 | 384 | # Combine the different propagation schedules: 385 | for prop_round in range(self.params['propagation_rounds']): 386 | cur_prop_schedule = cur_graph['prop_schedules'][prop_round] 387 | (graph_initial_nodes, 388 | graph_sending_nodes, 389 | graph_msg_targets, 390 | graph_recv_nodes) = cur_prop_schedule 391 | batch_initial_nodes[prop_round].append(graph_initial_nodes + node_offset) 392 | for step in range(self.params['propagation_substeps']): 393 | # Stop if we don't have that many steps: 394 | if step >= len(graph_sending_nodes): 395 | break 396 | 397 | for e_typ in range(self.num_edge_types): 398 | batch_sending_nodes[prop_round][step][e_typ].append(graph_sending_nodes[step][e_typ] + node_offset) 399 | batch_msg_targets[prop_round][step][e_typ].append(graph_msg_targets[step][e_typ] + msg_target_offsets[prop_round][step][e_typ]) 400 | if len(graph_msg_targets[step][e_typ]) > 0: 401 | msg_target_offsets[prop_round][step][e_typ] += max(graph_msg_targets[step][e_typ]) + 1 # ... 0-based indexing! 
402 | batch_receiving_nodes[prop_round][step].append(graph_recv_nodes[step] + node_offset) 403 | batch_receiving_node_num[prop_round][step] += len(graph_recv_nodes[step]) 404 | 405 | target_task_values = [] 406 | target_task_mask = [] 407 | for target_val in cur_graph['target_values']: 408 | if target_val is None: # This is one of the examples we didn't sample... 409 | target_task_values.append(0.) 410 | target_task_mask.append(0.) 411 | else: 412 | target_task_values.append(target_val) 413 | target_task_mask.append(1.) 414 | batch_target_task_values.append(target_task_values) 415 | batch_target_task_mask.append(target_task_mask) 416 | num_graphs += 1 417 | num_graphs_in_batch += 1 418 | node_offset += num_nodes_in_graph 419 | 420 | batch_feed_dict = { 421 | self.placeholders['initial_node_representation']: np.array(batch_node_features), 422 | self.placeholders['graph_nodes_list']: np.concatenate(batch_graph_nodes_list, axis=0), 423 | self.placeholders['target_values']: np.transpose(batch_target_task_values, axes=[1,0]), 424 | self.placeholders['target_mask']: np.transpose(batch_target_task_mask, axes=[1, 0]), 425 | self.placeholders['num_graphs']: num_graphs_in_batch, 426 | self.placeholders['graph_state_keep_prob']: dropout_keep_prob, 427 | } 428 | 429 | for prop_round in range(self.params['propagation_rounds']): 430 | batch_feed_dict[self.placeholders['initial_nodes'][prop_round]] = \ 431 | np.concatenate(batch_initial_nodes[prop_round], axis=0) 432 | for step in range(self.params['propagation_substeps']): 433 | msg_targets = [] 434 | for edge_typ in range(self.num_edge_types): 435 | raw_senders = batch_sending_nodes[prop_round][step][edge_typ] 436 | batch_feed_dict[self.placeholders['sending_nodes'][prop_round][step][edge_typ]] = \ 437 | np.concatenate(raw_senders, axis=0) if len(raw_senders) > 0 else np.empty(shape=(0,), 438 | dtype=np.int32) 439 | raw_targets = batch_msg_targets[prop_round][step][edge_typ] 440 | msg_targets.extend(np.concatenate(raw_targets, axis=0) if len(raw_targets) > 0 else np.empty(shape=(0,), 441 | dtype=np.int32)) 442 | 443 | batch_feed_dict[self.placeholders['msg_targets'][prop_round][step]] = \ 444 | np.array(msg_targets, dtype=np.int32) 445 | raw_recvs = batch_receiving_nodes[prop_round][step] 446 | batch_feed_dict[self.placeholders['receiving_nodes'][prop_round][step]] = \ 447 | np.concatenate(raw_recvs, axis=0) if len(raw_recvs) > 0 else np.empty(shape=(0,), 448 | dtype=np.int32) 449 | batch_feed_dict[self.placeholders['receiving_node_num'][prop_round]] = \ 450 | np.array(batch_receiving_node_num[prop_round]) 451 | 452 | #self.check_batch_invariants(batch_feed_dict) 453 | yield batch_feed_dict 454 | 455 | 456 | def check_batch_invariants(self, batch_feed_dict): 457 | for prop_round in range(self.params['propagation_rounds']): 458 | initialised_nodes = set() 459 | initialised_nodes.update(batch_feed_dict[self.placeholders['initial_nodes'][prop_round]]) 460 | for step in range(self.params['propagation_substeps']): 461 | sending_nodes = set() 462 | for edge_typ in range(self.num_edge_types): 463 | sending_nodes.update(batch_feed_dict[self.placeholders['sending_nodes'][prop_round][step][edge_typ]]) 464 | for v in sending_nodes: 465 | assert v in initialised_nodes 466 | 467 | recv_nodes = batch_feed_dict[self.placeholders['receiving_nodes'][prop_round][step]] 468 | for v in recv_nodes: 469 | assert v not in initialised_nodes 470 | initialised_nodes.update(recv_nodes) 471 | 472 | def main(): 473 | args = docopt(__doc__) 474 | try: 475 | model = 
AsyncGGNNChemModel(args) 476 | model.train() 477 | except: 478 | typ, value, tb = sys.exc_info() 479 | traceback.print_exc() 480 | pdb.post_mortem(tb) 481 | 482 | 483 | if __name__ == "__main__": 484 | main() 485 | -------------------------------------------------------------------------------- /chem_tensorflow_dense.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | """ 3 | Usage: 4 | chem_tensorflow_dense.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format) 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format) 10 | --log_dir NAME log dir name 11 | --data_dir NAME data dir name 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 14 | --evaluate example evaluation mode using a restored model 15 | """ 16 | 17 | from typing import Sequence, Any 18 | from docopt import docopt 19 | from collections import defaultdict 20 | import numpy as np 21 | import tensorflow as tf 22 | import sys, traceback 23 | import pdb 24 | import json 25 | 26 | from chem_tensorflow import ChemModel 27 | from utils import glorot_init 28 | 29 | 30 | def graph_to_adj_mat(graph, max_n_vertices, num_edge_types, tie_fwd_bkwd=True): 31 | bwd_edge_offset = 0 if tie_fwd_bkwd else (num_edge_types // 2) 32 | amat = np.zeros((num_edge_types, max_n_vertices, max_n_vertices)) 33 | for src, e, dest in graph: 34 | amat[e-1, dest, src] = 1 35 | amat[e-1 + bwd_edge_offset, src, dest] = 1 36 | return amat 37 | 38 | 39 | ''' 40 | Comments provide the expected tensor shapes where helpful. 41 | 42 | Key to symbols in comments: 43 | --------------------------- 44 | [...]: a tensor 45 | ; ; : a list 46 | b: batch size 47 | e: number of edge types (4) 48 | v: number of vertices per graph in this batch 49 | h: GNN hidden size 50 | ''' 51 | 52 | class DenseGGNNChemModel(ChemModel): 53 | def __init__(self, args): 54 | super().__init__(args) 55 | 56 | @classmethod 57 | def default_params(cls): 58 | params = dict(super().default_params()) 59 | params.update({ 60 | 'batch_size': 256, 61 | 'graph_state_dropout_keep_prob': 1., 62 | 'task_sample_ratios': {}, 63 | 'use_edge_bias': True, 64 | 'edge_weight_dropout_keep_prob': 1 65 | }) 66 | return params 67 | 68 | def prepare_specific_graph_model(self) -> None: 69 | h_dim = self.params['hidden_size'] 70 | # inputs 71 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 72 | self.placeholders['edge_weight_dropout_keep_prob'] = tf.placeholder(tf.float32, None, name='edge_weight_dropout_keep_prob') 73 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, 74 | [None, None, self.params['hidden_size']], 75 | name='node_features') 76 | self.placeholders['node_mask'] = tf.placeholder(tf.float32, [None, None], name='node_mask') 77 | self.placeholders['num_vertices'] = tf.placeholder(tf.int32, ()) 78 | self.placeholders['adjacency_matrix'] = tf.placeholder(tf.float32, 79 | [None, self.num_edge_types, None, None]) # [b, e, v, v] 80 | self.__adjacency_matrix = tf.transpose(self.placeholders['adjacency_matrix'], [1, 0, 2, 3]) # [e, b, v, v] 81 | 82 | 83 | # weights 84 | self.weights['edge_weights'] = tf.Variable(glorot_init([self.num_edge_types, h_dim, h_dim])) 85 | if self.params['use_edge_bias']: 86 | self.weights['edge_biases'] = tf.Variable(np.zeros([self.num_edge_types, 1, 
h_dim]).astype(np.float32)) 87 | with tf.variable_scope("gru_scope"): 88 | cell = tf.contrib.rnn.GRUCell(h_dim) 89 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 90 | state_keep_prob=self.placeholders['graph_state_keep_prob']) 91 | self.weights['node_gru'] = cell 92 | 93 | def compute_final_node_representations(self) -> tf.Tensor: 94 | v = self.placeholders['num_vertices'] 95 | h_dim = self.params['hidden_size'] 96 | h = self.placeholders['initial_node_representation'] # [b, v, h] 97 | h = tf.reshape(h, [-1, h_dim]) 98 | 99 | with tf.variable_scope("gru_scope") as scope: 100 | for i in range(self.params['num_timesteps']): 101 | if i > 0: 102 | tf.get_variable_scope().reuse_variables() 103 | for edge_type in range(self.num_edge_types): 104 | m = tf.matmul(h, tf.nn.dropout(self.weights['edge_weights'][edge_type], 105 | keep_prob=self.placeholders['edge_weight_dropout_keep_prob'])) # [b*v, h] 106 | m = tf.reshape(m, [-1, v, h_dim]) # [b, v, h] 107 | if self.params['use_edge_bias']: 108 | m += self.weights['edge_biases'][edge_type] # [b, v, h] 109 | if edge_type == 0: 110 | acts = tf.matmul(self.__adjacency_matrix[edge_type], m) 111 | else: 112 | acts += tf.matmul(self.__adjacency_matrix[edge_type], m) 113 | acts = tf.reshape(acts, [-1, h_dim]) # [b*v, h] 114 | 115 | h = self.weights['node_gru'](acts, h)[1] # [b*v, h] 116 | last_h = tf.reshape(h, [-1, v, h_dim]) 117 | return last_h 118 | 119 | def gated_regression(self, last_h, regression_gate, regression_transform): 120 | # last_h: [b x v x h] 121 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis = 2) # [b, v, 2h] 122 | gate_input = tf.reshape(gate_input, [-1, 2 * self.params["hidden_size"]]) # [b*v, 2h] 123 | last_h = tf.reshape(last_h, [-1, self.params["hidden_size"]]) # [b*v, h] 124 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [b*v, 1] 125 | gated_outputs = tf.reshape(gated_outputs, [-1, self.placeholders['num_vertices']]) # [b, v] 126 | masked_gated_outputs = gated_outputs * self.placeholders['node_mask'] # [b x v] 127 | output = tf.reduce_sum(masked_gated_outputs, axis = 1) # [b] 128 | self.output = output 129 | return output 130 | 131 | # ----- Data preprocessing and chunking into minibatches: 132 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool, bucket_sizes=None) -> Any: 133 | if bucket_sizes is None: 134 | bucket_sizes = np.array(list(range(4, 28, 2)) + [29]) 135 | bucketed = defaultdict(list) 136 | x_dim = len(raw_data[0]["node_features"][0]) 137 | for d in raw_data: 138 | chosen_bucket_idx = np.argmax(bucket_sizes > max([v for e in d['graph'] 139 | for v in [e[0], e[2]]])) 140 | chosen_bucket_size = bucket_sizes[chosen_bucket_idx] 141 | n_active_nodes = len(d["node_features"]) 142 | bucketed[chosen_bucket_idx].append({ 143 | 'adj_mat': graph_to_adj_mat(d['graph'], chosen_bucket_size, self.num_edge_types, self.params['tie_fwd_bkwd']), 144 | 'init': d["node_features"] + [[0 for _ in range(x_dim)] for __ in 145 | range(chosen_bucket_size - n_active_nodes)], 146 | 'labels': [d["targets"][task_id][0] for task_id in self.params['task_ids']], 147 | 'mask': [1. for _ in range(n_active_nodes) ] + [0. 
for _ in range(chosen_bucket_size - n_active_nodes)] 148 | }) 149 | 150 | if is_training_data: 151 | for (bucket_idx, bucket) in bucketed.items(): 152 | np.random.shuffle(bucket) 153 | for task_id in self.params['task_ids']: 154 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 155 | if task_sample_ratio is not None: 156 | ex_to_sample = int(len(bucket) * task_sample_ratio) 157 | for ex_id in range(ex_to_sample, len(bucket)): 158 | bucket[ex_id]['labels'][task_id] = None 159 | 160 | bucket_at_step = [[bucket_idx for _ in range(len(bucket_data) // self.params['batch_size'])] 161 | for bucket_idx, bucket_data in bucketed.items()] 162 | bucket_at_step = [x for y in bucket_at_step for x in y] 163 | 164 | return (bucketed, bucket_sizes, bucket_at_step) 165 | 166 | def pad_annotations(self, annotations): 167 | return np.pad(annotations, 168 | pad_width=[[0, 0], [0, 0], [0, self.params['hidden_size'] - self.annotation_size]], 169 | mode='constant') 170 | 171 | 172 | def make_batch(self, elements): 173 | batch_data = {'adj_mat': [], 'init': [], 'labels': [], 'node_mask': [], 'task_masks': []} 174 | for d in elements: 175 | batch_data['adj_mat'].append(d['adj_mat']) 176 | batch_data['init'].append(d['init']) 177 | batch_data['node_mask'].append(d['mask']) 178 | 179 | target_task_values = [] 180 | target_task_mask = [] 181 | for target_val in d['labels']: 182 | if target_val is None: # This is one of the examples we didn't sample... 183 | target_task_values.append(0.) 184 | target_task_mask.append(0.) 185 | else: 186 | target_task_values.append(target_val) 187 | target_task_mask.append(1.) 188 | batch_data['labels'].append(target_task_values) 189 | batch_data['task_masks'].append(target_task_mask) 190 | 191 | return batch_data 192 | 193 | 194 | def make_minibatch_iterator(self, data, is_training: bool): 195 | (bucketed, bucket_sizes, bucket_at_step) = data 196 | if is_training: 197 | np.random.shuffle(bucket_at_step) 198 | for _, bucketed_data in bucketed.items(): 199 | np.random.shuffle(bucketed_data) 200 | 201 | bucket_counters = defaultdict(int) 202 | dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 
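        # bucket_at_step names, for each minibatch, the bucket it is drawn from; bucket_counters
        # tracks how many batches have already been taken from each bucket, so each step below
        # serves the next non-overlapping slice of 'batch_size' graphs from that bucket.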
203 | for step in range(len(bucket_at_step)): 204 | bucket = bucket_at_step[step] 205 | start_idx = bucket_counters[bucket] * self.params['batch_size'] 206 | end_idx = (bucket_counters[bucket] + 1) * self.params['batch_size'] 207 | elements = bucketed[bucket][start_idx:end_idx] 208 | batch_data = self.make_batch(elements) 209 | 210 | num_graphs = len(batch_data['init']) 211 | initial_representations = batch_data['init'] 212 | initial_representations = self.pad_annotations(initial_representations) 213 | 214 | batch_feed_dict = { 215 | self.placeholders['initial_node_representation']: initial_representations, 216 | self.placeholders['target_values']: np.transpose(batch_data['labels'], axes=[1,0]), 217 | self.placeholders['target_mask']: np.transpose(batch_data['task_masks'], axes=[1, 0]), 218 | self.placeholders['num_graphs']: num_graphs, 219 | self.placeholders['num_vertices']: bucket_sizes[bucket], 220 | self.placeholders['adjacency_matrix']: batch_data['adj_mat'], 221 | self.placeholders['node_mask']: batch_data['node_mask'], 222 | self.placeholders['graph_state_keep_prob']: dropout_keep_prob, 223 | self.placeholders['edge_weight_dropout_keep_prob']: dropout_keep_prob 224 | } 225 | 226 | bucket_counters[bucket] += 1 227 | 228 | yield batch_feed_dict 229 | 230 | def evaluate_one_batch(self, initial_node_representations, adjacency_matrices, node_masks=None): 231 | num_vertices = len(initial_node_representations[0]) 232 | if node_masks is None: 233 | node_masks = [] 234 | for r in initial_node_representations: 235 | node_masks.append([1. for _ in r] + [0. for _ in range(num_vertices - len(r))]) 236 | batch_feed_dict = { 237 | self.placeholders['initial_node_representation']: self.pad_annotations(initial_node_representations), 238 | self.placeholders['num_graphs']: len(initial_node_representations), 239 | self.placeholders['num_vertices']: len(initial_node_representations[0]), 240 | self.placeholders['adjacency_matrix']: adjacency_matrices, 241 | self.placeholders['node_mask']: node_masks, 242 | self.placeholders['graph_state_keep_prob']: 1.0, 243 | self.placeholders['out_layer_dropout_keep_prob']: 1.0, 244 | self.placeholders['edge_weight_dropout_keep_prob']: 1.0 245 | } 246 | 247 | fetch_list = self.output 248 | result = self.sess.run(fetch_list, feed_dict=batch_feed_dict) 249 | return result 250 | 251 | def example_evaluation(self): 252 | ''' Demonstration of what test-time code would look like 253 | we query the model with the first n_example_molecules from the validation file 254 | ''' 255 | n_example_molecules = 10 256 | with open('molecules_valid.json', 'r') as valid_file: 257 | example_molecules = json.load(valid_file)[:n_example_molecules] 258 | 259 | for mol in example_molecules: 260 | print(mol['targets']) 261 | 262 | example_molecules, _, _ = self.process_raw_graphs(example_molecules, 263 | is_training_data=False, bucket_sizes=np.array([29])) 264 | batch_data = self.make_batch(example_molecules[0]) 265 | print(self.evaluate_one_batch(batch_data['init'], batch_data['adj_mat'])) 266 | 267 | 268 | 269 | 270 | 271 | def main(): 272 | args = docopt(__doc__) 273 | try: 274 | model = DenseGGNNChemModel(args) 275 | 276 | if args['--evaluate']: 277 | model.example_evaluation() 278 | else: 279 | model.train() 280 | except: 281 | typ, value, tb = sys.exc_info() 282 | traceback.print_exc() 283 | pdb.post_mortem(tb) 284 | 285 | 286 | if __name__ == "__main__": 287 | main() 288 | -------------------------------------------------------------------------------- /chem_tensorflow_gcn.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | ''' 3 | Usage: 4 | chem_tensorflow_gcn.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format) 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format) 10 | --log_dir NAME log dir name 11 | --data_dir NAME data dir name 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 14 | ''' 15 | from typing import Tuple, Sequence, Any 16 | 17 | from docopt import docopt 18 | import numpy as np 19 | import tensorflow as tf 20 | 21 | import sys, traceback 22 | import pdb 23 | 24 | from chem_tensorflow import ChemModel 25 | from utils import glorot_init 26 | 27 | 28 | class SparseGCNChemModel(ChemModel): 29 | def __init__(self, args): 30 | super().__init__(args) 31 | 32 | @classmethod 33 | def default_params(cls): 34 | params = dict(super().default_params()) 35 | params.update({'batch_size': 100000, 36 | 'task_sample_ratios': {}, 37 | 'gcn_use_bias': False, 38 | 'graph_state_dropout_keep_prob': 1.0, 39 | }) 40 | return params 41 | 42 | def prepare_specific_graph_model(self) -> None: 43 | h_dim = self.params['hidden_size'] 44 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, [None, h_dim], 45 | name='node_features') 46 | self.placeholders['adjacency_list'] = tf.placeholder(tf.int64, [None, 2], name='adjacency_list') 47 | self.placeholders['adjacency_weights'] = tf.placeholder(tf.float32, [None], name='adjacency_weights') 48 | self.placeholders['graph_nodes_list'] = tf.placeholder(tf.int32, [None], name='graph_nodes_list') 49 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 50 | 51 | with tf.variable_scope('gcn_scope'): 52 | self.weights['edge_weights'] = [tf.Variable(glorot_init((h_dim, h_dim)), name="gcn_weights_%i" % i) 53 | for i in range(self.params['num_timesteps'])] 54 | 55 | if self.params['gcn_use_bias']: 56 | self.weights['edge_biases'] = [tf.Variable(np.zeros([h_dim], dtype=np.float32), name="gcn_bias_%i" % i) 57 | for i in range(self.params['num_timesteps'])] 58 | 59 | def compute_final_node_representations(self): 60 | with tf.variable_scope('gcn_scope'): 61 | cur_node_states = self.placeholders['initial_node_representation'] # number of nodes in batch v x D 62 | num_nodes = tf.shape(self.placeholders['initial_node_representation'], out_type=tf.int64)[0] 63 | 64 | adjacency_matrix = tf.SparseTensor(indices=self.placeholders['adjacency_list'], 65 | values=self.placeholders['adjacency_weights'], 66 | dense_shape=[num_nodes, num_nodes]) 67 | 68 | for layer_idx in range(self.params['num_timesteps']): 69 | scaled_cur_node_states = tf.sparse_tensor_dense_matmul(adjacency_matrix, cur_node_states) # v x D 70 | new_node_states = tf.matmul(scaled_cur_node_states, self.weights['edge_weights'][layer_idx]) 71 | 72 | if self.params['gcn_use_bias']: 73 | new_node_states += self.weights['edge_biases'][layer_idx] # v x D 74 | 75 | # On all but final layer do ReLU and dropout: 76 | if layer_idx < self.params['num_timesteps'] - 1: 77 | new_node_states = tf.nn.relu(new_node_states) 78 | new_node_states = tf.nn.dropout(new_node_states, keep_prob=self.placeholders['graph_state_keep_prob']) 79 | 80 | cur_node_states = new_node_states 81 | 82 | return cur_node_states 83 | 84 | def gated_regression(self, last_h, regression_gate, regression_transform): 
85 | # last_h: [v x h] 86 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis=-1) # [v x 2h] 87 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [v x 1] 88 | 89 | # Sum up all nodes per-graph 90 | graph_representations = tf.unsorted_segment_sum(data=gated_outputs, 91 | segment_ids=self.placeholders['graph_nodes_list'], 92 | num_segments=self.placeholders['num_graphs']) # [g x 1] 93 | return tf.squeeze(graph_representations) # [g] 94 | 95 | # ----- Data preprocessing and chunking into minibatches: 96 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 97 | processed_graphs = [] 98 | for d in raw_data: 99 | (adjacency_list, adjacency_weights) = self.__graph_to_adjacency_list(d['graph'], len(d["node_features"])) 100 | processed_graphs.append({"adjacency_list": adjacency_list, 101 | "adjacency_weights": adjacency_weights, 102 | "init": d["node_features"], 103 | "labels": [d["targets"][task_id][0] for task_id in self.params['task_ids']]}) 104 | 105 | if is_training_data: 106 | np.random.shuffle(processed_graphs) 107 | for task_id in self.params['task_ids']: 108 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 109 | if task_sample_ratio is not None: 110 | ex_to_sample = int(len(processed_graphs) * task_sample_ratio) 111 | for ex_id in range(ex_to_sample, len(processed_graphs)): 112 | processed_graphs[ex_id]['labels'][task_id] = None 113 | 114 | return processed_graphs 115 | 116 | def __graph_to_adjacency_list(self, graph, num_nodes: int) -> Tuple[np.ndarray, np.ndarray]: 117 | # Step 1: Generate adjacency matrices: 118 | adj_matrix = np.zeros((num_nodes, num_nodes)) 119 | for src, _, dest in graph: 120 | adj_matrix[src, dest] = 1 121 | adj_matrix[dest, src] = 1 122 | 123 | # Step 2: Introduce self loops: 124 | self_loops = np.eye(num_nodes) 125 | adj_matrix += self_loops 126 | 127 | # Step 3: Normalize adj_matrices so that scale of vectors doesn't explode: 128 | row_sum = np.sum(adj_matrix, axis=-1) 129 | D_inv_sqrt = np.diag(np.power(row_sum, -0.5).flatten() + 1e-7) 130 | adj_matrix = D_inv_sqrt.dot(adj_matrix).dot(D_inv_sqrt) 131 | 132 | # Step 4: Turn into sorted adjacency lists: 133 | final_adj_list = [] 134 | final_adj_weights = [] 135 | for i in range(num_nodes): 136 | for j in range(num_nodes): 137 | w = adj_matrix[i, j] 138 | if w != 0: 139 | final_adj_list.append([i,j]) 140 | final_adj_weights.append(w) 141 | 142 | return np.array(final_adj_list), np.array(final_adj_weights) 143 | 144 | def make_minibatch_iterator(self, data: Any, is_training: bool): 145 | """Create minibatches by flattening adjacency matrices into a single adjacency matrix with 146 | multiple disconnected components.""" 147 | if is_training: 148 | np.random.shuffle(data) 149 | dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 
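        # Each minibatch is assembled as one large disconnected graph: the node indices in a
        # graph's adjacency list are shifted by node_offset before being appended, and
        # graph_nodes_list maps every node to its graph index so that gated_regression can
        # later sum node outputs per graph via unsorted_segment_sum.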
150 | # Pack until we cannot fit more graphs in the batch 151 | num_graphs = 0 152 | while num_graphs < len(data): 153 | num_graphs_in_batch = 0 154 | batch_node_features = [] 155 | batch_target_task_values = [] 156 | batch_target_task_mask = [] 157 | batch_adjacency_list = [] 158 | batch_adjacency_weights = [] 159 | batch_graph_nodes_list = [] 160 | node_offset = 0 161 | 162 | while num_graphs < len(data) and node_offset + len(data[num_graphs]['init']) < self.params['batch_size']: 163 | cur_graph = data[num_graphs] 164 | num_nodes_in_graph = len(cur_graph['init']) 165 | padded_features = np.pad(cur_graph['init'], 166 | ((0, 0), (0, self.params['hidden_size'] - self.annotation_size)), 167 | mode='constant') 168 | batch_node_features.extend(padded_features) 169 | batch_graph_nodes_list.append(np.full(shape=[num_nodes_in_graph], fill_value=num_graphs_in_batch, dtype=np.int32)) 170 | batch_adjacency_list.append(cur_graph['adjacency_list'] + node_offset) 171 | batch_adjacency_weights.append(cur_graph['adjacency_weights']) 172 | 173 | target_task_values = [] 174 | target_task_mask = [] 175 | for target_val in cur_graph['labels']: 176 | if target_val is None: # This is one of the examples we didn't sample... 177 | target_task_values.append(0.) 178 | target_task_mask.append(0.) 179 | else: 180 | target_task_values.append(target_val) 181 | target_task_mask.append(1.) 182 | batch_target_task_values.append(target_task_values) 183 | batch_target_task_mask.append(target_task_mask) 184 | num_graphs += 1 185 | num_graphs_in_batch += 1 186 | node_offset += num_nodes_in_graph 187 | 188 | batch_feed_dict = { 189 | self.placeholders['initial_node_representation']: np.array(batch_node_features), 190 | self.placeholders['adjacency_list']: np.concatenate(batch_adjacency_list, axis=0), 191 | self.placeholders['adjacency_weights']: np.concatenate(batch_adjacency_weights, axis=0), 192 | self.placeholders['graph_nodes_list']: np.concatenate(batch_graph_nodes_list, axis=0), 193 | self.placeholders['target_values']: np.transpose(batch_target_task_values, axes=[1,0]), 194 | self.placeholders['target_mask']: np.transpose(batch_target_task_mask, axes=[1, 0]), 195 | self.placeholders['num_graphs']: num_graphs_in_batch, 196 | self.placeholders['graph_state_keep_prob']: dropout_keep_prob, 197 | } 198 | 199 | yield batch_feed_dict 200 | 201 | 202 | def main(): 203 | args = docopt(__doc__) 204 | try: 205 | model = SparseGCNChemModel(args) 206 | model.train() 207 | except: 208 | typ, value, tb = sys.exc_info() 209 | traceback.print_exc() 210 | pdb.post_mortem(tb) 211 | 212 | if __name__ == "__main__": 213 | main() 214 | -------------------------------------------------------------------------------- /chem_tensorflow_sparse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | """ 3 | Usage: 4 | chem_tensorflow_sparse.py [options] 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --config-file FILE Hyperparameter configuration file path (in JSON format). 9 | --config CONFIG Hyperparameter configuration dictionary (in JSON format). 10 | --log_dir DIR Log dir name. 11 | --data_dir DIR Data dir name. 12 | --restore FILE File to restore weights from. 13 | --freeze-graph-model Freeze weights of graph model components. 
14 | --evaluate example evaluation mode using a restored model 15 | """ 16 | from typing import List, Tuple, Dict, Sequence, Any 17 | 18 | from docopt import docopt 19 | from collections import defaultdict, namedtuple 20 | import numpy as np 21 | import tensorflow as tf 22 | import sys, traceback 23 | import pdb 24 | import json 25 | 26 | from chem_tensorflow import ChemModel 27 | from utils import glorot_init, SMALL_NUMBER 28 | 29 | 30 | GGNNWeights = namedtuple('GGNNWeights', ['edge_weights', 31 | 'edge_biases', 32 | 'edge_type_attention_weights', 33 | 'rnn_cells',]) 34 | 35 | 36 | class SparseGGNNChemModel(ChemModel): 37 | def __init__(self, args): 38 | super().__init__(args) 39 | 40 | @classmethod 41 | def default_params(cls): 42 | params = dict(super().default_params()) 43 | params.update({ 44 | 'batch_size': 100000, 45 | 'use_edge_bias': False, 46 | 'use_propagation_attention': False, 47 | 'use_edge_msg_avg_aggregation': True, 48 | 'residual_connections': { # For layer i, specify list of layers whose output is added as an input 49 | "2": [0], 50 | "4": [0, 2] 51 | }, 52 | 53 | 'layer_timesteps': [2, 2, 1, 2, 1], # number of layers & propagation steps per layer 54 | 55 | 'graph_rnn_cell': 'GRU', # GRU, CudnnCompatibleGRUCell, or RNN 56 | 'graph_rnn_activation': 'tanh', # tanh, ReLU 57 | 'graph_state_dropout_keep_prob': 1., 58 | 'task_sample_ratios': {}, 59 | 'edge_weight_dropout_keep_prob': .8 60 | }) 61 | return params 62 | 63 | def prepare_specific_graph_model(self) -> None: 64 | h_dim = self.params['hidden_size'] 65 | self.placeholders['initial_node_representation'] = tf.placeholder(tf.float32, [None, h_dim], 66 | name='node_features') 67 | self.placeholders['adjacency_lists'] = [tf.placeholder(tf.int32, [None, 2], name='adjacency_e%s' % e) 68 | for e in range(self.num_edge_types)] 69 | self.placeholders['num_incoming_edges_per_type'] = tf.placeholder(tf.float32, [None, self.num_edge_types], 70 | name='num_incoming_edges_per_type') 71 | self.placeholders['graph_nodes_list'] = tf.placeholder(tf.int32, [None], name='graph_nodes_list') 72 | self.placeholders['graph_state_keep_prob'] = tf.placeholder(tf.float32, None, name='graph_state_keep_prob') 73 | self.placeholders['edge_weight_dropout_keep_prob'] = tf.placeholder(tf.float32, None, name='edge_weight_dropout_keep_prob') 74 | 75 | activation_name = self.params['graph_rnn_activation'].lower() 76 | if activation_name == 'tanh': 77 | activation_fun = tf.nn.tanh 78 | elif activation_name == 'relu': 79 | activation_fun = tf.nn.relu 80 | else: 81 | raise Exception("Unknown activation function type '%s'." 
% activation_name) 82 | 83 | # Generate per-layer values for edge weights, biases and gated units: 84 | self.weights = {} # Used by super-class to place generic things 85 | self.gnn_weights = GGNNWeights([], [], [], []) 86 | for layer_idx in range(len(self.params['layer_timesteps'])): 87 | with tf.variable_scope('gnn_layer_%i' % layer_idx): 88 | edge_weights = tf.Variable(glorot_init([self.num_edge_types * h_dim, h_dim]), 89 | name='gnn_edge_weights_%i' % layer_idx) 90 | edge_weights = tf.reshape(edge_weights, [self.num_edge_types, h_dim, h_dim]) 91 | edge_weights = tf.nn.dropout(edge_weights, keep_prob=self.placeholders['edge_weight_dropout_keep_prob']) 92 | self.gnn_weights.edge_weights.append(edge_weights) 93 | 94 | if self.params['use_propagation_attention']: 95 | self.gnn_weights.edge_type_attention_weights.append(tf.Variable(np.ones([self.num_edge_types], dtype=np.float32), 96 | name='edge_type_attention_weights_%i' % layer_idx)) 97 | 98 | if self.params['use_edge_bias']: 99 | self.gnn_weights.edge_biases.append(tf.Variable(np.zeros([self.num_edge_types, h_dim], dtype=np.float32), 100 | name='gnn_edge_biases_%i' % layer_idx)) 101 | 102 | cell_type = self.params['graph_rnn_cell'].lower() 103 | if cell_type == 'gru': 104 | cell = tf.nn.rnn_cell.GRUCell(h_dim, activation=activation_fun) 105 | elif cell_type == 'cudnncompatiblegrucell': 106 | assert(activation_name == 'tanh') 107 | import tensorflow.contrib.cudnn_rnn as cudnn_rnn 108 | cell = cudnn_rnn.CudnnCompatibleGRUCell(h_dim) 109 | elif cell_type == 'rnn': 110 | cell = tf.nn.rnn_cell.BasicRNNCell(h_dim, activation=activation_fun) 111 | else: 112 | raise Exception("Unknown RNN cell type '%s'." % cell_type) 113 | cell = tf.nn.rnn_cell.DropoutWrapper(cell, 114 | state_keep_prob=self.placeholders['graph_state_keep_prob']) 115 | self.gnn_weights.rnn_cells.append(cell) 116 | 117 | def compute_final_node_representations(self) -> tf.Tensor: 118 | node_states_per_layer = [] # one entry per layer (final state of that layer), shape: number of nodes in batch v x D 119 | node_states_per_layer.append(self.placeholders['initial_node_representation']) 120 | num_nodes = tf.shape(self.placeholders['initial_node_representation'], out_type=tf.int32)[0] 121 | 122 | message_targets = [] # list of tensors of message targets of shape [E] 123 | message_edge_types = [] # list of tensors of edge type of shape [E] 124 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(self.placeholders['adjacency_lists']): 125 | edge_targets = adjacency_list_for_edge_type[:, 1] 126 | message_targets.append(edge_targets) 127 | message_edge_types.append(tf.ones_like(edge_targets, dtype=tf.int32) * edge_type_idx) 128 | message_targets = tf.concat(message_targets, axis=0) # Shape [M] 129 | message_edge_types = tf.concat(message_edge_types, axis=0) # Shape [M] 130 | 131 | for (layer_idx, num_timesteps) in enumerate(self.params['layer_timesteps']): 132 | with tf.variable_scope('gnn_layer_%i' % layer_idx): 133 | # Used shape abbreviations: 134 | # V ~ number of nodes 135 | # D ~ state dimension 136 | # E ~ number of edges of current type 137 | # M ~ number of messages (sum of all E) 138 | 139 | # Extract residual messages, if any: 140 | layer_residual_connections = self.params['residual_connections'].get(str(layer_idx)) 141 | if layer_residual_connections is None: 142 | layer_residual_states = [] 143 | else: 144 | layer_residual_states = [node_states_per_layer[residual_layer_idx] 145 | for residual_layer_idx in layer_residual_connections] 146 | 147 | if 
self.params['use_propagation_attention']: 148 | message_edge_type_factors = tf.nn.embedding_lookup(params=self.gnn_weights.edge_type_attention_weights[layer_idx], 149 | ids=message_edge_types) # Shape [M] 150 | 151 | # Record new states for this layer. Initialised to last state, but will be updated below: 152 | node_states_per_layer.append(node_states_per_layer[-1]) 153 | for step in range(num_timesteps): 154 | with tf.variable_scope('timestep_%i' % step): 155 | messages = [] # list of tensors of messages of shape [E, D] 156 | message_source_states = [] # list of tensors of edge source states of shape [E, D] 157 | 158 | # Collect incoming messages per edge type 159 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(self.placeholders['adjacency_lists']): 160 | edge_sources = adjacency_list_for_edge_type[:, 0] 161 | edge_source_states = tf.nn.embedding_lookup(params=node_states_per_layer[-1], 162 | ids=edge_sources) # Shape [E, D] 163 | all_messages_for_edge_type = tf.matmul(edge_source_states, 164 | self.gnn_weights.edge_weights[layer_idx][edge_type_idx]) # Shape [E, D] 165 | messages.append(all_messages_for_edge_type) 166 | message_source_states.append(edge_source_states) 167 | 168 | messages = tf.concat(messages, axis=0) # Shape [M, D] 169 | 170 | if self.params['use_propagation_attention']: 171 | message_source_states = tf.concat(message_source_states, axis=0) # Shape [M, D] 172 | message_target_states = tf.nn.embedding_lookup(params=node_states_per_layer[-1], 173 | ids=message_targets) # Shape [M, D] 174 | message_attention_scores = tf.einsum('mi,mi->m', message_source_states, message_target_states) # Shape [M] 175 | message_attention_scores = message_attention_scores * message_edge_type_factors 176 | 177 | # The following is softmax-ing over the incoming messages per node. 178 | # As the number of incoming varies, we can't just use tf.softmax. 
Reimplement with logsumexp trick: 179 | # Step (1): Obtain shift constant as max of messages going into a node 180 | message_attention_score_max_per_target = tf.unsorted_segment_max(data=message_attention_scores, 181 | segment_ids=message_targets, 182 | num_segments=num_nodes) # Shape [V] 183 | # Step (2): Distribute max out to the corresponding messages again, and shift scores: 184 | message_attention_score_max_per_message = tf.gather(params=message_attention_score_max_per_target, 185 | indices=message_targets) # Shape [M] 186 | message_attention_scores -= message_attention_score_max_per_message 187 | # Step (3): Exp, sum up per target, compute exp(score) / exp(sum) as attention prob: 188 | message_attention_scores_exped = tf.exp(message_attention_scores) # Shape [M] 189 | message_attention_score_sum_per_target = tf.unsorted_segment_sum(data=message_attention_scores_exped, 190 | segment_ids=message_targets, 191 | num_segments=num_nodes) # Shape [V] 192 | message_attention_normalisation_sum_per_message = tf.gather(params=message_attention_score_sum_per_target, 193 | indices=message_targets) # Shape [M] 194 | message_attention = message_attention_scores_exped / (message_attention_normalisation_sum_per_message + SMALL_NUMBER) # Shape [M] 195 | # Step (4): Weigh messages using the attention prob: 196 | messages = messages * tf.expand_dims(message_attention, -1) 197 | 198 | incoming_messages = tf.unsorted_segment_sum(data=messages, 199 | segment_ids=message_targets, 200 | num_segments=num_nodes) # Shape [V, D] 201 | 202 | if self.params['use_edge_bias']: 203 | incoming_messages += tf.matmul(self.placeholders['num_incoming_edges_per_type'], 204 | self.gnn_weights.edge_biases[layer_idx]) # Shape [V, D] 205 | 206 | if self.params['use_edge_msg_avg_aggregation']: 207 | num_incoming_edges = tf.reduce_sum(self.placeholders['num_incoming_edges_per_type'], 208 | keep_dims=True, axis=-1) # Shape [V, 1] 209 | incoming_messages /= num_incoming_edges + SMALL_NUMBER 210 | 211 | incoming_information = tf.concat(layer_residual_states + [incoming_messages], 212 | axis=-1) # Shape [V, D*(1 + num of residual connections)] 213 | 214 | # pass updated vertex features into RNN cell 215 | node_states_per_layer[-1] = self.gnn_weights.rnn_cells[layer_idx](incoming_information, 216 | node_states_per_layer[-1])[1] # Shape [V, D] 217 | 218 | return node_states_per_layer[-1] 219 | 220 | def gated_regression(self, last_h, regression_gate, regression_transform): 221 | # last_h: [v x h] 222 | gate_input = tf.concat([last_h, self.placeholders['initial_node_representation']], axis=-1) # [v x 2h] 223 | gated_outputs = tf.nn.sigmoid(regression_gate(gate_input)) * regression_transform(last_h) # [v x 1] 224 | 225 | # Sum up all nodes per-graph 226 | graph_representations = tf.unsorted_segment_sum(data=gated_outputs, 227 | segment_ids=self.placeholders['graph_nodes_list'], 228 | num_segments=self.placeholders['num_graphs']) # [g x 1] 229 | output = tf.squeeze(graph_representations) # [g] 230 | self.output = output 231 | return output 232 | 233 | # ----- Data preprocessing and chunking into minibatches: 234 | def process_raw_graphs(self, raw_data: Sequence[Any], is_training_data: bool) -> Any: 235 | processed_graphs = [] 236 | for d in raw_data: 237 | (adjacency_lists, num_incoming_edge_per_type) = self.__graph_to_adjacency_lists(d['graph']) 238 | processed_graphs.append({"adjacency_lists": adjacency_lists, 239 | "num_incoming_edge_per_type": num_incoming_edge_per_type, 240 | "init": d["node_features"], 241 | "labels": 
[d["targets"][task_id][0] for task_id in self.params['task_ids']]}) 242 | 243 | if is_training_data: 244 | np.random.shuffle(processed_graphs) 245 | for task_id in self.params['task_ids']: 246 | task_sample_ratio = self.params['task_sample_ratios'].get(str(task_id)) 247 | if task_sample_ratio is not None: 248 | ex_to_sample = int(len(processed_graphs) * task_sample_ratio) 249 | for ex_id in range(ex_to_sample, len(processed_graphs)): 250 | processed_graphs[ex_id]['labels'][task_id] = None 251 | 252 | return processed_graphs 253 | 254 | def __graph_to_adjacency_lists(self, graph) -> Tuple[Dict[int, np.ndarray], Dict[int, Dict[int, int]]]: 255 | adj_lists = defaultdict(list) 256 | num_incoming_edges_dicts_per_type = defaultdict(lambda: defaultdict(lambda: 0)) 257 | for src, e, dest in graph: 258 | fwd_edge_type = e - 1 # Make edges start from 0 259 | adj_lists[fwd_edge_type].append((src, dest)) 260 | num_incoming_edges_dicts_per_type[fwd_edge_type][dest] += 1 261 | if self.params['tie_fwd_bkwd']: 262 | adj_lists[fwd_edge_type].append((dest, src)) 263 | num_incoming_edges_dicts_per_type[fwd_edge_type][src] += 1 264 | 265 | final_adj_lists = {e: np.array(sorted(lm), dtype=np.int32) 266 | for e, lm in adj_lists.items()} 267 | 268 | # Add backward edges as an additional edge type that goes backwards: 269 | if not (self.params['tie_fwd_bkwd']): 270 | for (edge_type, edges) in adj_lists.items(): 271 | bwd_edge_type = self.num_edge_types + edge_type 272 | final_adj_lists[bwd_edge_type] = np.array(sorted((y, x) for (x, y) in edges), dtype=np.int32) 273 | for (x, y) in edges: 274 | num_incoming_edges_dicts_per_type[bwd_edge_type][y] += 1 275 | 276 | return final_adj_lists, num_incoming_edges_dicts_per_type 277 | 278 | def make_minibatch_iterator(self, data: Any, is_training: bool): 279 | """Create minibatches by flattening adjacency matrices into a single adjacency matrix with 280 | multiple disconnected components.""" 281 | if is_training: 282 | np.random.shuffle(data) 283 | # Pack until we cannot fit more graphs in the batch 284 | state_dropout_keep_prob = self.params['graph_state_dropout_keep_prob'] if is_training else 1. 285 | edge_weights_dropout_keep_prob = self.params['edge_weight_dropout_keep_prob'] if is_training else 1. 
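        # As in the GCN variant, graphs are packed into one large disconnected graph until the
        # next graph would push the node count past batch_size. Adjacency lists are kept
        # separately per edge type (shifted by node_offset), and num_incoming_edges_per_type is
        # assembled per node so that edge biases and average message aggregation can use it.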
286 | num_graphs = 0 287 | while num_graphs < len(data): 288 | num_graphs_in_batch = 0 289 | batch_node_features = [] 290 | batch_target_task_values = [] 291 | batch_target_task_mask = [] 292 | batch_adjacency_lists = [[] for _ in range(self.num_edge_types)] 293 | batch_num_incoming_edges_per_type = [] 294 | batch_graph_nodes_list = [] 295 | node_offset = 0 296 | 297 | while num_graphs < len(data) and node_offset + len(data[num_graphs]['init']) < self.params['batch_size']: 298 | cur_graph = data[num_graphs] 299 | num_nodes_in_graph = len(cur_graph['init']) 300 | padded_features = np.pad(cur_graph['init'], 301 | ((0, 0), (0, self.params['hidden_size'] - self.annotation_size)), 302 | 'constant') 303 | batch_node_features.extend(padded_features) 304 | batch_graph_nodes_list.append(np.full(shape=[num_nodes_in_graph], fill_value=num_graphs_in_batch, dtype=np.int32)) 305 | for i in range(self.num_edge_types): 306 | if i in cur_graph['adjacency_lists']: 307 | batch_adjacency_lists[i].append(cur_graph['adjacency_lists'][i] + node_offset) 308 | 309 | # Turn counters for incoming edges into np array: 310 | num_incoming_edges_per_type = np.zeros((num_nodes_in_graph, self.num_edge_types)) 311 | for (e_type, num_incoming_edges_per_type_dict) in cur_graph['num_incoming_edge_per_type'].items(): 312 | for (node_id, edge_count) in num_incoming_edges_per_type_dict.items(): 313 | num_incoming_edges_per_type[node_id, e_type] = edge_count 314 | batch_num_incoming_edges_per_type.append(num_incoming_edges_per_type) 315 | 316 | target_task_values = [] 317 | target_task_mask = [] 318 | for target_val in cur_graph['labels']: 319 | if target_val is None: # This is one of the examples we didn't sample... 320 | target_task_values.append(0.) 321 | target_task_mask.append(0.) 322 | else: 323 | target_task_values.append(target_val) 324 | target_task_mask.append(1.) 
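                    # A graph whose label for a task was removed by task_sample_ratios is fed
                    # with target value 0. and a 0. entry in target_mask for that task.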
325 | batch_target_task_values.append(target_task_values) 326 | batch_target_task_mask.append(target_task_mask) 327 | num_graphs += 1 328 | num_graphs_in_batch += 1 329 | node_offset += num_nodes_in_graph 330 | 331 | batch_feed_dict = { 332 | self.placeholders['initial_node_representation']: np.array(batch_node_features), 333 | self.placeholders['num_incoming_edges_per_type']: np.concatenate(batch_num_incoming_edges_per_type, axis=0), 334 | self.placeholders['graph_nodes_list']: np.concatenate(batch_graph_nodes_list), 335 | self.placeholders['target_values']: np.transpose(batch_target_task_values, axes=[1,0]), 336 | self.placeholders['target_mask']: np.transpose(batch_target_task_mask, axes=[1, 0]), 337 | self.placeholders['num_graphs']: num_graphs_in_batch, 338 | self.placeholders['graph_state_keep_prob']: state_dropout_keep_prob, 339 | self.placeholders['edge_weight_dropout_keep_prob']: edge_weights_dropout_keep_prob 340 | } 341 | 342 | # Merge adjacency lists and information about incoming nodes: 343 | for i in range(self.num_edge_types): 344 | if len(batch_adjacency_lists[i]) > 0: 345 | adj_list = np.concatenate(batch_adjacency_lists[i]) 346 | else: 347 | adj_list = np.zeros((0, 2), dtype=np.int32) 348 | batch_feed_dict[self.placeholders['adjacency_lists'][i]] = adj_list 349 | 350 | yield batch_feed_dict 351 | 352 | def evaluate_one_batch(self, data): 353 | fetch_list = self.output 354 | batch_feed_dict = self.make_minibatch_iterator(data, is_training=False) 355 | 356 | for item in batch_feed_dict: 357 | item[self.placeholders['graph_state_keep_prob']] = 1.0 358 | item[self.placeholders['edge_weight_dropout_keep_prob']] = 1.0 359 | item[self.placeholders['out_layer_dropout_keep_prob']] = 1.0 360 | item[self.placeholders['target_values']] = [[]] 361 | item[self.placeholders['target_mask']] = [[]] 362 | print(self.sess.run(fetch_list, feed_dict=item)) 363 | 364 | def example_evaluation(self): 365 | ''' Demonstration of what test-time code would look like 366 | we query the model with the first n_example_molecules from the validation file 367 | ''' 368 | n_example_molecules = 10 369 | with open('molecules_valid.json', 'r') as valid_file: 370 | example_molecules = json.load(valid_file)[:n_example_molecules] 371 | 372 | for mol in example_molecules: 373 | print(mol['targets']) 374 | 375 | example_molecules = self.process_raw_graphs(example_molecules, is_training_data=False) 376 | self.evaluate_one_batch(example_molecules) 377 | 378 | def main(): 379 | args = docopt(__doc__) 380 | try: 381 | model = SparseGGNNChemModel(args) 382 | if args['--evaluate']: 383 | model.example_evaluation() 384 | else: 385 | model.train() 386 | except: 387 | typ, value, tb = sys.exc_info() 388 | traceback.print_exc() 389 | pdb.post_mortem(tb) 390 | 391 | 392 | if __name__ == "__main__": 393 | main() 394 | -------------------------------------------------------------------------------- /get_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rdkit import Chem 3 | import glob 4 | import json 5 | import numpy as np 6 | 7 | if not os.path.exists('data'): 8 | os.mkdir('data') 9 | print('made directory ./data/') 10 | 11 | download_path = os.path.join('data', 'dsgdb9nsd.xyz.tar.bz2') 12 | if not os.path.exists(download_path): 13 | print('downloading data to %s ...' 
% download_path) 14 | source = 'https://ndownloader.figshare.com/files/3195389' 15 | os.system('wget -O %s %s' % (download_path, source)) 16 | print('finished downloading') 17 | 18 | unzip_path = os.path.join('data', 'qm9_raw') 19 | if not os.path.exists(unzip_path): 20 | print('extracting data to %s ...' % unzip_path) 21 | os.mkdir(unzip_path) 22 | os.system('tar xvjf %s -C %s' % (download_path, unzip_path)) 23 | print('finished extracting') 24 | 25 | def preprocess(): 26 | index_of_mu = 4 27 | 28 | def read_xyz(file_path): 29 | with open(file_path, 'r') as f: 30 | lines = f.readlines() 31 | smiles = lines[-2].split('\t')[0] 32 | properties = lines[1].split('\t') 33 | mu = float(properties[index_of_mu]) 34 | return {'smiles': smiles, 'mu': mu} 35 | 36 | print('loading train/validation split') 37 | with open('valid_idx.json', 'r') as f: 38 | valid_idx = json.load(f)['valid_idxs'] 39 | valid_files = [os.path.join(unzip_path, 'dsgdb9nsd_%s.xyz' % i) for i in valid_idx] 40 | 41 | print('reading data...') 42 | raw_data = {'train': [], 'valid': []} 43 | all_files = glob.glob(os.path.join(unzip_path, '*.xyz')) 44 | for file_idx, file_path in enumerate(all_files): 45 | if file_idx % 100 == 0: 46 | print('%.1f %% \r' % (file_idx / float(len(all_files)) * 100), end=""), 47 | if file_path not in valid_files: 48 | raw_data['train'].append(read_xyz(file_path)) 49 | else: 50 | raw_data['valid'].append(read_xyz(file_path)) 51 | all_mu = [mol['mu'] for mol in raw_data['train']] 52 | mean_mu = np.mean(all_mu) 53 | std_mu = np.std(all_mu) 54 | 55 | def normalize_mu(mu): 56 | return (mu - mean_mu) / std_mu 57 | 58 | def onehot(idx, len): 59 | z = [0 for _ in range(len)] 60 | z[idx] = 1 61 | return z 62 | 63 | bond_dict = {'SINGLE': 1, 'DOUBLE': 2, 'TRIPLE': 3, "AROMATIC": 4} 64 | def to_graph(smiles): 65 | mol = Chem.MolFromSmiles(smiles) 66 | mol = Chem.AddHs(mol) 67 | edges = [] 68 | nodes = [] 69 | for bond in mol.GetBonds(): 70 | edges.append((bond.GetBeginAtomIdx(), bond_dict[str(bond.GetBondType())], bond.GetEndAtomIdx())) 71 | for atom in mol.GetAtoms(): 72 | nodes.append(onehot(["H", "C", "N", "O", "F"].index(atom.GetSymbol()), 5)) 73 | return nodes, edges 74 | 75 | print('parsing smiles as graphs...') 76 | processed_data = {'train': [], 'valid': []} 77 | for section in ['train', 'valid']: 78 | for i,(smiles, mu) in enumerate([(mol['smiles'], mol['mu']) for mol in raw_data[section]]): 79 | if i % 100 == 0: 80 | print('%s: %.1f %% \r' % (section, 100*i/float(len(raw_data[section]))), end="") 81 | nodes, edges = to_graph(smiles) 82 | processed_data[section].append({ 83 | 'targets': [[normalize_mu(mu)]], 84 | 'graph': edges, 85 | 'node_features': nodes 86 | }) 87 | print('%s: 100 %% ' % (section)) 88 | with open('molecules_%s.json' % section, 'w') as f: 89 | json.dump(processed_data[section], f) 90 | 91 | preprocess() 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docopt==0.6.2 2 | tensorflow==1.3.0 3 | numpy==1.13.1 4 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env/python 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | import queue 6 | import threading 7 | 8 | SMALL_NUMBER = 1e-7 9 | 10 | 11 | def glorot_init(shape): 12 | initialization_range = np.sqrt(6.0 / (shape[-2] + shape[-1])) 13 | 
return np.random.uniform(low=-initialization_range, high=initialization_range, size=shape).astype(np.float32) 14 | 15 | 16 | class ThreadedIterator: 17 | """An iterator object that computes its elements in a parallel thread to be ready to be consumed. 18 | The iterator should *not* return None""" 19 | 20 | def __init__(self, original_iterator, max_queue_size: int=2): 21 | self.__queue = queue.Queue(maxsize=max_queue_size) 22 | self.__thread = threading.Thread(target=lambda: self.worker(original_iterator)) 23 | self.__thread.start() 24 | 25 | def worker(self, original_iterator): 26 | for element in original_iterator: 27 | assert element is not None, 'By convention, iterator elements must not be None' 28 | self.__queue.put(element, block=True) 29 | self.__queue.put(None, block=True) 30 | 31 | def __iter__(self): 32 | next_element = self.__queue.get(block=True) 33 | while next_element is not None: 34 | yield next_element 35 | next_element = self.__queue.get(block=True) 36 | self.__thread.join() 37 | 38 | 39 | class MLP(object): 40 | def __init__(self, in_size, out_size, hid_sizes, dropout_keep_prob): 41 | self.in_size = in_size 42 | self.out_size = out_size 43 | self.hid_sizes = hid_sizes 44 | self.dropout_keep_prob = dropout_keep_prob 45 | self.params = self.make_network_params() 46 | 47 | def make_network_params(self): 48 | dims = [self.in_size] + self.hid_sizes + [self.out_size] 49 | weight_sizes = list(zip(dims[:-1], dims[1:])) 50 | weights = [tf.Variable(self.init_weights(s), name='MLP_W_layer%i' % i) 51 | for (i, s) in enumerate(weight_sizes)] 52 | biases = [tf.Variable(np.zeros(s[-1]).astype(np.float32), name='MLP_b_layer%i' % i) 53 | for (i, s) in enumerate(weight_sizes)] 54 | 55 | network_params = { 56 | "weights": weights, 57 | "biases": biases, 58 | } 59 | 60 | return network_params 61 | 62 | def init_weights(self, shape): 63 | return np.sqrt(6.0 / (shape[-2] + shape[-1])) * (2 * np.random.rand(*shape).astype(np.float32) - 1) 64 | 65 | def __call__(self, inputs): 66 | acts = inputs 67 | for W, b in zip(self.params["weights"], self.params["biases"]): 68 | hid = tf.matmul(acts, tf.nn.dropout(W, self.dropout_keep_prob)) + b 69 | acts = tf.nn.relu(hid) 70 | last_hidden = hid 71 | return last_hidden --------------------------------------------------------------------------------
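The helpers in `utils.py` are shared by all model variants: `ThreadedIterator` prefetches iterator elements (e.g. minibatch feed dicts) in a background thread, and `MLP` is a small callable feed-forward network of the kind that `gated_regression` accepts for its `regression_gate` and `regression_transform` arguments. As a quick illustration, here is a minimal, hypothetical sketch (not part of the repository) of wrapping a generator with `ThreadedIterator`; the `toy_minibatches` generator merely stands in for `make_minibatch_iterator`:

```python
# Hypothetical usage sketch for utils.ThreadedIterator (not part of the repository).
from utils import ThreadedIterator

def toy_minibatches(num_batches=5):
    # Stand-in for make_minibatch_iterator: yields one feed-dict-like object per batch.
    for i in range(num_batches):
        yield {'batch_id': i}

# The background thread fills a bounded queue while the main thread consumes batches.
for batch in ThreadedIterator(toy_minibatches(), max_queue_size=2):
    print('consumed batch', batch['batch_id'])
```

Because the queue is bounded by `max_queue_size`, the producer thread blocks once it is two batches ahead, which keeps memory use flat while still overlapping batch construction with TensorFlow session runs.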