├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── common.py ├── experiment.py ├── gnn-comparison ├── .gitignore ├── Check isomorphisms and noisy labels.ipynb ├── Conda_Info ├── EndToEnd_Evaluation.py ├── LICENSE ├── Launch_Experiments.py ├── PrepareDatasets.py ├── README.md ├── config │ ├── __init__.py │ ├── base.py │ └── utils.py ├── config_BaselineChemical.yml ├── config_BaselineENZYMES.yml ├── config_BaselineIMDB.yml ├── config_BaselineSocial.yml ├── config_DGCNN.yml ├── config_DiffPool.yml ├── config_ECC.yml ├── config_GIN.yml ├── config_GraphSAGE.yml ├── evaluation │ ├── dataset_getter.py │ ├── model_selection │ │ ├── HoldOutSelector.py │ │ └── K_Fold_Selection.py │ └── risk_assessment │ │ ├── HoldOutAssessment.py │ │ └── K_Fold_Assessment.py ├── experiments │ ├── EndToEndExperiment.py │ └── Experiment.py ├── log │ ├── Logger.py │ └── __init__.py ├── models │ ├── __init__.py │ ├── gnn_wrapper │ │ ├── NetWrapper.py │ │ └── __init__.py │ ├── graph_classifiers │ │ ├── DGCNN.py │ │ ├── DeepMultisets.py │ │ ├── DiffPool.py │ │ ├── ECC.py │ │ ├── GIN.py │ │ ├── GraphSAGE.py │ │ ├── MLP_Classifier.py │ │ ├── MolecularFingerprint.py │ │ ├── __init__.py │ │ └── self_attention.py │ ├── modules.py │ ├── schedulers │ │ └── ECCScheduler.py │ └── utils │ │ ├── EarlyStopper.py │ │ └── __init__.py ├── requirements.txt └── utils │ ├── batch_utils.py │ ├── eval_across_folds.py │ └── utils.py ├── images ├── fig3.png └── fig5.png ├── main.py ├── models └── graph_model.py ├── requirements.txt ├── run-gat-2-8.py ├── run-gcn-2-8.py ├── run-ggnn-2-8.py ├── run-gin-2-8.py ├── tasks ├── dictionary_lookup.py └── tree_dataset.py └── tf-gnn-samples ├── .flake8 ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── gnns ├── __init__.py ├── ggnn.py ├── gnn_edge_mlp.py ├── gnn_film.py ├── rgat.py ├── rgcn.py ├── rgdcn.py └── rgin.py ├── models ├── __init__.py ├── ggnn_model.py ├── gnn_edge_mlp_model.py ├── gnn_film_model.py ├── no_struct_mlp_model.py ├── rgat_model.py ├── rgcn_model.py ├── rgdcn_model.py ├── rgin_model.py ├── self_attention.py └── sparse_graph_model.py ├── requirements.txt ├── run_ppi_benchs.py ├── run_qm9_benchs.py ├── run_qm9_benchs_fa.py ├── run_varmisuse_benchs.py ├── run_varmisuse_benchs_fa.py ├── tasks ├── __init__.py ├── citation_network_task.py ├── default_hypers │ ├── PPI_GGNN.json │ ├── PPI_GNN-Edge-MLP0.json │ ├── PPI_GNN-Edge-MLP1.json │ ├── PPI_GNN-FiLM.json │ ├── PPI_RGAT.json │ ├── PPI_RGCN.json │ ├── PPI_RGIN.json │ ├── QM9_GGNN.json │ ├── QM9_GNN-Edge-MLP0.json │ ├── QM9_GNN-Edge-MLP1.json │ ├── QM9_GNN-FiLM.json │ ├── QM9_NoStruct-MLP0.json │ ├── QM9_RGAT.json │ ├── QM9_RGCN.json │ ├── QM9_RGIN.json │ ├── VarMisuse_GGNN.json │ ├── VarMisuse_GNN-Edge-MLP0.json │ ├── VarMisuse_GNN-Edge-MLP1.json │ ├── VarMisuse_GNN-FiLM.json │ ├── VarMisuse_NoStruct-MLP1.json │ ├── VarMisuse_RGAT.json │ ├── VarMisuse_RGCN.json │ └── VarMisuse_RGIN.json ├── ppi_task.py ├── qm9_task.py ├── sparse_graph_task.py └── varmisuse_task.py ├── test.py ├── train.py └── utils ├── __init__.py ├── add_child_ids.py ├── citation_network_utils.py ├── compute_diameters.py ├── model_utils.py ├── prep_baseline.py ├── utils.py └── varmisuse_data_splitter.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 
| lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .idea 131 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | @inproceedings{ 2 | alon2021on, 3 | title={On the Bottleneck of Graph Neural Networks and its Practical Implications}, 4 | author={Uri Alon and Eran Yahav}, 5 | booktitle={International Conference on Learning Representations}, 6 | year={2021}, 7 | url={https://openreview.net/forum?id=i80OPhOCVH2} 8 | } 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 tech-srl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | from tasks.dictionary_lookup import DictionaryLookupDataset 4 | 5 | from torch import nn 6 | from torch_geometric.nn import GCNConv, GatedGraphConv, GINConv, GATConv 7 | 8 | 9 | class Task(Enum): 10 | NEIGHBORS_MATCH = auto() 11 | 12 | @staticmethod 13 | def from_string(s): 14 | try: 15 | return Task[s] 16 | except KeyError: 17 | raise ValueError() 18 | 19 | def get_dataset(self, depth, train_fraction): 20 | if self is Task.NEIGHBORS_MATCH: 21 | dataset = DictionaryLookupDataset(depth) 22 | else: 23 | dataset = None 24 | 25 | return dataset.generate_data(train_fraction) 26 | 27 | 28 | class GNN_TYPE(Enum): 29 | GCN = auto() 30 | GGNN = auto() 31 | GIN = auto() 32 | GAT = auto() 33 | 34 | @staticmethod 35 | def from_string(s): 36 | try: 37 | return GNN_TYPE[s] 38 | except KeyError: 39 | raise ValueError() 40 | 41 | def get_layer(self, in_dim, out_dim): 42 | if self is GNN_TYPE.GCN: 43 | return GCNConv( 44 | in_channels=in_dim, 45 | out_channels=out_dim) 46 | elif self is GNN_TYPE.GGNN: 47 | return GatedGraphConv(out_channels=out_dim, num_layers=1) 48 | elif self is GNN_TYPE.GIN: 49 | return GINConv(nn.Sequential(nn.Linear(in_dim, out_dim), nn.BatchNorm1d(out_dim), nn.ReLU(), 50 | nn.Linear(out_dim, out_dim), nn.BatchNorm1d(out_dim), nn.ReLU())) 51 | elif self is GNN_TYPE.GAT: 52 | # 4-heads, although the paper by Velickovic et al. had used 6-8 heads. 53 | # The output will be the concatenation of the heads, yielding a vector of size out_dim 54 | num_heads = 4 55 | return GATConv(in_dim, out_dim // num_heads, heads=num_heads) 56 | 57 | 58 | class STOP(Enum): 59 | TRAIN = auto() 60 | TEST = auto() 61 | 62 | @staticmethod 63 | def from_string(s): 64 | try: 65 | return STOP[s] 66 | except KeyError: 67 | raise ValueError() 68 | 69 | 70 | def one_hot(key, depth): 71 | return [1 if i == key else 0 for i in range(depth)] 72 | -------------------------------------------------------------------------------- /gnn-comparison/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # pylint 104 | .pylintrc 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # vscode 110 | .vscode 111 | 112 | # PROJECT 113 | *DATA*/** 114 | Old_Baselines/** 115 | *RESULTS*/** 116 | 117 | 118 | .idea/inspectionProfiles/Project_Default.xml 119 | *.xml 120 | models/.DS_Store 121 | .DS_Store 122 | -------------------------------------------------------------------------------- /gnn-comparison/Check isomorphisms and noisy labels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import itertools\n", 10 | "import networkx as nx\n", 11 | "from torch_geometric.datasets import TUDataset\n", 12 | "from torch_geometric.utils import to_networkx\n", 13 | "\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "def convert(data):\n", 24 | " G = to_networkx(data)\n", 25 | " G.graph['label'] = data.y.item()\n", 26 | " return nx.to_undirected(G)\n", 27 | "\n", 28 | "\n", 29 | "def dataset_to_graphs(dataset):\n", 30 | " graphs = []\n", 31 | " for data in dataset:\n", 32 | " graphs.append(convert(data))\n", 33 | " return graphs\n", 34 | " \n", 35 | "\n", 36 | "def check(graphs):\n", 37 | " num_iso_pairs = 0\n", 38 | " num_inconsistent_labels = 0\n", 39 | " num_graphs = len(graphs)\n", 40 | " combinations = itertools.combinations(range(num_graphs), 2)\n", 41 | " \n", 42 | " for (i1, i2) in combinations:\n", 43 | " G1, G2 = graphs[i1], graphs[i2]\n", 44 | " label1, label2 = G1.graph['label'], G2.graph['label']\n", 45 | " \n", 46 | " if nx.is_isomorphic(G1, G2):\n", 47 | " num_iso_pairs += 1\n", 48 | " if label1 != label2:\n", 49 | " num_inconsistent_labels += 1\n", 50 | " \n", 51 | " print(f\"number of isomorphic pairs: {num_iso_pairs}\") \n", 52 | " print(f\"number of isomorphic pairs with inconsistent labels: {num_inconsistent_labels}\")\n", 53 | " print(f\"ratio of inconsistently labelled isomorphic pairs vs. 
isomorphic pairs: {num_inconsistent_labels / num_iso_pairs:.4f}\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "scrolled": false 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "############## Checking dataset IMDB-BINARY ##############\n", 68 | "number of isomorphic pairs: 3356\n", 69 | "number of isomorphic pairs with inconsistent labels: 1119\n", 70 | "ratio of inconsistently labelled isomorphic pairs vs. isomorphic pairs: 0.3334\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "dataset_name = \"IMDB-BINARY\"\n", 76 | "\n", 77 | "print(f\"############## Checking dataset {dataset_name} ##############\")\n", 78 | "dataset = TUDataset(f'tmp/{dataset_name}', dataset_name)\n", 79 | "graphs = dataset_to_graphs(dataset)\n", 80 | "\n", 81 | "# WARNING: this might take several minutes depending on your hardware\n", 82 | "check(graphs)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "scrolled": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def bfs_seq(G, start_id):\n", 94 | " \"\"\" taken from https://github.com/JiaxuanYou/graph-generation/blob/master/data.py \"\"\"\n", 95 | " dictionary = dict(nx.bfs_successors(G, start_id))\n", 96 | " start = [start_id]\n", 97 | " output = [start_id]\n", 98 | " while len(start) > 0:\n", 99 | " next = []\n", 100 | " while len(start) > 0:\n", 101 | " current = start.pop(0)\n", 102 | " neighbor = dictionary.get(current)\n", 103 | " if neighbor is not None:\n", 104 | " next = next + neighbor\n", 105 | " output = output + next\n", 106 | " start = next\n", 107 | " return output\n", 108 | "\n", 109 | "# 10 and 710 have different labels, but are isomorphic\n", 110 | "G1, G2 = graphs[10], graphs[710]\n", 111 | "\n", 112 | "# reorder nodes\n", 113 | "seq1, seq2 = bfs_seq(G1, 0), bfs_seq(G2, 0)\n", 114 | "G2 = nx.relabel_nodes(G2, {n:m for n, m in zip(seq2, seq1)})\n", 115 | "print(f\"G1 label: {G1.graph['label']} - G2 label: {G2.graph['label']}\")\n", 116 | "\n", 117 | "fig, axs = plt.subplots(1, 2)\n", 118 | "pos = nx.random_layout(G1, seed=42)\n", 119 | "nx.draw_networkx(G1, pos=pos, ax=axs.flat[0])\n", 120 | "nx.draw_networkx(G2, pos=pos, ax=axs.flat[1])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.7.4" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /gnn-comparison/Conda_Info: -------------------------------------------------------------------------------- 1 | Install gcc_linux-64 and gxx_linux-64 (or similar names) with conda 2 | Use export CUDA_VISIBLE_DEVICES="" if Graph_Comparison_Models crashes when launched in parallel 3 | Use export OMP_NUM_THREADS=1 in case of issues with MultiProcessing 4 | 5 | -------------------------------------------------------------------------------- /gnn-comparison/EndToEnd_Evaluation.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from config.base import Grid, Config 5 | 6 | from evaluation.model_selection.HoldOutSelector import HoldOutSelector 7 | from evaluation.risk_assessment.K_Fold_Assessment import KFoldAssessment 8 | from experiments.EndToEndExperiment import EndToEndExperiment 9 | 10 | 11 | def main(config_file, dataset_name, 12 | outer_k, outer_processes, inner_k, inner_processes, result_folder, debug=False): 13 | 14 | # Needed to avoid thread spawning, conflicts with multi-processing. You may set a number > 1 but take into account 15 | # the number of processes on the machine 16 | torch.set_num_threads(1) 17 | 18 | experiment_class = EndToEndExperiment 19 | 20 | model_configurations = Grid(config_file, dataset_name) 21 | model_configuration = Config(**model_configurations[0]) 22 | 23 | exp_path = os.path.join(result_folder, f'{model_configuration.exp_name}_assessment') 24 | 25 | model_selector = HoldOutSelector(max_processes=inner_processes) 26 | risk_assesser = KFoldAssessment(outer_k, model_selector, exp_path, model_configurations, 27 | outer_processes=outer_processes) 28 | 29 | risk_assesser.risk_assessment(experiment_class, debug=debug) 30 | -------------------------------------------------------------------------------- /gnn-comparison/Launch_Experiments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from EndToEnd_Evaluation import main as endtoend 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--config-file', dest='config_file') 8 | parser.add_argument('--experiment', dest='experiment', default='endtoend') 9 | parser.add_argument('--result-folder', dest='result_folder', default='RESULTS') 10 | parser.add_argument('--dataset-name', dest='dataset_name', default='none') 11 | parser.add_argument('--outer-folds', dest='outer_folds', default=10) 12 | parser.add_argument('--outer-processes', dest='outer_processes', default=2) 13 | parser.add_argument('--inner-folds', dest='inner_folds', default=5) 14 | parser.add_argument('--inner-processes', dest='inner_processes', default=1) 15 | parser.add_argument('--debug', action="store_true", dest='debug') 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == "__main__": 20 | args = get_args() 21 | 22 | if args.dataset_name != 'none': 23 | datasets = [args.dataset_name] 24 | else: 25 | datasets = ['IMDB-MULTI', 'IMDB-BINARY', 'PROTEINS', 'NCI1', 'ENZYMES', 'DD', 26 | 'REDDIT-BINARY', 'REDDIT-MULTI-5K', 'COLLAB', 'REDDIT-MULTI-12K'] 27 | 28 | config_file = args.config_file 29 | experiment = args.experiment 30 | 31 | for dataset_name in datasets: 32 | try: 33 | endtoend(config_file, dataset_name, 34 | outer_k=int(args.outer_folds), outer_processes=int(args.outer_processes), 35 | inner_k=int(args.inner_folds), inner_processes=int(args.inner_processes), 36 | result_folder=args.result_folder, debug=args.debug) 37 | 38 | except Exception as e: 39 | raise e # print(e) -------------------------------------------------------------------------------- /gnn-comparison/PrepareDatasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from datasets import * 4 | 5 | 6 | DATASETS = { 7 | 'REDDIT-BINARY': RedditBinary, 8 | 'REDDIT-MULTI-5K': Reddit5K, 9 | 'COLLAB': Collab, 10 | 'IMDB-BINARY': IMDBBinary, 11 | 'IMDB-MULTI': IMDBMulti, 12 | 'NCI1': NCI1, 13 | 'ENZYMES': Enzymes, 14 | 
'PROTEINS': Proteins, 15 | 'DD': DD 16 | } 17 | 18 | 19 | def get_args_dict(): 20 | parser = argparse.ArgumentParser() 21 | 22 | parser.add_argument('DATA_DIR', 23 | help='where to save the datasets') 24 | parser.add_argument('--dataset-name', dest='dataset_name', 25 | choices=DATASETS.keys(), default='all', help='dataset name [Default: \'all\']') 26 | parser.add_argument('--outer-k', dest='outer_k', type=int, 27 | default=10, help='evaluation folds [Default: 10]') 28 | parser.add_argument('--inner-k', dest='inner_k', type=int, 29 | default=None, help='model selection folds [Default: None]') 30 | parser.add_argument('--use-one', action='store_true', 31 | default=False, help='use 1 as feature') 32 | parser.add_argument('--use-degree', dest='use_node_degree', action='store_true', 33 | default=False, help='use degree as feature') 34 | parser.add_argument('--no-kron', dest='precompute_kron_indices', action='store_false', 35 | default=True, help='don\'t precompute kron reductions') 36 | 37 | return vars(parser.parse_args()) 38 | 39 | 40 | def preprocess_dataset(name, args_dict): 41 | dataset_class = DATASETS[name] 42 | if name == 'ENZYMES': 43 | args_dict.update(use_node_attrs=True) 44 | dataset_class(**args_dict) 45 | 46 | 47 | if __name__ == "__main__": 48 | args_dict = get_args_dict() 49 | 50 | print(args_dict) 51 | 52 | dataset_name = args_dict.pop('dataset_name') 53 | if dataset_name == 'all': 54 | for name in DATASETS: 55 | preprocess_dataset(name, args_dict) 56 | else: 57 | preprocess_dataset(dataset_name, args_dict) -------------------------------------------------------------------------------- /gnn-comparison/README.md: -------------------------------------------------------------------------------- 1 | # A Fair Comparison of Graph Neural Networks for Graph Classification 2 | 3 | ## Summary 4 | 5 | The library includes data and scripts to reproduce the experiments reported in the paper. 6 | 7 | This research software is provided as is. If you happen to use or modify this code, please remember to cite the paper: 8 | 9 | [*Federico Errica and Marco Podda, Davide Bacciu, Alessio Micheli: A Fair Comparison of Graph Neural Networks for Graph Classification. Proceedings of the 8th International Conference on Learning Representations (ICLR 2020).*](https://openreview.net/pdf?id=HygDF6NFPB) 10 | 11 | ### Instructions 12 | 13 | To reproduce the experiments, first preprocess datasets as follows: 14 | 15 | `python PrepareDatasets.py DATA/CHEMICAL --dataset-name --outer-k 10` 16 | 17 | `python PrepareDatasets.py DATA/SOCIAL_1 --dataset-name --use-one --outer-k 10` 18 | 19 | `python PrepareDatasets.py DATA/SOCIAL_DEGREE --dataset-name --use-degree --outer-k 10` 20 | 21 | Where `` is the name of the dataset. Then, substitute the split (json) files with the ones provided in the `data_splits` folder. 22 | 23 | Please note that dataset folders should be organized as follows: 24 | 25 | CHEMICAL: 26 | NCI1 27 | DD 28 | ENZYMES 29 | PROTEINS 30 | SOCIAL[_1 | _DEGREE]: 31 | IMDB-BINARY 32 | IMDB-MULTI 33 | REDDIT-BINARY 34 | REDDIT-MULTI-5K 35 | COLLAB 36 | 37 | Then, you can launch experiments by typing: 38 | 39 | `cp -r DATA/[CHEMICAL|SOCIAL_1|SOCIAL_DEGREE]/ DATA` 40 | `python Launch_Experiments.py --config-file --dataset-name --result-folder --debug` 41 | 42 | Where `` is your config file (e.g. config_BaselineChemical.yml), and `` is the dataset name chosen as before. 
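For example, an end-to-end run on the PROTEINS dataset with the GIN configuration could look like this (the dataset and config file chosen here are only illustrative; any of the supported dataset names and provided config_*.yml files can be substituted):

`python PrepareDatasets.py DATA/CHEMICAL --dataset-name PROTEINS --outer-k 10`

`cp -r DATA/CHEMICAL/ DATA`

`python Launch_Experiments.py --config-file config_GIN.yml --dataset-name PROTEINS --result-folder RESULTS`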
43 | 44 | ### Troubleshooting 45 | 46 | The installation of Pytorch Geometric depends on other libraries (torch_scatter, torch_cluster, torch_sparse) that have to be installed separately and before torch_geometric. Do not use pip install -r requirements.txt because it will not work. Please refer to the [official instructions](https://github.com/rusty1s/pytorch_geometric) to install the required libraries. 47 | 48 | -------------------------------------------------------------------------------- /gnn-comparison/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/config/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/config/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | import yaml 4 | import pickle 5 | 6 | 7 | def read_config_file(dict_or_filelike): 8 | if isinstance(dict_or_filelike, dict): 9 | return dict_or_filelike 10 | 11 | path = Path(dict_or_filelike) 12 | if path.suffix == ".json": 13 | return json.load(open(path, "r")) 14 | elif path.suffix in [".yaml", ".yml"]: 15 | return yaml.load(open(path, "r"), Loader=yaml.FullLoader) 16 | elif path.suffix in [".pkl", ".pickle"]: 17 | return pickle.load(open(path, "rb")) 18 | 19 | raise ValueError("Only JSON, YaML and pickle files supported.") 20 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineChemical.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - MolecularFingerprint 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | - 128 8 | hidden_units: 9 | - 32 10 | - 128 11 | - 256 12 | learning_rate: 13 | - 0.000001 14 | - 0.001 15 | - 0.1 16 | l2: 17 | - 0.0001 18 | - 0.001 19 | - 0.01 20 | classifier_epochs: 21 | - 5000 22 | optimizer: 23 | - Adam 24 | scheduler: 25 | - null 26 | loss: 27 | - MulticlassClassificationLoss 28 | gradient_clipping: 29 | - null 30 | early_stopper: 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: False 36 | - 37 | class: Patience 38 | args: 39 | patience: 500 40 | use_loss: True 41 | shuffle: 42 | - True 43 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineENZYMES.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DeepMultisets 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | hidden_units: 8 | - 32 9 | - 64 10 | - 128 11 | - 256 12 | learning_rate: 13 | - 0.00001 14 | - 0.0001 15 | - 0.001 16 | l2: 17 | - 0.01 18 | - 0.0001 19 | - 0.00001 20 | classifier_epochs: 21 | - 5000 22 | optimizer: 23 | - Adam 24 | scheduler: 25 | - null 26 | loss: 27 | - MulticlassClassificationLoss 28 | gradient_clipping: 29 | - null 30 | early_stopper: 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: False 36 | - 37 | class: Patience 38 | args: 39 | patience: 500 40 | use_loss: True 41 | shuffle: 42 | - True 43 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineIMDB.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DeepMultisets 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | - 128 8 | hidden_units: 9 | - 32 10 | - 128 11 | - 256 12 | 
learning_rate: 13 | - 0.000001 14 | - 0.001 15 | - 0.1 16 | l2: 17 | - 0.0001 18 | - 0.001 19 | - 0.01 20 | classifier_epochs: 21 | - 3000 22 | optimizer: 23 | - Adam 24 | scheduler: 25 | - null 26 | loss: 27 | - MulticlassClassificationLoss 28 | gradient_clipping: 29 | - null 30 | early_stopper: 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: False 36 | - 37 | class: Patience 38 | args: 39 | patience: 500 40 | use_loss: True 41 | shuffle: 42 | - True 43 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineSocial.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DeepMultisets 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | hidden_units: 8 | - 32 9 | - 128 10 | learning_rate: 11 | - 0.001 12 | - 0.1 13 | l2: 14 | - 0.0001 15 | - 0.001 16 | - 0.01 17 | classifier_epochs: 18 | - 3000 19 | optimizer: 20 | - Adam 21 | scheduler: 22 | - null 23 | loss: 24 | - MulticlassClassificationLoss 25 | gradient_clipping: 26 | - null 27 | early_stopper: 28 | - 29 | class: Patience 30 | args: 31 | patience: 500 32 | use_loss: False 33 | - 34 | class: Patience 35 | args: 36 | patience: 500 37 | use_loss: True 38 | shuffle: 39 | - True 40 | -------------------------------------------------------------------------------- /gnn-comparison/config_DGCNN.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DGCNN 3 | device: 4 | - cpu 5 | batch_size: 6 | - 50 7 | last_layer_fa: 8 | - false 9 | dense_dim: 10 | - 128 11 | k: 12 | - 0.9 13 | - 0.6 14 | learning_rate: 15 | - 0.0001 16 | - 0.00001 17 | classifier_epochs: 18 | - 1000 19 | optimizer: 20 | - Adam 21 | scheduler: 22 | - null 23 | loss: 24 | - MulticlassClassificationLoss 25 | gradient_clipping: 26 | - null 27 | early_stopper: 28 | - 29 | class: Patience 30 | args: 31 | patience: 500 32 | use_loss: False 33 | - 34 | class: Patience 35 | args: 36 | patience: 500 37 | use_loss: True 38 | shuffle: 39 | - True 40 | l2: 41 | - 0. 42 | embedding_dim: 43 | - 32 44 | - 64 45 | num_layers: 46 | - 2 47 | - 3 48 | - 4 49 | -------------------------------------------------------------------------------- /gnn-comparison/config_DiffPool.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DiffPool 3 | device: 4 | - cpu 5 | batch_size: 6 | - 8 7 | last_layer_fa: 8 | - false 9 | learning_rate: 10 | - 0.001 11 | - 0.0001 12 | - 0.00001 13 | classifier_epochs: 14 | - 3000 15 | optimizer: 16 | - Adam 17 | scheduler: 18 | - null 19 | loss: 20 | - DiffPoolMulticlassClassificationLoss 21 | l2: 22 | - 0. 
23 | gradient_clipping: 24 | - 2.0 25 | early_stopper: 26 | - 27 | class: Patience 28 | args: 29 | patience: 500 30 | use_loss: False 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: True 36 | shuffle: 37 | - True 38 | num_layers: 39 | # - 1 40 | - 2 41 | gnn_dim_hidden: 42 | # - 32 43 | - 64 44 | dim_embedding: 45 | # - 64 46 | - 128 47 | dim_embedding_MLP: 48 | - 50 49 | 50 | 51 | -------------------------------------------------------------------------------- /gnn-comparison/config_ECC.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - ECC 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | last_layer_fa: 8 | - false 9 | learning_rate: 10 | - 0.1 11 | - 0.01 12 | classifier_epochs: 13 | - 1000 14 | optimizer: 15 | - SGD 16 | scheduler: 17 | - 18 | class: ECCLR 19 | args: 20 | gamma: 0.1 21 | step_size: 10 22 | loss: 23 | - MulticlassClassificationLoss 24 | gradient_clipping: 25 | - null 26 | early_stopper: 27 | - 28 | class: Patience 29 | args: 30 | patience: 500 31 | use_loss: False 32 | - 33 | class: Patience 34 | args: 35 | patience: 500 36 | use_loss: True 37 | shuffle: 38 | - True 39 | l2: 40 | - 0. 41 | dropout: 42 | - 0.05 43 | - 0.25 44 | dropout_final: 45 | - 0.1 46 | num_layers: 47 | # - 1 48 | - 2 49 | dim_embedding: 50 | - 32 51 | - 64 52 | -------------------------------------------------------------------------------- /gnn-comparison/config_GIN.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - GIN 3 | device: 4 | - cpu 5 | last_layer_fa: 6 | - false 7 | batch_size: 8 | - 32 9 | - 128 10 | learning_rate: 11 | - 0.01 12 | classifier_epochs: 13 | - 1000 14 | hidden_units: # Note: GIN add a first layer that simply adds up all node features 15 | - [64, 64, 64, 64] 16 | - [32, 32, 32, 32] 17 | - [64] 18 | - [32, 32] 19 | optimizer: 20 | - Adam 21 | scheduler: 22 | - 23 | class: StepLR 24 | args: 25 | step_size: 50 26 | gamma: 0.5 27 | loss: 28 | - MulticlassClassificationLoss 29 | train_eps: 30 | - true 31 | - false 32 | l2: 33 | - 0. 34 | aggregation: 35 | - mean 36 | - sum 37 | gradient_clipping: 38 | - null 39 | dropout: 40 | - 0.5 41 | - 0. 42 | early_stopper: 43 | - 44 | class: Patience 45 | args: 46 | patience: 500 47 | use_loss: False 48 | - 49 | class: Patience 50 | args: 51 | patience: 500 52 | use_loss: True 53 | shuffle: 54 | - True 55 | resume: 56 | - False 57 | -------------------------------------------------------------------------------- /gnn-comparison/config_GraphSAGE.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - GraphSAGE 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | last_layer_fa: 8 | - false 9 | learning_rate: 10 | - 0.0001 11 | - 0.01 12 | - 0.001 13 | l2: 14 | - 0. 
15 | classifier_epochs: 16 | - 1000 17 | optimizer: 18 | - Adam 19 | scheduler: 20 | - null 21 | loss: 22 | - MulticlassClassificationLoss 23 | gradient_clipping: 24 | - null 25 | early_stopper: 26 | - 27 | class: Patience 28 | args: 29 | patience: 500 30 | use_loss: False 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: True 36 | shuffle: 37 | - True 38 | dim_embedding: 39 | - 32 40 | - 64 41 | num_layers: 42 | - 3 43 | - 5 44 | aggregation: 45 | - add 46 | - max 47 | - mean 48 | -------------------------------------------------------------------------------- /gnn-comparison/evaluation/dataset_getter.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class DatasetGetter: 4 | 5 | def __init__(self, outer_k=None, inner_k=None): 6 | self.outer_k = outer_k 7 | self.inner_k = inner_k 8 | 9 | def set_inner_k(self, k): 10 | self.inner_k = k 11 | 12 | def get_train_val(self, dataset, batch_size, shuffle=True): 13 | return dataset.get_model_selection_fold(self.outer_k, self.inner_k, batch_size, shuffle) 14 | 15 | def get_test(self, dataset, batch_size, shuffle=True): 16 | return dataset.get_test_fold(self.outer_k, batch_size, shuffle) -------------------------------------------------------------------------------- /gnn-comparison/evaluation/model_selection/HoldOutSelector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import concurrent.futures 4 | 5 | from log.Logger import Logger 6 | 7 | 8 | class HoldOutSelector: 9 | """ 10 | Class implementing a sufficiently general framework to do model selection 11 | """ 12 | 13 | def __init__(self, max_processes): 14 | self.max_processes = max_processes 15 | 16 | # Create the experiments folder straight away 17 | self._CONFIG_BASE = 'config_' 18 | self._CONFIG_FILENAME = 'config_results.json' 19 | self.WINNER_CONFIG_FILENAME = 'winner_config.json' 20 | 21 | def process_results(self, HOLDOUT_MS_FOLDER, no_configurations): 22 | 23 | best_vl = 0. 24 | 25 | for i in range(1, no_configurations+1): 26 | try: 27 | config_filename = os.path.join(HOLDOUT_MS_FOLDER, self._CONFIG_BASE + str(i), 28 | self._CONFIG_FILENAME) 29 | 30 | with open(config_filename, 'r') as fp: 31 | config_dict = json.load(fp) 32 | 33 | vl = config_dict['VL_score'] 34 | 35 | if best_vl <= vl: 36 | best_i = i 37 | best_vl = vl 38 | best_config = config_dict 39 | 40 | except Exception as e: 41 | print(e) 42 | 43 | print('Model selection winner for experiment', HOLDOUT_MS_FOLDER, 'is config ', best_i, ':') 44 | for k in best_config.keys(): 45 | print('\t', k, ':', best_config[k]) 46 | 47 | return best_config 48 | 49 | def model_selection(self, dataset_getter, experiment_class, exp_path, model_configs, debug=False, other=None): 50 | """ 51 | :param experiment_class: the kind of experiment used 52 | :param debug: 53 | :return: the best performing configuration on average over the k folds. TL;DR RETURNS A MODEL, NOT AN ESTIMATE! 
54 | """ 55 | HOLDOUT_MS_FOLDER = os.path.join(exp_path, 'HOLDOUT_MS') 56 | 57 | if not os.path.exists(HOLDOUT_MS_FOLDER): 58 | os.makedirs(HOLDOUT_MS_FOLDER) 59 | 60 | config_id = 0 61 | 62 | pool = concurrent.futures.ProcessPoolExecutor(max_workers=self.max_processes) 63 | 64 | for config in model_configs: # generate_grid(model_configs): 65 | 66 | # Create a separate folder for each experiment 67 | exp_config_name = os.path.join(HOLDOUT_MS_FOLDER, self._CONFIG_BASE + str(config_id + 1)) 68 | if not os.path.exists(exp_config_name): 69 | os.makedirs(exp_config_name) 70 | 71 | json_config = os.path.join(exp_config_name, self._CONFIG_FILENAME) 72 | if not os.path.exists(json_config): 73 | if not debug: 74 | pool.submit(self._model_selection_helper, dataset_getter, experiment_class, config, 75 | exp_config_name, other) 76 | else: # DEBUG 77 | self._model_selection_helper(dataset_getter, experiment_class, config, exp_config_name, 78 | other) 79 | else: 80 | # Do not recompute experiments for this fold. 81 | print(f"Config {json_config} already present! Shutting down to prevent loss of previous experiments") 82 | continue 83 | 84 | config_id += 1 85 | 86 | pool.shutdown() # wait the batch of configs to terminate 87 | 88 | best_config = self.process_results(HOLDOUT_MS_FOLDER, config_id) 89 | 90 | with open(os.path.join(HOLDOUT_MS_FOLDER, self.WINNER_CONFIG_FILENAME), 'w') as fp: 91 | json.dump(best_config, fp) 92 | 93 | return best_config 94 | 95 | def _model_selection_helper(self, dataset_getter, experiment_class, config, exp_config_name, 96 | other=None): 97 | """ 98 | :param dataset_getter: 99 | :param experiment_class: 100 | :param config: 101 | :param exp_config_name: 102 | :param other: 103 | :return: 104 | """ 105 | 106 | # Create the experiment object which will be responsible for running a specific experiment 107 | experiment = experiment_class(config, exp_config_name) 108 | 109 | # Set up a log file for this experiment (run in a separate process) 110 | logger = Logger(str(os.path.join(experiment.exp_path, 'experiment.log')), mode='a') 111 | logger.log('Configuration: ' + str(experiment.model_config)) 112 | 113 | config_filename = os.path.join(experiment.exp_path, self._CONFIG_FILENAME) 114 | 115 | # ------------- PREPARE DICTIONARY TO STORE RESULTS -------------- # 116 | 117 | selection_dict = { 118 | 'config': experiment.model_config.config_dict, 119 | 'TR_score': 0., 120 | 'VL_score': 0., 121 | } 122 | 123 | dataset_getter.set_inner_k(None) # need to stay this way 124 | 125 | training_score, validation_score = experiment.run_valid(dataset_getter, logger, other) 126 | 127 | selection_dict['TR_score'] = float(training_score) 128 | selection_dict['VL_score'] = float(validation_score) 129 | 130 | logger.log('TR Accuracy: ' + str(training_score) + ' VL Accuracy: ' + str(validation_score)) 131 | 132 | with open(config_filename, 'w') as fp: 133 | json.dump(selection_dict, fp) 134 | -------------------------------------------------------------------------------- /gnn-comparison/evaluation/model_selection/K_Fold_Selection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import numpy as np 5 | import concurrent.futures 6 | from copy import deepcopy 7 | 8 | from log.Logger import Logger 9 | 10 | 11 | class KFoldSelection: 12 | """ 13 | Class implementing a sufficiently general framework to do model selection 14 | """ 15 | 16 | def __init__(self, folds, max_processes): 17 | self.folds = folds 18 | self.max_processes = 
max_processes 19 | 20 | # Create the experiments folder straight away 21 | self._CONFIG_BASE = 'config_' 22 | self._CONFIG_FILENAME = 'config_results.json' 23 | self.WINNER_CONFIG_FILENAME = 'winner_config.json' 24 | 25 | def process_results(self, KFOLD_FOLDER, no_configurations): 26 | 27 | best_avg_vl = 0. 28 | best_std_vl = 100. 29 | 30 | for i in range(1, no_configurations+1): 31 | try: 32 | config_filename = os.path.join(KFOLD_FOLDER, self._CONFIG_BASE + str(i), self._CONFIG_FILENAME) 33 | 34 | with open(config_filename, 'r') as fp: 35 | config_dict = json.load(fp) 36 | 37 | avg_vl = config_dict['avg_VL_score'] 38 | std_vl = config_dict['std_VL_score'] 39 | 40 | if (best_avg_vl < avg_vl) or (best_avg_vl == avg_vl and best_std_vl > std_vl): 41 | best_i = i 42 | best_avg_vl = avg_vl 43 | best_config = config_dict 44 | 45 | except Exception as e: 46 | print(e) 47 | 48 | print('Model selection winner for experiment', KFOLD_FOLDER, 'is config ', best_i, ':') 49 | for k in best_config.keys(): 50 | print('\t', k, ':', best_config[k]) 51 | 52 | return best_config 53 | 54 | def model_selection(self, dataset_getter, experiment_class, exp_path, model_configs, debug=False, other=None): 55 | """ 56 | :param experiment_class: the kind of experiment used 57 | :param debug: 58 | :return: the best performing configuration on average over the k folds. TL;DR RETURNS A MODEL, NOT AN ESTIMATE! 59 | """ 60 | 61 | exp_path = exp_path 62 | KFOLD_FOLDER = os.path.join(exp_path, str(self.folds) + '_FOLD_MS') 63 | 64 | if not os.path.exists(KFOLD_FOLDER): 65 | os.makedirs(KFOLD_FOLDER) 66 | 67 | config_id = 0 68 | 69 | pool = concurrent.futures.ProcessPoolExecutor(max_workers=self.max_processes) 70 | for config in model_configs: 71 | 72 | # I need to make a copy of this dictionary 73 | # It seems it gets shared between processes! 74 | cfg = deepcopy(config) 75 | 76 | # Create a separate folder for each experiment 77 | exp_config_name = os.path.join(KFOLD_FOLDER, self._CONFIG_BASE + str(config_id + 1)) 78 | if not os.path.exists(exp_config_name): 79 | os.makedirs(exp_config_name) 80 | 81 | if not debug: 82 | pool.submit(self._model_selection_helper, dataset_getter, experiment_class, cfg, 83 | exp_config_name, other) 84 | else: # DEBUG 85 | self._model_selection_helper(dataset_getter, experiment_class, cfg, 86 | exp_config_name, other) 87 | 88 | config_id += 1 89 | 90 | pool.shutdown() 91 | 92 | best_config = self.process_results(KFOLD_FOLDER, config_id) 93 | 94 | with open(os.path.join(KFOLD_FOLDER, self.WINNER_CONFIG_FILENAME), 'w') as fp: 95 | json.dump(best_config, fp) 96 | 97 | return best_config 98 | 99 | def _model_selection_helper(self, dataset_getter, experiment_class, config, exp_config_name, 100 | other=None): 101 | 102 | # Set up a log file for this experiment (run in a separate process) 103 | logger = Logger(str(os.path.join(exp_config_name, 'experiment.log')), mode='a') 104 | 105 | logger.log('Configuration: ' + str(config)) 106 | 107 | config_filename = os.path.join(exp_config_name, self._CONFIG_FILENAME) 108 | 109 | # ------------- PREPARE DICTIONARY TO STORE RESULTS -------------- # 110 | 111 | k_fold_dict = { 112 | 'config': config, 113 | 'folds': [{} for _ in range(self.folds)], 114 | 'avg_TR_score': 0., 115 | 'avg_VL_score': 0., 116 | 'std_TR_score': 0., 117 | 'std_VL_score': 0. 
118 | } 119 | 120 | for k in range(self.folds): 121 | 122 | dataset_getter.set_inner_k(k) 123 | 124 | fold_exp_folder = os.path.join(exp_config_name, 'FOLD_' + str(k + 1)) 125 | # Create the experiment object which will be responsible for running a specific experiment 126 | experiment = experiment_class(config, fold_exp_folder) 127 | 128 | training_score, validation_score = experiment.run_valid(dataset_getter, logger, other) 129 | 130 | logger.log(str(k+1) + ' split, TR Accuracy: ' + str(training_score) + 131 | ' VL Accuracy: ' + str(validation_score)) 132 | 133 | k_fold_dict['folds'][k]['TR_score'] = training_score 134 | k_fold_dict['folds'][k]['VL_score'] = validation_score 135 | 136 | tr_scores = np.array([k_fold_dict['folds'][k]['TR_score'] for k in range(self.folds)]) 137 | vl_scores = np.array([k_fold_dict['folds'][k]['VL_score'] for k in range(self.folds)]) 138 | 139 | k_fold_dict['avg_TR_score'] = tr_scores.mean() 140 | k_fold_dict['std_TR_score'] = tr_scores.std() 141 | k_fold_dict['avg_VL_score'] = vl_scores.mean() 142 | k_fold_dict['std_VL_score'] = vl_scores.std() 143 | 144 | logger.log('TR avg is ' + str(k_fold_dict['avg_TR_score']) + ' std is ' + str(k_fold_dict['std_TR_score']) + 145 | ' VL avg is ' + str(k_fold_dict['avg_VL_score']) + ' std is ' + str(k_fold_dict['std_VL_score'])) 146 | 147 | with open(config_filename, 'w') as fp: 148 | json.dump(k_fold_dict, fp) 149 | -------------------------------------------------------------------------------- /gnn-comparison/evaluation/risk_assessment/HoldOutAssessment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from config.base import Config 5 | from evaluation.dataset_getter import DatasetGetter 6 | from log.Logger import Logger 7 | 8 | 9 | class HoldOutAssessment: 10 | """ 11 | Class implementing a sufficiently general framework to do model ASSESSMENT 12 | """ 13 | 14 | def __init__(self, model_selector, exp_path, model_configs, max_processes=2): 15 | self.max_processes = max_processes 16 | self.model_configs = model_configs # Dictionary with key:list of possible values 17 | self.model_selector = model_selector 18 | 19 | # Create the experiments folder straight away 20 | self.exp_path = exp_path 21 | self._HOLDOUT_FOLDER = os.path.join(exp_path, 'HOLDOUT_ASS') 22 | self._ASSESSMENT_FILENAME = 'assessment_results.json' 23 | 24 | def risk_assessment(self, experiment_class, debug=False, other=None): 25 | """ 26 | :param experiment_class: the kind of experiment used 27 | :param debug: 28 | :return: An average over the outer test folds. RETURNS AN ESTIMATE, NOT A MODEL!!! 29 | """ 30 | if not os.path.exists(self._HOLDOUT_FOLDER): 31 | os.makedirs(self._HOLDOUT_FOLDER) 32 | else: 33 | print("Folder already present! 
Shutting down to prevent loss of previous experiments") 34 | return 35 | 36 | self._risk_assessment_helper(experiment_class, self._HOLDOUT_FOLDER, debug, other) 37 | 38 | def _risk_assessment_helper(self, experiment_class, exp_path, debug=False, other=None): 39 | 40 | dataset_getter = DatasetGetter(None) 41 | 42 | best_config = self.model_selector.model_selection(dataset_getter, experiment_class, exp_path, 43 | self.model_configs, debug, other) 44 | 45 | # Retrain with the best configuration and test 46 | experiment = experiment_class(best_config['config'], exp_path) 47 | 48 | # Set up a log file for this experiment (I am in a forked process) 49 | logger = Logger(str(os.path.join(experiment.exp_path, 'experiment.log')), mode='a') 50 | 51 | dataset_getter.set_inner_k(None) 52 | 53 | training_scores, test_scores = [], [] 54 | 55 | # Mitigate bad random initializations 56 | for i in range(3): 57 | training_score, test_score = experiment.run_test(dataset_getter, logger, other) 58 | print(f'Final training run {i + 1}: {training_score}, {test_score}') 59 | 60 | training_scores.append(training_score) 61 | test_scores.append(test_score) 62 | 63 | training_score = sum(training_scores)/3 64 | test_score = sum(test_scores)/3 65 | 66 | logger.log('TR score: ' + str(training_score) + ' TS score: ' + str(test_score)) 67 | 68 | with open(os.path.join(self._HOLDOUT_FOLDER, self._ASSESSMENT_FILENAME), 'w') as fp: 69 | json.dump({'best_config': best_config, 'HOLDOUT_TR': training_score, 'HOLDOUT_TS': test_score}, fp) 70 | -------------------------------------------------------------------------------- /gnn-comparison/experiments/EndToEndExperiment.py: -------------------------------------------------------------------------------- 1 | from models.gnn_wrapper.NetWrapper import NetWrapper 2 | 3 | from experiments.Experiment import Experiment 4 | 5 | 6 | class EndToEndExperiment(Experiment): 7 | 8 | def __init__(self, model_configuration, exp_path): 9 | super(EndToEndExperiment, self).__init__(model_configuration, exp_path) 10 | 11 | def run_valid(self, dataset_getter, logger, other=None): 12 | """ 13 | This function returns the training and validation or test accuracy 14 | :return: (training accuracy, validation/test accuracy) 15 | """ 16 | 17 | # print(self.model_config, dataset_getter.outer_k, dataset_getter.inner_k) 18 | 19 | dataset_class = self.model_config.dataset # dataset_class() 20 | 21 | if 'dense' in self.model_config: 22 | dataset = dataset_class(dense=self.model_config.dense) 23 | else: 24 | dataset = dataset_class() 25 | 26 | model_class = self.model_config.model 27 | loss_class = self.model_config.loss 28 | optim_class = self.model_config.optimizer 29 | sched_class = self.model_config.scheduler 30 | stopper_class = self.model_config.early_stopper 31 | clipping = self.model_config.gradient_clipping 32 | 33 | shuffle = self.model_config['shuffle'] if 'shuffle' in self.model_config else True 34 | 35 | train_loader, val_loader = dataset_getter.get_train_val(dataset, self.model_config['batch_size'], 36 | shuffle=shuffle) 37 | 38 | model = model_class(dim_features=dataset.dim_features, dim_target=dataset.dim_target, config=self.model_config) 39 | net = NetWrapper(model, loss_function=loss_class(), device=self.model_config['device']) 40 | 41 | optimizer = optim_class(model.parameters(), 42 | lr=self.model_config['learning_rate'], weight_decay=self.model_config['l2']) 43 | 44 | if sched_class is not None: 45 | scheduler = sched_class(optimizer) 46 | else: 47 | scheduler = None 48 | 49 | train_loss, 
train_acc, val_loss, val_acc, _, _, _ = net.train(train_loader=train_loader, 50 | max_epochs=self.model_config['classifier_epochs'], 51 | optimizer=optimizer, scheduler=scheduler, 52 | clipping=clipping, 53 | validation_loader=val_loader, 54 | early_stopping=stopper_class, 55 | logger=logger) 56 | return train_acc, val_acc 57 | 58 | def run_test(self, dataset_getter, logger, other=None): 59 | """ 60 | This function returns the training and test accuracy. DO NOT USE THE TEST FOR TRAINING OR EARLY STOPPING! 61 | :return: (training accuracy, test accuracy) 62 | """ 63 | 64 | dataset_class = self.model_config.dataset # dataset_class() 65 | 66 | if 'dense' in self.model_config: 67 | dataset = dataset_class(dense=self.model_config.dense) 68 | else: 69 | dataset = dataset_class() 70 | 71 | shuffle = self.model_config['shuffle'] if 'shuffle' in self.model_config else True 72 | 73 | model_class = self.model_config.model 74 | loss_class = self.model_config.loss 75 | optim_class = self.model_config.optimizer 76 | sched_class = self.model_config.scheduler 77 | stopper_class = self.model_config.early_stopper 78 | clipping = self.model_config.gradient_clipping 79 | 80 | train_loader, val_loader = dataset_getter.get_train_val(dataset, self.model_config['batch_size'], 81 | shuffle=shuffle) 82 | test_loader = dataset_getter.get_test(dataset, self.model_config['batch_size'], shuffle=shuffle) 83 | 84 | model = model_class(dim_features=dataset.dim_features, dim_target=dataset.dim_target, 85 | config=self.model_config) 86 | net = NetWrapper(model, loss_function=loss_class(), device=self.model_config['device']) 87 | 88 | optimizer = optim_class(model.parameters(), 89 | lr=self.model_config['learning_rate'], weight_decay=self.model_config['l2']) 90 | 91 | if sched_class is not None: 92 | scheduler = sched_class(optimizer) 93 | else: 94 | scheduler = None 95 | 96 | train_loss, train_acc, val_loss, val_acc, test_loss, test_acc, _ = \ 97 | net.train(train_loader=train_loader, max_epochs=self.model_config['classifier_epochs'], 98 | optimizer=optimizer, scheduler=scheduler, clipping=clipping, 99 | validation_loader=val_loader, test_loader=test_loader, early_stopping=stopper_class, 100 | logger=logger) 101 | 102 | return train_acc, test_acc 103 | -------------------------------------------------------------------------------- /gnn-comparison/experiments/Experiment.py: -------------------------------------------------------------------------------- 1 | import random 2 | from config.base import Config 3 | 4 | 5 | class Experiment: 6 | """ 7 | Experiment provides a layer of abstraction to avoid that all models implement the same interface 8 | """ 9 | 10 | def __init__(self, model_configuration, exp_path): 11 | self.model_config = Config.from_dict(model_configuration) 12 | self.exp_path = exp_path 13 | 14 | def run_valid(self, get_train_val, logger, other=None): 15 | """ 16 | This function returns the training and validation accuracy. 
DO WHATEVER YOU WANT WITH VL SET, 17 | BECAUSE YOU WILL MAKE PERFORMANCE ASSESSMENT ON A TEST SET 18 | :return: (training accuracy, validation accuracy) 19 | """ 20 | raise NotImplementedError('You must implement this function!') 21 | 22 | def run_test(self, get_train_val, get_test, logger, other=None): 23 | """ 24 | This function returns the training and test accuracy 25 | :return: (training accuracy, test accuracy) 26 | """ 27 | raise NotImplementedError('You must implement this function!') 28 | 29 | 30 | class ToyExperiment(Experiment): 31 | 32 | def __init__(self, model_configuration, exp_path): 33 | super(ToyExperiment, self).__init__(model_configuration, exp_path) 34 | 35 | def run_valid(self, get_train_val, logger, other=None): 36 | """ 37 | This function returns the training and validation or test accuracy 38 | :return: (training accuracy, validation/test accuracy) 39 | """ 40 | return random.uniform(0, 100), random.uniform(0, 100) 41 | 42 | def run_test(self, get_train_val, logger, get_test, other=None): 43 | """ 44 | This function returns the training and test accuracy. DO NOT USE THE TEST FOR ANY REASON 45 | :return: (training accuracy, test accuracy) 46 | """ 47 | return random.uniform(0, 100), random.uniform(0, 100) 48 | -------------------------------------------------------------------------------- /gnn-comparison/log/Logger.py: -------------------------------------------------------------------------------- 1 | class Logger: 2 | def __init__(self, filepath, mode, lock=None): 3 | """ 4 | Implements write routine 5 | :param filepath: the file where to write 6 | :param mode: can be 'w' or 'a' 7 | :param lock: pass a shared lock for multi process write access 8 | """ 9 | self.filepath = filepath 10 | if mode not in ['w', 'a']: 11 | assert False, 'Mode must be one of w, r or a' 12 | else: 13 | self.mode = mode 14 | self.lock = lock 15 | 16 | def log(self, str): 17 | if self.lock: 18 | self.lock.acquire() 19 | 20 | try: 21 | with open(self.filepath, self.mode) as f: 22 | f.write(str + '\n') 23 | except Exception as e: 24 | print(e) 25 | 26 | if self.lock: 27 | self.lock.release() 28 | 29 | 30 | -------------------------------------------------------------------------------- /gnn-comparison/log/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/log/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/gnn_wrapper/NetWrapper.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import timedelta 3 | import torch 4 | from torch import optim 5 | 6 | 7 | def format_time(avg_time): 8 | avg_time = timedelta(seconds=avg_time) 9 | total_seconds = int(avg_time.total_seconds()) 10 | hours, remainder = divmod(total_seconds, 3600) 11 | minutes, seconds = divmod(remainder, 60) 12 | return f"{hours:02d}:{minutes:02d}:{int(seconds):02d}.{str(avg_time.microseconds)[:3]}" 13 | 14 | 15 | class NetWrapper: 16 | 17 | def __init__(self, model, loss_function, device='cpu', classification=True): 18 | 
self.model = model 19 | self.loss_fun = loss_function 20 | self.device = torch.device(device) 21 | self.classification = classification 22 | 23 | def _train(self, train_loader, optimizer, clipping=None): 24 | model = self.model.to(self.device) 25 | 26 | model.train() 27 | 28 | loss_all = 0 29 | acc_all = 0 30 | for data in train_loader: 31 | 32 | data = data.to(self.device) 33 | optimizer.zero_grad() 34 | output = model(data) 35 | 36 | if not isinstance(output, tuple): 37 | output = (output,) 38 | 39 | if self.classification: 40 | loss, acc = self.loss_fun(data.y, *output) 41 | loss.backward() 42 | 43 | try: 44 | num_graphs = data.num_graphs 45 | except TypeError: 46 | num_graphs = data.adj.size(0) 47 | 48 | loss_all += loss.item() * num_graphs 49 | acc_all += acc.item() * num_graphs 50 | else: 51 | loss = self.loss_fun(data.y, *output) 52 | loss.backward() 53 | loss_all += loss.item() 54 | 55 | if clipping is not None: # Clip gradient before updating weights 56 | torch.nn.utils.clip_grad_norm_(model.parameters(), clipping) 57 | optimizer.step() 58 | 59 | if self.classification: 60 | return acc_all / len(train_loader.dataset), loss_all / len(train_loader.dataset) 61 | else: 62 | return None, loss_all / len(train_loader.dataset) 63 | 64 | def classify_graphs(self, loader): 65 | model = self.model.to(self.device) 66 | model.eval() 67 | 68 | loss_all = 0 69 | acc_all = 0 70 | for data in loader: 71 | data = data.to(self.device) 72 | output = model(data) 73 | 74 | if not isinstance(output, tuple): 75 | output = (output,) 76 | 77 | if self.classification: 78 | loss, acc = self.loss_fun(data.y, *output) 79 | 80 | try: 81 | num_graphs = data.num_graphs 82 | except TypeError: 83 | num_graphs = data.adj.size(0) 84 | 85 | loss_all += loss.item() * num_graphs 86 | acc_all += acc.item() * num_graphs 87 | else: 88 | loss = self.loss_fun(data.y, *output) 89 | loss_all += loss.item() 90 | 91 | if self.classification: 92 | return acc_all / len(loader.dataset), loss_all / len(loader.dataset) 93 | else: 94 | return None, loss_all / len(loader.dataset) 95 | 96 | def train(self, train_loader, max_epochs=100, optimizer=torch.optim.Adam, scheduler=None, clipping=None, 97 | validation_loader=None, test_loader=None, early_stopping=None, logger=None, log_every=10): 98 | 99 | early_stopper = early_stopping() if early_stopping is not None else None 100 | 101 | val_loss, val_acc = -1, -1 102 | test_loss, test_acc = None, None 103 | 104 | time_per_epoch = [] 105 | 106 | for epoch in range(1, max_epochs+1): 107 | 108 | start = time.time() 109 | train_acc, train_loss = self._train(train_loader, optimizer, clipping) 110 | end = time.time() - start 111 | time_per_epoch.append(end) 112 | 113 | if scheduler is not None: 114 | scheduler.step(epoch) 115 | 116 | if test_loader is not None: 117 | test_acc, test_loss = self.classify_graphs(test_loader) 118 | 119 | if validation_loader is not None: 120 | val_acc, val_loss = self.classify_graphs(validation_loader) 121 | 122 | # Early stopping (lazy if evaluation) 123 | if early_stopper is not None and early_stopper.stop(epoch, val_loss, val_acc, 124 | test_loss, test_acc, 125 | train_loss, train_acc): 126 | msg = f'Stopping at epoch {epoch}, best is {early_stopper.get_best_vl_metrics()}' 127 | if logger is not None: 128 | logger.log(msg) 129 | print(msg) 130 | else: 131 | print(msg) 132 | break 133 | 134 | if epoch % log_every == 0 or epoch == 1: 135 | msg = f'Epoch: {epoch}, TR loss: {train_loss} TR acc: {train_acc}, VL loss: {val_loss} VL acc: {val_acc} ' \ 136 | f'TE loss: 
{test_loss} TE acc: {test_acc}' 137 | if logger is not None: 138 | logger.log(msg) 139 | print(msg) 140 | else: 141 | print(msg) 142 | 143 | time_per_epoch = torch.tensor(time_per_epoch) 144 | avg_time_per_epoch = float(time_per_epoch.mean()) 145 | 146 | elapsed = format_time(avg_time_per_epoch) 147 | 148 | if early_stopper is not None: 149 | train_loss, train_acc, val_loss, val_acc, test_loss, test_acc, best_epoch = early_stopper.get_best_vl_metrics() 150 | 151 | return train_loss, train_acc, val_loss, val_acc, test_loss, test_acc, elapsed 152 | -------------------------------------------------------------------------------- /gnn-comparison/models/gnn_wrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/gnn_wrapper/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/DGCNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_geometric 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch_geometric.nn import MessagePassing, global_sort_pool 6 | from torch_geometric.utils import add_self_loops, degree 7 | 8 | 9 | class DGCNN(nn.Module): 10 | """ 11 | Uses fixed architecture 12 | """ 13 | 14 | def __init__(self, dim_features, dim_target, config): 15 | super(DGCNN, self).__init__() 16 | 17 | self.ks = {'NCI1': { '0.6': 30, '0.9': 46 }, 18 | 'PROTEINS_full': { '0.6': 32, '0.9': 81 }, 19 | 'DD': {'0.6': 291, '0.9': 503 }, 20 | 'ENZYMES': { '0.6': 36, '0.9': 48 }, 21 | 'IMDB-BINARY': { '0.6': 18, '0.9': 31 }, 22 | 'IMDB-MULTI': { '0.6': 11, '0.9': 22 }, 23 | 'REDDIT-BINARY': { '0.6': 370, '0.9': 1002 }, 24 | 'REDDIT-MULTI-5K': { '0.6': 469, '0.9': 1081 }, 25 | 'COLLAB': { '0.6': 61, '0.9': 130 }, 26 | } 27 | 28 | self.k = self.ks[config.dataset.name][str(config['k'])] 29 | self.embedding_dim = config['embedding_dim'] 30 | self.num_layers = config['num_layers'] 31 | self.last_layer_fa = config['last_layer_fa'] 32 | if self.last_layer_fa: 33 | print('Using LastLayerFA') 34 | 35 | self.convs = [] 36 | for layer in range(self.num_layers): 37 | input_dim = dim_features if layer == 0 else self.embedding_dim 38 | self.convs.append(DGCNNConv(input_dim, self.embedding_dim)) 39 | self.total_latent_dim = self.num_layers * self.embedding_dim 40 | 41 | # Add last embedding 42 | self.convs.append(DGCNNConv(self.embedding_dim, 1)) 43 | self.total_latent_dim += 1 44 | 45 | self.convs = nn.ModuleList(self.convs) 46 | 47 | # should we leave this fixed? 48 | self.conv1d_params1 = nn.Conv1d(1, 16, self.total_latent_dim, self.total_latent_dim) 49 | self.maxpool1d = nn.MaxPool1d(2, 2) 50 | self.conv1d_params2 = nn.Conv1d(16, 32, 5, 1) 51 | 52 | dense_dim = int((self.k - 2) / 2 + 1) 53 | self.input_dense_dim = (dense_dim - 5 + 1) * 32 54 | 55 | self.hidden_dense_dim = config['dense_dim'] 56 | self.dense_layer = nn.Sequential(nn.Linear(self.input_dense_dim, self.hidden_dense_dim), 57 | nn.ReLU(), 58 | nn.Dropout(p=0.5), 59 | nn.Linear(self.hidden_dense_dim, dim_target)) 60 | 61 | def forward(self, data): 62 | # Implement Equation 4.2 of the paper i.e. 
concat all layers' graph representations and apply linear model 63 | # note: this can be decomposed into one smaller linear model per layer 64 | x, edge_index, batch = data.x, data.edge_index, data.batch 65 | 66 | hidden_repres = [] 67 | 68 | for i, conv in enumerate(self.convs): 69 | edges = edge_index 70 | if self.last_layer_fa and i == len(self.convs) - 1: 71 | block_map = torch.eq(batch.unsqueeze(0), batch.unsqueeze(-1)).int() 72 | edges, _ = torch_geometric.utils.dense_to_sparse(block_map) 73 | x = torch.tanh(conv(x, edges)) 74 | hidden_repres.append(x) 75 | 76 | # apply sortpool 77 | x_to_sortpool = torch.cat(hidden_repres, dim=1) 78 | x_1d = global_sort_pool(x_to_sortpool, batch, self.k) # in the code the authors sort the last channel only 79 | 80 | # apply 1D convolutional layers 81 | x_1d = torch.unsqueeze(x_1d, dim=1) 82 | conv1d_res = F.relu(self.conv1d_params1(x_1d)) 83 | conv1d_res = self.maxpool1d(conv1d_res) 84 | conv1d_res = F.relu(self.conv1d_params2(conv1d_res)) 85 | conv1d_res = conv1d_res.reshape(conv1d_res.shape[0], -1) 86 | 87 | # apply dense layer 88 | out_dense = self.dense_layer(conv1d_res) 89 | return out_dense 90 | 91 | 92 | class DGCNNConv(MessagePassing): 93 | """ 94 | Extended from the tutorial on GCNs of PyTorch Geometric 95 | """ 96 | 97 | def __init__(self, in_channels, out_channels): 98 | super(DGCNNConv, self).__init__(aggr='add') # "Add" aggregation. 99 | self.lin = nn.Linear(in_channels, out_channels) 100 | 101 | def forward(self, x, edge_index): 102 | # x has shape [N, in_channels] 103 | # edge_index has shape [2, E] 104 | 105 | # Step 1: Add self-loops to the adjacency matrix. 106 | edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0)) 107 | 108 | # Step 2: Linearly transform node feature matrix. 109 | x = self.lin(x) 110 | 111 | # Step 3-5: Start propagating messages. 112 | return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x) 113 | 114 | def message(self, x_j, edge_index, size): 115 | # x_j has shape [E, out_channels] 116 | 117 | # Step 3: Normalize node features. 118 | src, dst = edge_index # we assume source_to_target message passing 119 | deg = degree(src, size[0], dtype=x_j.dtype) 120 | deg = deg.pow(-1) 121 | norm = deg[dst] 122 | 123 | return norm.view(-1, 1) * x_j # broadcasting the normalization term to all out_channels === hidden features 124 | 125 | def update(self, aggr_out): 126 | # aggr_out has shape [N, out_channels] 127 | 128 | # Step 5: Return new node embeddings.
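# Note: each message was already scaled by 1/deg(target) in message(), so for the
# undirected graphs used here the 'add' aggregation effectively yields a mean over a
# node's incoming messages (including the self-loop added in forward()); nothing else
# is needed before returning.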
129 | return aggr_out 130 | 131 | def __repr__(self): 132 | return '{}({}, {})'.format(self.__class__.__name__, self.in_channels, 133 | self.out_channels) 134 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/DeepMultisets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Linear 4 | from torch_geometric.nn import global_add_pool 5 | 6 | 7 | class DeepMultisets(torch.nn.Module): 8 | 9 | def __init__(self, dim_features, dim_target, config): 10 | super(DeepMultisets, self).__init__() 11 | 12 | hidden_units = config['hidden_units'] 13 | 14 | self.fc_vertex = Linear(dim_features, hidden_units) 15 | self.fc_global1 = Linear(hidden_units, hidden_units) 16 | self.fc_global2 = Linear(hidden_units, dim_target) 17 | 18 | def forward(self, data): 19 | x, batch = data.x, data.batch 20 | 21 | x = F.relu(self.fc_vertex(x)) 22 | x = global_add_pool(x, batch) # sums all vertex embeddings belonging to the same graph! 23 | x = F.relu(self.fc_global1(x)) 24 | x = self.fc_global2(x) 25 | return x 26 | 27 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/GIN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import BatchNorm1d 4 | from torch.nn import Sequential, Linear, ReLU 5 | import torch_geometric 6 | from torch_geometric.nn import GINConv, global_add_pool, global_mean_pool 7 | 8 | from models.graph_classifiers.self_attention import SelfAttention 9 | 10 | 11 | class GIN(torch.nn.Module): 12 | 13 | def __init__(self, dim_features, dim_target, config): 14 | super(GIN, self).__init__() 15 | 16 | self.config = config 17 | self.dropout = config['dropout'] 18 | self.embeddings_dim = [config['hidden_units'][0]] + config['hidden_units'] 19 | self.no_layers = len(self.embeddings_dim) 20 | self.first_h = [] 21 | self.nns = [] 22 | self.convs = [] 23 | self.linears = [] 24 | self.last_layer_fa = config['last_layer_fa'] 25 | if self.last_layer_fa: 26 | print('Using LastLayerFA') 27 | 28 | train_eps = config['train_eps'] 29 | if config['aggregation'] == 'sum': 30 | self.pooling = global_add_pool 31 | elif config['aggregation'] == 'mean': 32 | self.pooling = global_mean_pool 33 | 34 | for layer, out_emb_dim in enumerate(self.embeddings_dim): 35 | 36 | if layer == 0: 37 | self.first_h = Sequential(Linear(dim_features, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU(), 38 | Linear(out_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU()) 39 | self.linears.append(Linear(out_emb_dim, dim_target)) 40 | else: 41 | input_emb_dim = self.embeddings_dim[layer-1] 42 | self.nns.append(Sequential(Linear(input_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU(), 43 | Linear(out_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU())) 44 | self.convs.append(GINConv(self.nns[-1], train_eps=train_eps)) # Eq. 
4.2 45 | 46 | self.linears.append(Linear(out_emb_dim, dim_target)) 47 | 48 | 49 | self.nns = torch.nn.ModuleList(self.nns) 50 | self.convs = torch.nn.ModuleList(self.convs) 51 | self.linears = torch.nn.ModuleList(self.linears) # has got one more for initial input 52 | 53 | def forward(self, data): 54 | x, edge_index, batch = data.x, data.edge_index, data.batch 55 | 56 | out = 0 57 | 58 | for layer in range(self.no_layers): 59 | if layer == 0: 60 | x = self.first_h(x) 61 | 62 | out += F.dropout(self.pooling(self.linears[layer](x), batch), p=self.dropout) 63 | else: 64 | # Layer l ("convolution" layer) 65 | edges = edge_index 66 | if self.last_layer_fa and layer == self.no_layers - 1: 67 | block_map = torch.eq(batch.unsqueeze(0), batch.unsqueeze(-1)).int() 68 | edges, _ = torch_geometric.utils.dense_to_sparse(block_map) 69 | x = self.convs[layer-1](x, edges) 70 | out += F.dropout(self.linears[layer](self.pooling(x, batch)), p=self.dropout, training=self.training) 71 | 72 | return out 73 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/GraphSAGE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_geometric 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch_geometric.nn import SAGEConv, global_max_pool 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class GraphSAGE(nn.Module): 13 | def __init__(self, dim_features, dim_target, config): 14 | super().__init__() 15 | 16 | num_layers = config['num_layers'] 17 | dim_embedding = config['dim_embedding'] 18 | self.aggregation = config['aggregation'] # can be mean or max 19 | self.last_layer_fa = config['last_layer_fa'] 20 | if self.last_layer_fa: 21 | print('Using LastLayerFA') 22 | 23 | if self.aggregation == 'max': 24 | self.fc_max = nn.Linear(dim_embedding, dim_embedding) 25 | 26 | self.layers = nn.ModuleList([]) 27 | for i in range(num_layers): 28 | dim_input = dim_features if i == 0 else dim_embedding 29 | 30 | conv = SAGEConv(dim_input, dim_embedding) 31 | # Overwrite aggregation method (default is set to mean 32 | conv.aggr = self.aggregation 33 | 34 | self.layers.append(conv) 35 | 36 | # For graph classification 37 | self.fc1 = nn.Linear(num_layers * dim_embedding, dim_embedding) 38 | self.fc2 = nn.Linear(dim_embedding, dim_target) 39 | 40 | def forward(self, data): 41 | x, edge_index, batch = data.x, data.edge_index, data.batch 42 | 43 | x_all = [] 44 | 45 | for i, layer in enumerate(self.layers): 46 | edges = edge_index 47 | if self.last_layer_fa and i == len(self.layers) - 1: 48 | block_map = torch.eq(batch.unsqueeze(0), batch.unsqueeze(-1)).int() 49 | edges, _ = torch_geometric.utils.dense_to_sparse(block_map) 50 | x = layer(x, edges) 51 | if self.aggregation == 'max': 52 | x = torch.relu(self.fc_max(x)) 53 | x_all.append(x) 54 | 55 | x = torch.cat(x_all, dim=1) 56 | x = global_max_pool(x, batch) 57 | 58 | x = F.relu(self.fc1(x)) 59 | x = self.fc2(x) 60 | return x 61 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/MLP_Classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Linear 4 | 5 | 6 | class MLPClassifier(torch.nn.Module): 7 | 8 | def __init__(self, dim_features, dim_target, config): 9 | super(MLPClassifier, self).__init__() 10 | 11 | hidden_units 
= config['hidden_units'] 12 | 13 | self.fc_global = Linear(dim_features, hidden_units) 14 | self.out = Linear(hidden_units, dim_target) 15 | 16 | def forward(self, x, batch): 17 | return self.out(F.relu(self.fc_global(x))) 18 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/MolecularFingerprint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import ReLU 3 | from torch_geometric.nn import global_add_pool 4 | 5 | 6 | class MolecularFingerprint(torch.nn.Module): 7 | 8 | def __init__(self, dim_features, dim_target, config): 9 | super(MolecularFingerprint, self).__init__() 10 | hidden_dim = config['hidden_units'] 11 | 12 | self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_features, hidden_dim), ReLU(), 13 | torch.nn.Linear(hidden_dim, dim_target), ReLU()) 14 | 15 | def forward(self, data): 16 | return self.mlp(global_add_pool(data.x, data.batch)) 17 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/graph_classifiers/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/self_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SelfAttention(torch.nn.Module): 4 | def __init__(self, 5 | num_heads, 6 | model_dim, 7 | dropout_keep_prob): 8 | super(SelfAttention, self).__init__() 9 | 10 | self.num_heads = num_heads 11 | self.model_dim = model_dim 12 | self.dropout_keep_prob = dropout_keep_prob 13 | self.q_layer = torch.nn.Linear(model_dim, model_dim * self.num_heads, bias=False) 14 | self.out_layer = torch.nn.Linear(model_dim * self.num_heads, model_dim, bias=False) 15 | self.out_layer2 = torch.nn.Linear(model_dim * 2, model_dim, bias=False) 16 | self.relu = torch.nn.ReLU() 17 | self.softmax = torch.nn.Softmax(dim=-1) 18 | self.dropout = torch.nn.Dropout(1- dropout_keep_prob) 19 | 20 | def forward(self, batched_inputs, attn_mask=None): 21 | q = self._linear_projection(batched_inputs) 22 | qs = self._split_heads(q) 23 | tiled_inputs = batched_inputs.unsqueeze(1).repeat(1, self.num_heads, 1, 1) 24 | outputs = self._scaled_dot_product(qs, tiled_inputs, tiled_inputs, attn_mask) # (batch, num_heads, max_contexts, value_dim) 25 | outputs = self._concat_heads(outputs) # (batch, max_contexts, value_dim * num_heads) 26 | if self.num_heads > 1: 27 | outputs = self.out_layer(outputs) # (batch, max_contexts, model_dim) 28 | outputs = self.relu(outputs) # (batch, max_contexts, model_dim) 29 | #outputs = self.dropout(outputs) 30 | outputs = torch.cat([outputs, batched_inputs], dim=-1) # (batch, max_contexts, 2 * model_dim) 31 | outputs = self.out_layer2(outputs) # (batch, max_contexts, model_dim)c 32 | outputs = self.relu(outputs) # (batch, max_contexts, model_dim) 33 | return outputs 34 | 35 | def _linear_projection(self, batched_inputs): 36 | q = self.q_layer(batched_inputs) # (batch, max_contexts, key_dim * num_heads) 37 | # k = tf.layers.dense(batched_inputs, units=self.model_dim, 38 | # use_bias=False) # (batch, max_contexts, key_dim * num_heads) 39 | return q 40 | 41 | def _split_heads(self, q): 42 | 43 | def 
split_last_dimension_then_transpose(tensor, num_heads, dim): 44 | tensor = tensor.view([-1, tensor.size()[1], num_heads, 45 | dim]) # (batch, max_contexts, num_heads, dim) 46 | return tensor.transpose(1,2) # (batch, num_heads, max_contexts, dim) 47 | 48 | qs = split_last_dimension_then_transpose(q, self.num_heads, 49 | self.model_dim) # (batch, num_heads, max_contexts, key_dim) 50 | # ks = split_last_dimension_then_transpose(k, self.num_heads, 51 | # self.model_dim) # (batch, num_heads, max_contexts, key_dim) 52 | return qs 53 | 54 | def _scaled_dot_product(self, qs, ks, tiled_inputs, valid_mask): 55 | queries_dot_keys = torch.matmul(qs, ks.transpose(2,3)) # (batch, num_heads, max_contexts, max_contexts) 56 | scaled_scores = queries_dot_keys #/ ((self.model_dim // self.num_heads) ** 0.5) # (batch, num_heads, max_contexts, max_contexts) 57 | 58 | if valid_mask is not None: 59 | mask = torch.log(valid_mask.view(valid_mask.size()[0], 1, 1, valid_mask.size()[1])) # (batch, 1, 1, max_contexts) 60 | scaled_scores += mask 61 | 62 | attention_weights = self.softmax(scaled_scores) # (batch, num_heads, max_contexts, max_contexts) 63 | return torch.matmul(attention_weights, tiled_inputs) # (batch, num_heads, max_contexts, value_dim) 64 | 65 | def _concat_heads(self, outputs): 66 | # outputs: (batch, num_heads, max_contexts, value_dim) 67 | max_contexts = outputs.size()[2] 68 | tensor = outputs.transpose(1, 2) # [batch, max_contexts, num_heads, value_dim] 69 | return tensor.contiguous().view([-1, max_contexts, self.model_dim * self.num_heads]) 70 | -------------------------------------------------------------------------------- /gnn-comparison/models/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class ClassificationLoss(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | self.loss = None 9 | 10 | def forward(self, targets, *outputs): 11 | """ 12 | :param targets: 13 | :param outputs: 14 | :return: loss and accuracy values 15 | """ 16 | outputs = outputs[0] 17 | loss = self.loss(outputs, targets) 18 | accuracy = self._calculate_accuracy(outputs, targets) 19 | return loss, accuracy 20 | 21 | def _get_correct(self, outputs): 22 | raise NotImplementedError() 23 | 24 | def _calculate_accuracy(self, outputs, targets): 25 | correct = self._get_correct(outputs) 26 | return 100. 
* (correct == targets).sum().float() / targets.size(0) 27 | 28 | 29 | class BinaryClassificationLoss(ClassificationLoss): 30 | def __init__(self, reduction=None): 31 | super().__init__() 32 | if reduction is not None: 33 | self.loss = nn.BCEWithLogitsLoss(reduction=reduction) 34 | else: 35 | self.loss = nn.BCEWithLogitsLoss() 36 | 37 | def _get_correct(self, outputs): 38 | return outputs > 0.5 39 | 40 | 41 | class MulticlassClassificationLoss(ClassificationLoss): 42 | def __init__(self, reduction=None): 43 | super().__init__() 44 | if reduction is not None: 45 | self.loss = nn.CrossEntropyLoss(reduction=reduction) 46 | else: 47 | self.loss = nn.CrossEntropyLoss() 48 | 49 | def _get_correct(self, outputs): 50 | return torch.argmax(outputs, dim=1) 51 | 52 | 53 | class RegressionLoss(nn.Module): 54 | def __init__(self): 55 | super().__init__() 56 | self.loss = None 57 | 58 | def forward(self, targets, *outputs): 59 | """ 60 | 61 | :param targets: 62 | :param outputs: 63 | :return: a loss value 64 | """ 65 | raise NotImplementedError() 66 | 67 | 68 | class CovarianceResidualError(RegressionLoss): # For Cascade "Correlation" 69 | def __init__(self): 70 | super().__init__() 71 | 72 | def forward(self, targets, *outputs): 73 | _, _, graph_emb, errors = outputs 74 | 75 | errors_minus_mean = errors - torch.mean(errors, dim=0) 76 | activations_minus_mean = graph_emb - torch.mean(graph_emb, dim=0) 77 | 78 | # todo check against commented code 79 | cov_per_pattern = torch.zeros(errors.shape) 80 | 81 | cov_error = 0. 82 | for o in range(errors.shape[1]): # for each output unit 83 | for i in range(errors.shape[0]): # for each pattern 84 | cov_per_pattern[i, o] = errors_minus_mean[i, o]*activations_minus_mean[i, 0] 85 | 86 | cov_error = cov_error + torch.abs(torch.sum(cov_per_pattern[:, o])) 87 | 88 | #print(torch.mean(cov_per_pattern, dim=0), torch.mean(errors_minus_mean), torch.mean(graph_emb)) 89 | 90 | ''' 91 | activations_minus_mean = torch.sum(activations_minus_mean, dim=1) 92 | activations_minus_mean = torch.unsqueeze(activations_minus_mean, dim=1) 93 | 94 | activations_minus_mean = torch.t(activations_minus_mean) 95 | 96 | cov_per_pattern = torch.mm(activations_minus_mean, errors_minus_mean) 97 | 98 | cov_abs = torch.abs(cov_per_pattern) 99 | 100 | # sum over output "units" 101 | cov_error = torch.sum(cov_abs) 102 | ''' 103 | 104 | # Minus --> maximization problem! 105 | return - cov_error 106 | 107 | 108 | class NN4GMulticlassClassificationLoss(MulticlassClassificationLoss): 109 | 110 | def mse(self, ts, ys, return_sum): 111 | 112 | targets_oh = torch.zeros(ys.shape) 113 | ts = ts.unsqueeze(1) 114 | targets_oh.scatter_(1, ts, value=1.) 
# src must not be specified 115 | ts = targets_oh 116 | 117 | if return_sum == True: 118 | return torch.sum(0.5 * (ts - ys) ** 2) / len(ts) 119 | else: 120 | return 0.5 * (ts - ys) ** 2 / len(ts) 121 | 122 | def forward(self, targets, *outputs): 123 | 124 | preds, _, _, _ = outputs 125 | 126 | # Try MSE 127 | loss = self.mse(targets, preds, return_sum=True) 128 | 129 | #loss = self.loss(preds, targets) 130 | 131 | accuracy = self._calculate_accuracy(preds, targets) 132 | return loss, accuracy 133 | 134 | 135 | class DiffPoolMulticlassClassificationLoss(MulticlassClassificationLoss): 136 | """ 137 | DiffPool - No Link Prediction Loss 138 | """ 139 | 140 | def forward(self, targets, *outputs): 141 | preds, lp_loss, ent_loss = outputs 142 | 143 | if targets.dim() > 1 and targets.size(1) == 1: 144 | targets = targets.squeeze(1) 145 | 146 | loss = self.loss(preds, targets) 147 | accuracy = self._calculate_accuracy(preds, targets) 148 | return loss + lp_loss + ent_loss, accuracy 149 | -------------------------------------------------------------------------------- /gnn-comparison/models/schedulers/ECCScheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import StepLR 2 | 3 | 4 | class ECCLR(StepLR): 5 | 6 | def __init__(self, optimizer, step_size=1, gamma=0.1, last_epoch=-1): 7 | self.step_size = step_size # does not matter 8 | self.gamma = gamma 9 | super(ECCLR, self).__init__(optimizer, step_size=step_size, gamma=gamma, last_epoch=last_epoch) 10 | 11 | def get_lr(self): 12 | if self.last_epoch in [25, 35, 45]: 13 | return [group['lr'] * self.gamma 14 | for group in self.optimizer.param_groups] 15 | else: 16 | return [group['lr'] for group in self.optimizer.param_groups] 17 | -------------------------------------------------------------------------------- /gnn-comparison/models/utils/EarlyStopper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | class EarlyStopper: 5 | 6 | def stop(self, epoch, val_loss, val_acc=None, test_loss=None, test_acc=None, train_loss=None, train_acc=None): 7 | raise NotImplementedError("Implement this method!") 8 | 9 | def get_best_vl_metrics(self): 10 | return self.train_loss, self.train_acc, self.val_loss, self.val_acc, self.test_loss, self.test_acc, self.best_epoch 11 | 12 | 13 | class GLStopper(EarlyStopper): 14 | 15 | ''' 16 | Implement Generalization Loss technique (Prechelt 1997) 17 | ''' 18 | 19 | def __init__(self, starting_epoch, alpha=5, use_loss=True): 20 | self.local_optimum = float("inf") if use_loss else -float("inf") 21 | self.use_loss = use_loss 22 | self.alpha = alpha 23 | self.best_epoch = -1 24 | self.counter = None 25 | self.starting_epoch = starting_epoch 26 | 27 | self.train_loss, self.train_acc = None, None 28 | self.val_loss, self.val_acc = None, None 29 | self.test_loss, self.test_acc = None, None 30 | 31 | def stop(self, epoch, val_loss, val_acc=None, test_loss=None, test_acc=None, train_loss=None, train_acc=None): 32 | 33 | if epoch <= self.starting_epoch: 34 | return False 35 | 36 | if self.use_loss: 37 | if val_loss <= self.local_optimum: 38 | self.local_optimum = val_loss 39 | self.best_epoch = epoch 40 | self.train_loss, self.train_acc = train_loss, train_acc 41 | self.val_loss, self.val_acc = val_loss, val_acc 42 | self.test_loss, self.test_acc = test_loss, test_acc 43 | return False 44 | else: 45 | return 100*(val_loss/self.local_optimum - 1) > self.alpha 46 | else: 47 | if val_acc >= 
self.local_optimum: 48 | self.local_optimum = val_acc 49 | self.best_epoch = epoch 50 | self.train_loss, self.train_acc = train_loss, train_acc 51 | self.val_loss, self.val_acc = val_loss, val_acc 52 | self.test_loss, self.test_acc = test_loss, test_acc 53 | return False 54 | else: 55 | return (self.local_optimum/val_acc - 1) > self.alpha 56 | 57 | 58 | class Patience(EarlyStopper): 59 | 60 | ''' 61 | Implement common "patience" technique 62 | ''' 63 | 64 | def __init__(self, patience=20, use_loss=True): 65 | self.local_val_optimum = float("inf") if use_loss else -float("inf") 66 | self.use_loss = use_loss 67 | self.patience = patience 68 | self.best_epoch = -1 69 | self.counter = -1 70 | 71 | self.train_loss, self.train_acc = None, None 72 | self.val_loss, self.val_acc = None, None 73 | self.test_loss, self.test_acc = None, None 74 | 75 | def stop(self, epoch, val_loss, val_acc=None, test_loss=None, test_acc=None, train_loss=None, train_acc=None): 76 | if self.use_loss: 77 | if val_loss <= self.local_val_optimum: 78 | self.counter = 0 79 | self.local_val_optimum = val_loss 80 | self.best_epoch = epoch 81 | self.train_loss, self.train_acc = train_loss, train_acc 82 | self.val_loss, self.val_acc = val_loss, val_acc 83 | self.test_loss, self.test_acc = test_loss, test_acc 84 | return False 85 | else: 86 | self.counter += 1 87 | return self.counter >= self.patience 88 | else: 89 | if val_acc >= self.local_val_optimum: 90 | self.counter = 0 91 | self.local_val_optimum = val_acc 92 | self.best_epoch = epoch 93 | self.train_loss, self.train_acc = train_loss, train_acc 94 | self.val_loss, self.val_acc = val_loss, val_acc 95 | self.test_loss, self.test_acc = test_loss, test_acc 96 | return False 97 | else: 98 | self.counter += 1 99 | return self.counter >= self.patience 100 | 101 | -------------------------------------------------------------------------------- /gnn-comparison/models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/utils/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx 2 | requests 3 | pyyaml 4 | torch 5 | torch_scatter 6 | torch_sparse 7 | torch_cluster 8 | torch_geometric 9 | -------------------------------------------------------------------------------- /gnn-comparison/utils/batch_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import networkx as nx 4 | 5 | from torch_geometric.data import Data, Batch 6 | from torch_geometric.utils import dense_to_sparse, scatter_, to_dense_adj 7 | 8 | 9 | def construct_mask_indices(sizes): 10 | num_rows, num_cols = sum(sizes), len(sizes) 11 | 12 | indices = [] 13 | for i, size in enumerate(sizes): 14 | cum_sum = sum(sizes[:i]) 15 | indices.append((cum_sum, cum_sum + size)) 16 | return indices 17 | 18 | 19 | def _make_block_diag(mats, mat_sizes): 20 | block_diag = torch.zeros(sum(mat_sizes), sum(mat_sizes)) 21 | 22 | for i, (mat, size) in enumerate(zip(mats, mat_sizes)): 23 | cum_size = sum(mat_sizes[:i]) 24 | block_diag[cum_size:cum_size+size,cum_size:cum_size+size] = mat 25 | 26 | return block_diag 27 | 28 | 29 | def make_block_diag(data): 30 | data = data.to_data_list() 31 | adjs = [to_dense_adj(d.edge_index).squeeze(0) 
for d in data] 32 | adj_sizes = [a.size(0) for a in adjs] 33 | bd_mat = _make_block_diag(adjs, adj_sizes) 34 | mask_indices = construct_mask_indices(adj_sizes) 35 | return bd_mat, mask_indices 36 | 37 | 38 | def get_adj(block_diag, index): 39 | from_i, to_i = index 40 | return block_diag[from_i:to_i, from_i:to_i] 41 | 42 | 43 | def mock_batch(batch_size): 44 | """construct pyG batch""" 45 | graphs = [] 46 | while len(graphs) < batch_size: 47 | G = nx.erdos_renyi_graph(np.random.choice([300, 500]), 0.5) 48 | if G.number_of_edges() > 1: 49 | graphs.append(G) 50 | 51 | adjs = [torch.from_numpy(nx.to_numpy_array(G)) for G in graphs] 52 | graph_data = [dense_to_sparse(A) for A in adjs] 53 | data_list = [Data(x=x, edge_index=e) for (e, x) in graph_data] 54 | return Batch.from_data_list(data_list) 55 | 56 | 57 | def test(): 58 | batch_size = 3 59 | data = mock_batch(batch_size=batch_size) 60 | 61 | # create block diagonal matrix of batch 62 | # block size: [nodes_in_batch] x [nodes_in_batch] 63 | block_diag, indices = make_block_diag(data) 64 | for i in range(batch_size): 65 | graph_adj = get_adj(block_diag, indices[i]) 66 | print(graph_adj) -------------------------------------------------------------------------------- /gnn-comparison/utils/eval_across_folds.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | import glob 4 | import numpy as np 5 | 6 | if __name__ == '__main__': 7 | parser = ArgumentParser() 8 | parser.add_argument("--dir", dest="results_dir", required=True) 9 | 10 | args = parser.parse_args() 11 | results_dir = args.results_dir 12 | num_folds = len(glob.glob(f'{results_dir}/GIN_NCI1_assessment/*/*/')) 13 | num_config = len(glob.glob(f'{results_dir}/GIN_NCI1_assessment/*/OUTER_FOLD_1/HOLDOUT_MS/*/')) 14 | 15 | results = {} 16 | config_values = {} 17 | for config_id in range(1, num_config + 1): 18 | config_result_jsons = glob.glob(f'{results_dir}/GIN_NCI1_assessment/*/*/HOLDOUT_MS/config_{config_id}/config_results.json') 19 | count = 0 20 | values = [] 21 | for json_path in config_result_jsons: 22 | with open(json_path, 'r') as file: 23 | obj = json.load(file) 24 | count += 1 25 | values.append(obj['VL_score']) 26 | if count > 0: 27 | config_values[config_id] = obj['config'] 28 | results[config_id] = (np.mean(values), np.std(values), count) 29 | 30 | sorted_configs = [(k, v) for k, v in sorted(results.items(), key=lambda item: item[1][0], reverse=True)] 31 | for config_id, results in sorted_configs: 32 | print(f'Config id: {config_id}: {results[0]} std: {results[1]} (count: {results[2]})') 33 | print(config_values[config_id]) 34 | print() -------------------------------------------------------------------------------- /gnn-comparison/utils/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | 5 | class NumpyEncoder(json.JSONEncoder): 6 | def default(self, obj): 7 | if isinstance(obj, np.ndarray): 8 | return obj.tolist() 9 | return json.JSONEncoder.default(self, obj) 10 | 11 | 12 | def one_hot(value, num_classes): 13 | vec = np.zeros(num_classes) 14 | vec[value - 1] = 1 15 | return vec 16 | 17 | 18 | def get_max_num_nodes(dataset_str): 19 | import datasets 20 | dataset = getattr(datasets, dataset_str)() 21 | 22 | max_num_nodes = -1 23 | for d in dataset.dataset: 24 | max_num_nodes = max(max_num_nodes, d.num_nodes) 25 | return max_num_nodes 26 | 
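A minimal usage sketch of the helpers above (illustrative only, not a file of this repository; it assumes one_hot and NumpyEncoder from utils.py are in scope, and the values are made up):

import json
import numpy as np

vec = one_hot(3, num_classes=5)   # labels are 1-based above, so this sets index 2 -> [0., 0., 1., 0., 0.]

# NumpyEncoder lets json.dumps serialize numpy arrays by converting them to plain lists
fold_scores = {'VL_score': 81.25, 'per_fold': np.array([80.0, 82.5])}
print(json.dumps(fold_scores, cls=NumpyEncoder))   # {"VL_score": 81.25, "per_fold": [80.0, 82.5]}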
-------------------------------------------------------------------------------- /images/fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/images/fig3.png -------------------------------------------------------------------------------- /images/fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/images/fig5.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from attrdict import AttrDict 3 | 4 | from experiment import Experiment 5 | from common import Task, GNN_TYPE, STOP 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser() 9 | parser.add_argument("--task", dest="task", default=Task.NEIGHBORS_MATCH, type=Task.from_string, choices=list(Task), 10 | required=False) 11 | parser.add_argument("--type", dest="type", default=GNN_TYPE.GCN, type=GNN_TYPE.from_string, choices=list(GNN_TYPE), 12 | required=False) 13 | parser.add_argument("--dim", dest="dim", default=32, type=int, required=False) 14 | parser.add_argument("--depth", dest="depth", default=3, type=int, required=False) 15 | parser.add_argument("--num_layers", dest="num_layers", default=None, type=int, required=False) 16 | parser.add_argument("--train_fraction", dest="train_fraction", default=0.8, type=float, required=False) 17 | parser.add_argument("--max_epochs", dest="max_epochs", default=50000, type=int, required=False) 18 | parser.add_argument("--eval_every", dest="eval_every", default=100, type=int, required=False) 19 | parser.add_argument("--batch_size", dest="batch_size", default=1024, type=int, required=False) 20 | parser.add_argument("--accum_grad", dest="accum_grad", default=1, type=int, required=False) 21 | parser.add_argument("--stop", dest="stop", default=STOP.TRAIN, type=STOP.from_string, choices=list(STOP), 22 | required=False) 23 | parser.add_argument("--patience", dest="patience", default=20, type=int, required=False) 24 | parser.add_argument("--loader_workers", dest="loader_workers", default=0, type=int, required=False) 25 | parser.add_argument('--last_layer_fully_adjacent', action='store_true') 26 | parser.add_argument('--no_layer_norm', action='store_true') 27 | parser.add_argument('--no_activation', action='store_true') 28 | parser.add_argument('--no_residual', action='store_true') 29 | parser.add_argument('--unroll', action='store_true', help='use the same weights across GNN layers') 30 | 31 | args = parser.parse_args() 32 | Experiment(args).run() 33 | 34 | 35 | def get_fake_args( 36 | task=Task.NEIGHBORS_MATCH, 37 | type=GNN_TYPE.GCN, 38 | dim=32, 39 | depth=3, 40 | num_layers=None, 41 | train_fraction=0.8, 42 | max_epochs=50000, 43 | eval_every=100, 44 | batch_size=1024, 45 | accum_grad=1, 46 | patience=20, 47 | stop=STOP.TRAIN, 48 | loader_workers=0, 49 | last_layer_fully_adjacent=False, 50 | no_layer_norm=False, 51 | no_activation=False, 52 | no_residual=False, 53 | unroll=False, 54 | ): 55 | return AttrDict({ 56 | 'task': task, 57 | 'type': type, 58 | 'dim': dim, 59 | 'depth': depth, 60 | 'num_layers': num_layers, 61 | 'train_fraction': train_fraction, 62 | 'max_epochs': max_epochs, 63 | 'eval_every': eval_every, 64 | 'batch_size': batch_size, 65 | 'accum_grad': 
accum_grad, 66 | 'stop': stop, 67 | 'patience': patience, 68 | 'loader_workers': loader_workers, 69 | 'last_layer_fully_adjacent': last_layer_fully_adjacent, 70 | 'no_layer_norm': no_layer_norm, 71 | 'no_activation': no_activation, 72 | 'no_residual': no_residual, 73 | 'unroll': unroll, 74 | }) 75 | -------------------------------------------------------------------------------- /models/graph_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class GraphModel(torch.nn.Module): 7 | def __init__(self, gnn_type, num_layers, dim0, h_dim, out_dim, last_layer_fully_adjacent, 8 | unroll, layer_norm, use_activation, use_residual): 9 | super(GraphModel, self).__init__() 10 | self.gnn_type = gnn_type 11 | self.unroll = unroll 12 | self.last_layer_fully_adjacent = last_layer_fully_adjacent 13 | self.use_layer_norm = layer_norm 14 | self.use_activation = use_activation 15 | self.use_residual = use_residual 16 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 17 | 18 | self.num_layers = num_layers 19 | self.layer0_keys = nn.Embedding(num_embeddings=dim0 + 1, embedding_dim=h_dim) 20 | self.layer0_values = nn.Embedding(num_embeddings=dim0 + 1, embedding_dim=h_dim) 21 | self.layers = nn.ModuleList() 22 | self.layer_norms = nn.ModuleList() 23 | if unroll: 24 | self.layers.append(gnn_type.get_layer( 25 | in_dim=h_dim, 26 | out_dim=h_dim)) 27 | else: 28 | for i in range(num_layers): 29 | self.layers.append(gnn_type.get_layer( 30 | in_dim=h_dim, 31 | out_dim=h_dim)) 32 | if self.use_layer_norm: 33 | for i in range(num_layers): 34 | self.layer_norms.append(nn.LayerNorm(h_dim)) 35 | 36 | self.out_dim = out_dim 37 | # self.out_layer = nn.Linear(in_features=h_dim, out_features=out_dim, bias=False) 38 | self.out_layer = nn.Linear(in_features=h_dim, out_features=out_dim + 1, bias=False) 39 | 40 | def forward(self, data): 41 | x, edge_index, batch, roots = data.x, data.edge_index, data.batch, data.root_mask 42 | 43 | x_key, x_val = x[:, 0], x[:, 1] 44 | x_key_embed = self.layer0_keys(x_key) 45 | x_val_embed = self.layer0_values(x_val) 46 | x = x_key_embed + x_val_embed 47 | 48 | for i in range(self.num_layers): 49 | if self.unroll: 50 | layer = self.layers[0] 51 | else: 52 | layer = self.layers[i] 53 | new_x = x 54 | if self.last_layer_fully_adjacent and i == self.num_layers - 1: 55 | root_indices = torch.nonzero(roots, as_tuple=False).squeeze(-1) 56 | target_roots = root_indices.index_select(dim=0, index=batch) 57 | source_nodes = torch.arange(0, data.num_nodes).to(self.device) 58 | edges = torch.stack([source_nodes, target_roots], dim=0) 59 | 60 | else: 61 | edges = edge_index 62 | new_x = layer(new_x, edges) 63 | if self.use_activation: 64 | new_x = F.relu(new_x) 65 | if self.use_residual: 66 | x = x + new_x 67 | else: 68 | x = new_x 69 | if self.use_layer_norm: 70 | x = self.layer_norms[i](x) 71 | 72 | root_nodes = x[roots] 73 | logits = self.out_layer(root_nodes) 74 | # logits = F.linear(root_nodes, self.layer0_values.weight) 75 | return logits 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict==2.0.1 2 | torch>=1.4.0 3 | torch-geometric>=1.4.2 4 | torch-scatter>=2.0.4 5 | torch-sparse>=0.6.0 6 | torchvision>=0.5.0 7 | sklearn 8 | 9 | 
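To make the last_layer_fully_adjacent option in models/graph_model.py above concrete, here is a minimal standalone sketch (toy tensors for illustration, not a file of this repository) of how the final layer's edge set is rewired so that every node sends a message directly to the root of its own graph:

import torch

# Toy batch of two trees: nodes 0-2 belong to graph 0, nodes 3-4 to graph 1.
batch = torch.tensor([0, 0, 0, 1, 1])                      # graph id per node, as in data.batch
roots = torch.tensor([True, False, False, True, False])    # root mask, as in data.root_mask

root_indices = torch.nonzero(roots, as_tuple=False).squeeze(-1)  # tensor([0, 3])
target_roots = root_indices.index_select(dim=0, index=batch)     # root node of each node's graph
source_nodes = torch.arange(0, batch.size(0))
edges = torch.stack([source_nodes, target_roots], dim=0)
# edges == tensor([[0, 1, 2, 3, 4],
#                  [0, 0, 0, 3, 3]])  -> every node is connected directly to its graph's root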
-------------------------------------------------------------------------------- /run-gat-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 2048}, 14 | 8: {'batch_size': 1024, 'accum_grad': 2}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GAT 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion, 38 | no_activation=True, no_residual=False) 39 | if depth in override_params: 40 | for key, value in AttrDict(override_params[depth]).items(): 41 | args[key] = value 42 | train_acc, test_acc, epoch = Experiment(args).run() 43 | torch.cuda.empty_cache() 44 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 45 | print() 46 | 47 | print(f'Task: {task}') 48 | print('depth, train_acc, test_acc, epoch, train_acc, test_acc, epoch,') 49 | for depth in range(min_depth, max_depth + 1): 50 | res = results_all_depths[depth] 51 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 52 | -------------------------------------------------------------------------------- /run-gcn-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 2048}, 14 | 8: {'batch_size': 1024, 'accum_grad': 2}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GCN 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion) 38 | if depth in override_params: 39 | for key, value in AttrDict(override_params[depth]).items(): 40 | args[key] = value 41 | train_acc, test_acc, epoch = Experiment(args).run() 42 | torch.cuda.empty_cache() 43 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 44 | print() 45 | 46 | print(f'Task: {task}') 47 | print('depth, train_acc, test_acc, 
epoch, train_acc, test_acc, epoch,') 48 | for depth in range(min_depth, max_depth + 1): 49 | res = results_all_depths[depth] 50 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 51 | -------------------------------------------------------------------------------- /run-ggnn-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 1024, 'accum_grad': 2}, 14 | 8: {'batch_size': 512, 'accum_grad': 4}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GGNN 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion, 38 | no_layer_norm=True, no_activation=True, no_residual=True) 39 | if depth in override_params: 40 | for key, value in AttrDict(override_params[depth]).items(): 41 | args[key] = value 42 | train_acc, test_acc, epoch = Experiment(args).run() 43 | torch.cuda.empty_cache() 44 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 45 | print() 46 | 47 | print(f'Task: {task}') 48 | print('depth, train_acc, test_acc, epoch, train_acc, test_acc, epoch,') 49 | for depth in range(min_depth, max_depth + 1): 50 | res = results_all_depths[depth] 51 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 52 | -------------------------------------------------------------------------------- /run-gin-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 1024, 'accum_grad': 2}, 14 | 8: {'batch_size': 512, 'accum_grad': 4}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GIN 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion) 38 | if depth in override_params: 39 | for key, value in AttrDict(override_params[depth]).items(): 40 | args[key] = value 41 | train_acc, 
test_acc, epoch = Experiment(args).run() 42 | torch.cuda.empty_cache() 43 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 44 | print() 45 | 46 | print(f'Task: {task}') 47 | print('depth, train_acc, test_acc, epoch, train_acc, test_acc, epoch,') 48 | for depth in range(min_depth, max_depth + 1): 49 | res = results_all_depths[depth] 50 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 51 | -------------------------------------------------------------------------------- /tasks/dictionary_lookup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import itertools 3 | import random 4 | import math 5 | 6 | from tasks.tree_dataset import TreeDataset 7 | import common 8 | 9 | 10 | class DictionaryLookupDataset(TreeDataset): 11 | def __init__(self, depth): 12 | super(DictionaryLookupDataset, self).__init__(depth) 13 | 14 | def get_combinations(self): 15 | # returns: an iterable of [key, permutation(leaves)] 16 | # number of combinations: (num_leaves!)*num_choices 17 | num_leaves = len(self.leaf_indices) 18 | num_permutations = 1000 19 | max_examples = 32000 20 | 21 | if self.depth > 3: 22 | per_depth_num_permutations = min(num_permutations, math.factorial(num_leaves), max_examples // num_leaves) 23 | permutations = [np.random.permutation(range(1, num_leaves + 1)) for _ in 24 | range(per_depth_num_permutations)] 25 | else: 26 | permutations = random.sample(list(itertools.permutations(range(1, num_leaves + 1))), 27 | min(num_permutations, math.factorial(num_leaves))) 28 | 29 | return itertools.chain.from_iterable( 30 | 31 | zip(range(1, num_leaves + 1), itertools.repeat(perm)) 32 | for perm in permutations) 33 | 34 | def get_nodes_features(self, combination): 35 | # combination: a list of indices 36 | # Each leaf contains a one-hot encoding of a key, and a one-hot encoding of the value 37 | # Every other node is empty, for now 38 | selected_key, values = combination 39 | 40 | # The root is [one-hot selected key] + [0 ... 
0] 41 | nodes = [ (selected_key, 0) ] 42 | 43 | for i in range(1, self.num_nodes): 44 | if i in self.leaf_indices: 45 | leaf_num = self.leaf_indices.index(i) 46 | node = (leaf_num+1, values[leaf_num]) 47 | else: 48 | node = (0, 0) 49 | nodes.append(node) 50 | return nodes 51 | 52 | def label(self, combination): 53 | selected_key, values = combination 54 | return int(values[selected_key - 1]) 55 | 56 | def get_dims(self): 57 | # get input and output dims 58 | in_dim = len(self.leaf_indices) 59 | out_dim = len(self.leaf_indices) 60 | return in_dim, out_dim 61 | -------------------------------------------------------------------------------- /tasks/tree_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_geometric 3 | 4 | from torch_geometric.data import Data 5 | from torch.nn import functional as F 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | class TreeDataset(object): 10 | def __init__(self, depth): 11 | super(TreeDataset, self).__init__() 12 | self.depth = depth 13 | self.num_nodes, self.edges, self.leaf_indices = self._create_blank_tree() 14 | self.criterion = F.cross_entropy 15 | 16 | def add_child_edges(self, cur_node, max_node): 17 | edges = [] 18 | leaf_indices = [] 19 | stack = [(cur_node, max_node)] 20 | while len(stack) > 0: 21 | cur_node, max_node = stack.pop() 22 | if cur_node == max_node: 23 | leaf_indices.append(cur_node) 24 | continue 25 | left_child = cur_node + 1 26 | right_child = cur_node + 1 + ((max_node - cur_node) // 2) 27 | edges.append([left_child, cur_node]) 28 | edges.append([right_child, cur_node]) 29 | stack.append((right_child, max_node)) 30 | stack.append((left_child, right_child - 1)) 31 | return edges, leaf_indices 32 | 33 | def _create_blank_tree(self): 34 | max_node_id = 2 ** (self.depth + 1) - 2 35 | edges, leaf_indices = self.add_child_edges(cur_node=0, max_node=max_node_id) 36 | return max_node_id + 1, edges, leaf_indices 37 | 38 | def create_blank_tree(self, add_self_loops=True): 39 | edge_index = torch.tensor(self.edges).t() 40 | if add_self_loops: 41 | edge_index, _ = torch_geometric.utils.add_remaining_self_loops(edge_index=edge_index, ) 42 | return edge_index 43 | 44 | def generate_data(self, train_fraction): 45 | data_list = [] 46 | 47 | for comb in self.get_combinations(): 48 | edge_index = self.create_blank_tree(add_self_loops=True) 49 | nodes = torch.tensor(self.get_nodes_features(comb), dtype=torch.long) 50 | root_mask = torch.tensor([True] + [False] * (len(nodes) - 1)) 51 | label = self.label(comb) 52 | data_list.append(Data(x=nodes, edge_index=edge_index, root_mask=root_mask, y=label)) 53 | 54 | dim0, out_dim = self.get_dims() 55 | X_train, X_test = train_test_split( 56 | data_list, train_size=train_fraction, shuffle=True, stratify=[data.y for data in data_list]) 57 | 58 | 59 | return X_train, X_test, dim0, out_dim, self.criterion 60 | 61 | # Every sub-class should implement the following methods: 62 | def get_combinations(self): 63 | raise NotImplementedError 64 | 65 | def get_nodes_features(self, combination): 66 | raise NotImplementedError 67 | 68 | def label(self, combination): 69 | raise NotImplementedError 70 | 71 | def get_dims(self): 72 | raise NotImplementedError 73 | 74 | -------------------------------------------------------------------------------- /tf-gnn-samples/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 150 3 | ignore = 4 | # W605: invalid escape 
sequence -- triggered by pseudo-LaTeX in comments 5 | W605, -------------------------------------------------------------------------------- /tf-gnn-samples/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | molecules_*.json 104 | data/* -------------------------------------------------------------------------------- /tf-gnn-samples/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## CONTRIBUTING 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 6 | 7 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | -------------------------------------------------------------------------------- /tf-gnn-samples/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /tf-gnn-samples/gnns/__init__.py: -------------------------------------------------------------------------------- 1 | from .ggnn import sparse_ggnn_layer 2 | from .gnn_edge_mlp import sparse_gnn_edge_mlp_layer 3 | from .gnn_film import sparse_gnn_film_layer 4 | from .rgat import sparse_rgat_layer 5 | from .rgcn import sparse_rgcn_layer 6 | from .rgdcn import sparse_rgdcn_layer 7 | from .rgin import sparse_rgin_layer 8 | -------------------------------------------------------------------------------- /tf-gnn-samples/gnns/ggnn.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import tensorflow as tf 4 | 5 | from utils import get_gated_unit, get_aggregation_function 6 | 7 | 8 | def sparse_ggnn_layer(node_embeddings: tf.Tensor, 9 | adjacency_lists: List[tf.Tensor], 10 | state_dim: Optional[int], 11 | num_timesteps: int = 1, 12 | gated_unit_type: str = "gru", 13 | activation_function: str = "tanh", 14 | message_aggregation_function: str = "sum" 15 | ) -> tf.Tensor: 16 | """ 17 | Compute new graph states by neural message passing and gated units on the nodes. 18 | For this, we assume existing node states h^t_v and a list of per-edge-type adjacency 19 | matrices A_\ell. 20 | 21 | We compute new states as follows: 22 | h^{t+1}_v := Cell(h^t_v, \sum_\ell 23 | \sum_{(u, v) \in A_\ell} 24 | W_\ell * h^t_u) 25 | The learnable parameters of this are the recurrent Cell and the W_\ell \in R^{D,D}. 26 | 27 | We use the following abbreviations in shape descriptions: 28 | * V: number of nodes 29 | * D: state dimension 30 | * L: number of different edge types 31 | * E: number of edges of a given edge type 32 | 33 | Arguments: 34 | node_embeddings: float32 tensor of shape [V, D], the original representation of 35 | each node in the graph. 36 | adjacency_lists: List of L adjacency lists, represented as int32 tensors of shape 37 | [E, 2]. Concretely, adjacency_lists[l][k,:] == [v, u] means that the k-th edge 38 | of type l connects node v to node u. 39 | state_dim: Optional size of output dimension of the GNN layer. If not set, defaults 40 | to D, the dimensionality of the input. If different from the input dimension, 41 | parameter num_timesteps has to be 1. 42 | num_timesteps: Number of repeated applications of this message passing layer. 
43 | gated_unit_type: Type of the recurrent unit used (one of RNN, GRU and LSTM). 44 | activation_function: Type of activation function used. 45 | message_aggregation_function: Type of aggregation function used for messages. 46 | 47 | Returns: 48 | float32 tensor of shape [V, state_dim] 49 | """ 50 | num_nodes = tf.shape(node_embeddings, out_type=tf.int32)[0] 51 | if state_dim is None: 52 | state_dim = tf.shape(node_embeddings, out_type=tf.int32)[1] 53 | 54 | # === Prepare things we need across all timesteps: 55 | message_aggregation_fn = get_aggregation_function(message_aggregation_function) 56 | gated_cell = get_gated_unit(state_dim, gated_unit_type, activation_function) 57 | edge_type_to_message_transformation_layers = [] # Layers to compute the message from a source state 58 | edge_type_to_message_targets = [] # List of tensors of message targets 59 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(adjacency_lists): 60 | edge_type_to_message_transformation_layers.append( 61 | tf.keras.layers.Dense(units=state_dim, 62 | use_bias=False, 63 | activation=None, 64 | name="Edge_%i_Weight" % edge_type_idx)) 65 | edge_type_to_message_targets.append(adjacency_list_for_edge_type[:, 1]) 66 | 67 | # Let M be the number of messages (sum of all E): 68 | message_targets = tf.concat(edge_type_to_message_targets, axis=0) # Shape [M] 69 | 70 | cur_node_states = node_embeddings 71 | for _ in range(num_timesteps): 72 | messages = [] # list of tensors of messages of shape [E, D] 73 | message_source_states = [] # list of tensors of edge source states of shape [E, D] 74 | 75 | # Collect incoming messages per edge type 76 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(adjacency_lists): 77 | edge_sources = adjacency_list_for_edge_type[:, 0] 78 | edge_source_states = tf.nn.embedding_lookup(params=cur_node_states, 79 | ids=edge_sources) # Shape [E, D] 80 | all_messages_for_edge_type = \ 81 | edge_type_to_message_transformation_layers[edge_type_idx](edge_source_states) # Shape [E,D] 82 | messages.append(all_messages_for_edge_type) 83 | message_source_states.append(edge_source_states) 84 | 85 | messages = tf.concat(messages, axis=0) # Shape [M, D] 86 | aggregated_messages = \ 87 | message_aggregation_fn(data=messages, 88 | segment_ids=message_targets, 89 | num_segments=num_nodes) # Shape [V, D] 90 | 91 | # pass updated vertex features into RNN cell 92 | new_node_states = gated_cell(aggregated_messages, [cur_node_states])[0] # Shape [V, D] 93 | cur_node_states = new_node_states 94 | 95 | return cur_node_states 96 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_graph_model import Sparse_Graph_Model 2 | from .ggnn_model import GGNN_Model 3 | from .gnn_edge_mlp_model import GNN_Edge_MLP_Model 4 | from .gnn_film_model import GNN_FiLM_Model 5 | from .rgat_model import RGAT_Model 6 | from .rgcn_model import RGCN_Model 7 | from .rgdcn_model import RGDCN_Model 8 | from .rgin_model import RGIN_Model 9 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/ggnn_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_ggnn_layer 8 | 9 | 10 | class 
GGNN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | 'graph_rnn_cell': 'GRU', # RNN, GRU, or LSTM 17 | 'graph_activation_function': "tanh", 18 | "message_aggregation_function": "sum", 19 | 'graph_layer_input_dropout_keep_prob': 1.0, 20 | 'graph_dense_between_every_num_gnn_layers': 10000, 21 | 'graph_residual_connection_every_num_layers': 10000, 22 | }) 23 | return params 24 | 25 | @staticmethod 26 | def name(params: Dict[str, Any]) -> str: 27 | return "GGNN" 28 | 29 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 30 | super().__init__(params, task, run_id, result_dir) 31 | 32 | def _apply_gnn_layer(self, 33 | node_representations: tf.Tensor, 34 | adjacency_lists: List[tf.Tensor], 35 | type_to_num_incoming_edges: tf.Tensor, 36 | num_timesteps: int) -> tf.Tensor: 37 | return sparse_ggnn_layer( 38 | node_embeddings=node_representations, 39 | adjacency_lists=adjacency_lists, 40 | state_dim=self.params['hidden_size'], 41 | num_timesteps=num_timesteps, 42 | gated_unit_type=self.params['graph_rnn_cell'], 43 | activation_function=self.params['graph_activation_function'], 44 | message_aggregation_function=self.params['message_aggregation_function'], 45 | ) 46 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/gnn_edge_mlp_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_gnn_edge_mlp_layer 8 | 9 | 10 | class GNN_Edge_MLP_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'max_nodes_in_batch': 25000, 16 | 'hidden_size': 128, 17 | "graph_activation_function": "gelu", 18 | "message_aggregation_function": "sum", 19 | 'graph_inter_layer_norm': True, 20 | 'use_target_state_as_input': True, 21 | 'num_edge_hidden_layers': 1, 22 | }) 23 | return params 24 | 25 | @staticmethod 26 | def name(params: Dict[str, Any]) -> str: 27 | return "GNN-Edge-MLP%i" % (params['num_edge_hidden_layers']) 28 | 29 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 30 | super().__init__(params, task, run_id, result_dir) 31 | 32 | def _apply_gnn_layer(self, 33 | node_representations: tf.Tensor, 34 | adjacency_lists: List[tf.Tensor], 35 | type_to_num_incoming_edges: tf.Tensor, 36 | num_timesteps: int, 37 | ) -> tf.Tensor: 38 | return sparse_gnn_edge_mlp_layer( 39 | node_embeddings=node_representations, 40 | adjacency_lists=adjacency_lists, 41 | type_to_num_incoming_edges=type_to_num_incoming_edges, 42 | state_dim=self.params['hidden_size'], 43 | num_timesteps=num_timesteps, 44 | activation_function=self.params['graph_activation_function'], 45 | message_aggregation_function=self.params['message_aggregation_function'], 46 | use_target_state_as_input=self.params['use_target_state_as_input'], 47 | num_edge_hidden_layers=self.params['num_edge_hidden_layers'] 48 | ) 49 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/gnn_film_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import 
tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_gnn_film_layer 8 | 9 | 10 | class GNN_FiLM_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | "hidden_size": 128, 16 | "graph_activation_function": "ReLU", 17 | "message_aggregation_function": "sum", 18 | "normalize_messages_by_num_incoming": False, 19 | }) 20 | return params 21 | 22 | @staticmethod 23 | def name(params: Dict[str, Any]) -> str: 24 | return "GNN-FiLM" 25 | 26 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 27 | super().__init__(params, task, run_id, result_dir) 28 | 29 | def _apply_gnn_layer(self, 30 | node_representations: tf.Tensor, 31 | adjacency_lists: List[tf.Tensor], 32 | type_to_num_incoming_edges: tf.Tensor, 33 | num_timesteps: int) -> tf.Tensor: 34 | 35 | return sparse_gnn_film_layer( 36 | node_embeddings=node_representations, 37 | adjacency_lists=adjacency_lists, 38 | type_to_num_incoming_edges=type_to_num_incoming_edges, 39 | state_dim=self.params['hidden_size'], 40 | num_timesteps=num_timesteps, 41 | activation_function=self.params['graph_activation_function'], 42 | message_aggregation_function=self.params['message_aggregation_function'], 43 | normalize_by_num_incoming=self.params["normalize_messages_by_num_incoming"], 44 | ) 45 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/no_struct_mlp_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from utils import MLP 6 | from .sparse_graph_model import Sparse_Graph_Model 7 | from tasks import Sparse_Graph_Task 8 | from gnns import sparse_gnn_edge_mlp_layer 9 | 10 | 11 | class No_Struct_MLP_Model(Sparse_Graph_Model): 12 | @classmethod 13 | def default_params(cls): 14 | params = super().default_params() 15 | params.update({ 16 | 'max_nodes_in_batch': 25000, 17 | 'hidden_size': 128, 18 | "graph_activation_function": "gelu", 19 | "message_aggregation_function": "sum", 20 | 'graph_inter_layer_norm': True, 21 | 'use_target_state_as_input': True, 22 | 'num_edge_hidden_layers': 0, 23 | }) 24 | return params 25 | 26 | @staticmethod 27 | def name(params: Dict[str, Any]) -> str: 28 | return "NoStruct-MLP%i" % (params['num_edge_hidden_layers']) 29 | 30 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 31 | super().__init__(params, task, run_id, result_dir) 32 | 33 | def _apply_gnn_layer(self, 34 | node_representations: tf.Tensor, 35 | adjacency_lists: List[tf.Tensor], 36 | type_to_num_incoming_edges: tf.Tensor, 37 | num_timesteps: int 38 | ) -> tf.Tensor: 39 | graph_to_nodes = self._Sparse_Graph_Model__placeholders['graph_to_nodes'] 40 | graph_nodes_list = self._Sparse_Graph_Model__placeholders['graph_nodes_list'] # (None, ) 41 | max_nodes = tf.shape(graph_to_nodes)[1] 42 | tiled_nodes = tf.tile(tf.expand_dims(graph_to_nodes, axis=-1), (1, 1, max_nodes)) 43 | pairs = tf.concat( 44 | [tf.expand_dims(tiled_nodes, axis=-1), tf.expand_dims(tf.transpose(tiled_nodes, [0, 2, 1]), axis=-1)], 45 | axis=-1) 46 | flat_pairs = tf.reshape(pairs, [-1, 2]) 47 | relevant_edges = tf.reshape(tf.gather(flat_pairs, tf.where(tf.reduce_min(flat_pairs, axis=-1) >= 0)), [-1, 2]) 48 | 49 | num_types = 
tf.shape(type_to_num_incoming_edges)[0] 50 | num_nodes_in_graph = tf.reduce_sum(tf.cast(tf.greater(graph_to_nodes, -1), dtype=tf.float32), axis=-1) 51 | num_incoming_nodes_per_node = tf.gather(params=num_nodes_in_graph, indices=graph_nodes_list) 52 | type_to_num_incoming_edges = tf.tile(tf.expand_dims(num_incoming_nodes_per_node, axis=0), [num_types, 1]) 53 | 54 | return sparse_gnn_edge_mlp_layer( 55 | node_embeddings=node_representations, 56 | adjacency_lists=[relevant_edges for _ in adjacency_lists], 57 | type_to_num_incoming_edges=type_to_num_incoming_edges, 58 | state_dim=self.params['hidden_size'], 59 | num_timesteps=num_timesteps, 60 | activation_function=self.params['graph_activation_function'], 61 | message_aggregation_function=self.params['message_aggregation_function'], 62 | use_target_state_as_input=self.params['use_target_state_as_input'], 63 | num_edge_hidden_layers=self.params['num_edge_hidden_layers'], 64 | ) 65 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgat_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgat_layer 8 | 9 | 10 | class RGAT_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | 'num_heads': 4, 17 | 'graph_activation_function': 'tanh', 18 | 'graph_layer_input_dropout_keep_prob': 1.0, 19 | 'graph_dense_between_every_num_gnn_layers': 10000, 20 | 'graph_residual_connection_every_num_layers': 10000, 21 | }) 22 | return params 23 | 24 | @staticmethod 25 | def name(params: Dict[str, Any]) -> str: 26 | return "RGAT" 27 | 28 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 29 | super().__init__(params, task, run_id, result_dir) 30 | 31 | def _apply_gnn_layer(self, 32 | node_representations: tf.Tensor, 33 | adjacency_lists: List[tf.Tensor], 34 | type_to_num_incoming_edges: tf.Tensor, 35 | num_timesteps: int) -> tf.Tensor: 36 | return sparse_rgat_layer( 37 | node_embeddings=node_representations, 38 | adjacency_lists=adjacency_lists, 39 | state_dim=self.params['hidden_size'], 40 | num_timesteps=num_timesteps, 41 | num_heads=self.params['num_heads'], 42 | activation_function=self.params['graph_activation_function'], 43 | ) 44 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgcn_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgcn_layer 8 | 9 | 10 | class RGCN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | "graph_activation_function": "ReLU", 17 | "message_aggregation_function": "sum", 18 | 'graph_layer_input_dropout_keep_prob': 1.0, 19 | 'graph_dense_between_every_num_gnn_layers': 10000, 20 | 'graph_residual_connection_every_num_layers': 10000, 21 | }) 22 | return params 23 | 24 | @staticmethod 25 | def name(params: Dict[str, Any]) -> str: 26 | return "RGCN" 27 | 28 | def __init__(self, params: 
Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 29 | super().__init__(params, task, run_id, result_dir) 30 | 31 | def _apply_gnn_layer(self, 32 | node_representations: tf.Tensor, 33 | adjacency_lists: List[tf.Tensor], 34 | type_to_num_incoming_edges: tf.Tensor, 35 | num_timesteps: int) -> tf.Tensor: 36 | return sparse_rgcn_layer( 37 | node_embeddings=node_representations, 38 | adjacency_lists=adjacency_lists, 39 | type_to_num_incoming_edges=type_to_num_incoming_edges, 40 | state_dim=self.params['hidden_size'], 41 | num_timesteps=num_timesteps, 42 | activation_function=self.params['graph_activation_function'], 43 | message_aggregation_function=self.params['message_aggregation_function'] 44 | ) 45 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgdcn_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgdcn_layer 8 | 9 | 10 | class RGDCN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'max_nodes_in_batch': 25000, 16 | 'hidden_size': 128, 17 | 'num_channels': 8, 18 | "use_full_state_for_channel_weights": False, 19 | "tie_channel_weights": False, 20 | "graph_activation_function": "ReLU", 21 | "message_aggregation_function": "sum", 22 | 'graph_inter_layer_norm': True, 23 | }) 24 | return params 25 | 26 | @staticmethod 27 | def name(params: Dict[str, Any]) -> str: 28 | return "RGDCN" 29 | 30 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 31 | params['channel_dim'] = params['hidden_size'] // params['num_channels'] 32 | super().__init__(params, task, run_id, result_dir) 33 | 34 | def _apply_gnn_layer(self, 35 | node_representations: tf.Tensor, 36 | adjacency_lists: List[tf.Tensor], 37 | type_to_num_incoming_edges: tf.Tensor, 38 | num_timesteps: int) -> tf.Tensor: 39 | return sparse_rgdcn_layer( 40 | node_embeddings=node_representations, 41 | adjacency_lists=adjacency_lists, 42 | type_to_num_incoming_edges=type_to_num_incoming_edges, 43 | num_channels=self.params['num_channels'], 44 | channel_dim=self.params['channel_dim'], 45 | num_timesteps=num_timesteps, 46 | use_full_state_for_channel_weights=self.params['use_full_state_for_channel_weights'], 47 | tie_channel_weights=self.params['tie_channel_weights'], 48 | activation_function=self.params['graph_activation_function'], 49 | message_aggregation_function=self.params['message_aggregation_function'], 50 | ) 51 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgin_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgin_layer 8 | 9 | 10 | class RGIN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | "graph_activation_function": "ReLU", 17 | 'message_aggregation_function': "sum", 18 | 'graph_dense_between_every_num_gnn_layers': 10000, 19 | 'graph_inter_layer_norm': True, 20 | 
'use_target_state_as_input': False, 21 | 'graph_num_edge_MLP_hidden_layers': 1, 22 | 'graph_num_aggr_MLP_hidden_layers': None, 23 | }) 24 | return params 25 | 26 | @staticmethod 27 | def name(params: Dict[str, Any]) -> str: 28 | return "RGIN" 29 | 30 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 31 | super().__init__(params, task, run_id, result_dir) 32 | 33 | def _apply_gnn_layer(self, 34 | node_representations: tf.Tensor, 35 | adjacency_lists: List[tf.Tensor], 36 | type_to_num_incoming_edges: tf.Tensor, 37 | num_timesteps: int 38 | ) -> tf.Tensor: 39 | return sparse_rgin_layer( 40 | node_embeddings=node_representations, 41 | adjacency_lists=adjacency_lists, 42 | state_dim=self.params['hidden_size'], 43 | num_timesteps=num_timesteps, 44 | activation_function=self.params['graph_activation_function'], 45 | message_aggregation_function=self.params['message_aggregation_function'], 46 | use_target_state_as_input=self.params['use_target_state_as_input'], 47 | num_edge_MLP_hidden_layers=self.params['graph_num_edge_MLP_hidden_layers'], 48 | num_aggr_MLP_hidden_layers=self.params['graph_num_aggr_MLP_hidden_layers'], 49 | ) 50 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/self_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class SelfAttention: 6 | def __init__(self, 7 | num_heads, 8 | model_dim, 9 | dropout_keep_prob): 10 | 11 | self.num_heads = num_heads 12 | self.model_dim = model_dim 13 | self.dropout_keep_prob = dropout_keep_prob 14 | 15 | def multi_head(self, batched_inputs, valid_mask=None): 16 | q = self._linear_projection(batched_inputs) 17 | qs = self._split_heads(q) 18 | tiled_inputs = tf.tile(tf.expand_dims(batched_inputs, axis=1), [1, self.num_heads, 1, 1]) 19 | outputs = self._scaled_dot_product(qs, tiled_inputs, tiled_inputs, valid_mask) # (batch, num_heads, max_contexts, value_dim) 20 | output = self._concat_heads(outputs) # (batch, max_contexts, value_dim * num_heads) 21 | output = tf.layers.dense(output, units=self.model_dim, use_bias=False, 22 | activation=tf.nn.relu) # (batch, max_contexts, model_dim) 23 | 24 | output = tf.nn.dropout(output, keep_prob=self.dropout_keep_prob) 25 | return output 26 | 27 | def _linear_projection(self, batched_inputs): 28 | q = tf.layers.dense(batched_inputs, units=self.model_dim * self.num_heads, 29 | use_bias=False) # (batch, max_contexts, key_dim * num_heads) 30 | # k = tf.layers.dense(batched_inputs, units=self.model_dim, 31 | # use_bias=False) # (batch, max_contexts, key_dim * num_heads) 32 | return q 33 | 34 | def _split_heads(self, q): 35 | 36 | def split_last_dimension_then_transpose(tensor, num_heads, dim): 37 | tensor = tf.reshape(tensor, [-1, tf.shape(tensor)[1], num_heads, 38 | dim]) # (batch, max_contexts, num_heads, dim) 39 | return tf.transpose(tensor, [0, 2, 1, 3]) # (batch, num_heads, max_contexts, dim) 40 | 41 | qs = split_last_dimension_then_transpose(q, self.num_heads, 42 | self.model_dim) # (batch, num_heads, max_contexts, key_dim) 43 | # ks = split_last_dimension_then_transpose(k, self.num_heads, 44 | # self.model_dim) # (batch, num_heads, max_contexts, key_dim) 45 | return qs 46 | 47 | def _scaled_dot_product(self, qs, ks, tiled_inputs, valid_mask): 48 | queries_dot_keys = tf.matmul(qs, ks, transpose_b=True) # (batch, num_heads, max_contexts, max_contexts) 49 | scaled_scores = queries_dot_keys #/ 
((self.model_dim // self.num_heads) ** 0.5) # (batch, num_heads, max_contexts, max_contexts) 50 | 51 | if valid_mask is not None: 52 | mask = tf.log(tf.reshape(valid_mask, ( 53 | tf.shape(valid_mask)[0], 1, 1, tf.shape(valid_mask)[1]))) # (batch, 1, 1, max_contexts) 54 | scaled_scores += mask 55 | 56 | attention_weights = tf.nn.softmax(scaled_scores, axis=-1) # (batch, num_heads, max_contexts, max_contexts) 57 | return tf.matmul(attention_weights, tiled_inputs) # (batch, num_heads, max_contexts, value_dim) 58 | 59 | def _concat_heads(self, outputs): 60 | # outputs: (batch, num_heads, max_contexts, value_dim) 61 | max_contexts = tf.shape(outputs)[2] 62 | tensor = tf.transpose(outputs, [0, 2, 1, 3]) # [batch, max_contexts, num_heads, value_dim // num_heads] 63 | return tf.reshape(tensor, [-1, max_contexts, self.model_dim * self.num_heads]) 64 | 65 | 66 | if __name__ == '__main__': 67 | sess = tf.InteractiveSession() 68 | selfatt = SelfAttention(num_heads=2, model_dim=4, dropout_keep_prob=1.0) 69 | result_op = selfatt.multi_head(tf.constant(np.arange(24).reshape((2, 3, 4)), dtype=tf.float32)) 70 | sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())) 71 | result = sess.run(result_op) 72 | print(result.shape) 73 | print(result) -------------------------------------------------------------------------------- /tf-gnn-samples/requirements.txt: -------------------------------------------------------------------------------- 1 | docopt 2 | numpy 3 | dpu-utils>=0.1.30 4 | tensorflow-gpu>=1.13.1 -------------------------------------------------------------------------------- /tf-gnn-samples/run_ppi_benchs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_ppi_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 10] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | 21 | TEST_RES_RE = re.compile('^Metrics: Avg MicroF1: (0.\d+)') 22 | TIME_RE = re.compile('^Training took (\d+)s') 23 | 24 | 25 | def run(args): 26 | target_dir = args['LOG_TARGET_DIR'] 27 | os.makedirs(target_dir, exist_ok=True) 28 | print("Starting PPI experiments, will write logfiles for runs into %s." % target_dir) 29 | num_seeds = int(args.get('--num-runs')) 30 | print("| %- 13s | %- 17s | %- 10s |" % ("Model", "Avg. MicroF1", "Avg. 
Time")) 31 | print("|" + "-" * 15 + "|" + "-" * 19 + "|" + "-" * 12 + "|") 32 | for model in MODEL_TYPES: 33 | model_f1s = [] 34 | model_times = [] 35 | for seed in range(1, 1 + num_seeds): 36 | logfile = os.path.join(target_dir, "%s_seed%i.txt" % (model.lower(), seed)) 37 | with open(logfile, "w") as log_fh: 38 | subprocess.check_call(["python", 39 | "train.py", 40 | "--quiet", 41 | "--run-test", 42 | model, 43 | "PPI", 44 | "--model-param-overrides", 45 | "{\"random_seed\": %i}" % seed, 46 | ], 47 | stdout=log_fh, 48 | stderr=log_fh) 49 | with open(logfile, "r") as log_fh: 50 | for line in log_fh.readlines(): 51 | time_match = TIME_RE.search(line) 52 | res_match = TEST_RES_RE.search(line) 53 | if time_match is not None: 54 | model_times.append(int(time_match.groups()[0])) 55 | elif res_match is not None: 56 | model_f1s.append(float(res_match.groups()[0])) 57 | 58 | print("| %- 13s | %.3f (+/- %.3f) | % 4.1f |" 59 | % (model, 60 | np.mean(model_f1s), 61 | np.std(model_f1s), 62 | np.mean(model_times))) 63 | 64 | 65 | if __name__ == "__main__": 66 | args = docopt(__doc__) 67 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 68 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_qm9_benchs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_qm9_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | TASKS = ["mu", "alpha", "HOMO", "LUMO", "gap", "R2", "ZPVE", "U0", "U", "H", "G", "Cv", "Omega"] 21 | 22 | TEST_RES_RE = re.compile('^Metrics: MAEs: \d+:([0-9.]+) \| Error Ratios: \d+:([0-9.]+)') 23 | TIME_RE = re.compile('^Training took (\d+)s') 24 | 25 | 26 | def run(args): 27 | target_dir = args['LOG_TARGET_DIR'] 28 | os.makedirs(target_dir, exist_ok=True) 29 | print("Starting QM9 experiments, will write logfiles for runs into %s." 
% target_dir) 30 | num_seeds = int(args.get('--num-runs')) 31 | results = {} 32 | for model in MODEL_TYPES: 33 | results[model] = [{"test_errors": [], "times": []} for _ in TASKS] 34 | for task_id in range(len(TASKS)): 35 | for seed in range(1, 1 + num_seeds): 36 | logfile = os.path.join(target_dir, "%s_task%i_seed%i.txt" % (model, task_id, seed)) 37 | with open(logfile, "w") as log_fh: 38 | subprocess.check_call(["python", 39 | "train.py", 40 | "--run-test", 41 | model, 42 | "QM9", 43 | "--model-param-overrides", 44 | "{\"random_seed\": %i}" % seed, 45 | "--task-param-overrides", 46 | "{\"task_ids\": [%i]}" % task_id, 47 | ], 48 | stdout=log_fh, 49 | stderr=log_fh) 50 | with open(logfile, "r") as log_fh: 51 | for line in log_fh.readlines(): 52 | time_match = TIME_RE.search(line) 53 | res_match = TEST_RES_RE.search(line) 54 | if time_match is not None: 55 | results[model][task_id]["times"].append(int(time_match.groups()[0])) 56 | elif res_match is not None: 57 | results[model][task_id]["test_errors"].append(float(res_match.groups()[1])) 58 | 59 | row_fmt_string = "%7s " + "&% 35s " * len(MODEL_TYPES) + "\\\\" 60 | print(row_fmt_string % tuple([""] + MODEL_TYPES)) 61 | for task_id, task in enumerate(TASKS): 62 | model_results = [] 63 | for model in MODEL_TYPES: 64 | err = np.mean(results[model][task_id]["test_errors"]) 65 | std = np.std(results[model][task_id]["test_errors"]) 66 | time_in_min = np.mean(results[model][task_id]["times"]) / 60 67 | model_results.append("%.2f & ($\pm %.2f$; $%.1f$min)" % (err, std, time_in_min)) 68 | print(row_fmt_string % tuple([task] + model_results)) 69 | 70 | 71 | if __name__ == "__main__": 72 | args = docopt(__doc__) 73 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 74 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_qm9_benchs_fa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_qm9_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | #MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | MODEL_TYPES = ["GNN-Edge-MLP0"] #, "RGAT", "GNN_FiLM"] # ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 21 | TASKS = ["mu", "alpha", "HOMO", "LUMO", "gap", "R2", "ZPVE", "U0", "U", "H", "G", "Cv", "Omega"] 22 | 23 | TEST_RES_RE = re.compile('^Metrics: MAEs: \d+:([0-9.]+) \| Error Ratios: \d+:([0-9.]+)') 24 | TIME_RE = re.compile('^Training took (\d+)s') 25 | 26 | 27 | def run(args): 28 | target_dir = args['LOG_TARGET_DIR'] 29 | os.makedirs(target_dir, exist_ok=True) 30 | print("Starting QM9 experiments, will write logfiles for runs into %s." 
% target_dir) 31 | num_seeds = int(args.get('--num-runs')) 32 | results = {} 33 | for model in MODEL_TYPES: 34 | results[model] = [{"test_errors": [], "times": []} for _ in TASKS] 35 | for task_id in range(len(TASKS)): 36 | for seed in range(1, 1 + num_seeds): 37 | logfile = os.path.join(target_dir, "%s_task%i_seed%i.txt" % (model, task_id, seed)) 38 | with open(logfile, "w") as log_fh: 39 | subprocess.check_call(["python", 40 | "train.py", 41 | "--run-test", 42 | model, 43 | "QM9", 44 | "--model-param-overrides", 45 | "{\"random_seed\": %i,\"last_layer_fa\":true,\"max_nodes_in_batch\":30000}" % seed, 46 | "--task-param-overrides", 47 | "{\"task_ids\": [%i]}" % task_id, 48 | ], 49 | stdout=log_fh, 50 | stderr=log_fh) 51 | with open(logfile, "r") as log_fh: 52 | for line in log_fh.readlines(): 53 | time_match = TIME_RE.search(line) 54 | res_match = TEST_RES_RE.search(line) 55 | if time_match is not None: 56 | results[model][task_id]["times"].append(int(time_match.groups()[0])) 57 | elif res_match is not None: 58 | results[model][task_id]["test_errors"].append(float(res_match.groups()[1])) 59 | 60 | row_fmt_string = "%7s " + "&% 35s " * len(MODEL_TYPES) + "\\\\" 61 | print(row_fmt_string % tuple([""] + MODEL_TYPES)) 62 | for task_id, task in enumerate(TASKS): 63 | model_results = [] 64 | for model in MODEL_TYPES: 65 | err = np.mean(results[model][task_id]["test_errors"]) 66 | std = np.std(results[model][task_id]["test_errors"]) 67 | time_in_min = np.mean(results[model][task_id]["times"]) / 60 68 | model_results.append("%.2f & ($\pm %.2f$; $%.1f$min)" % (err, std, time_in_min)) 69 | print(row_fmt_string % tuple([task] + model_results)) 70 | 71 | 72 | if __name__ == "__main__": 73 | args = docopt(__doc__) 74 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 75 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_varmisuse_benchs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_varmisuse_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | 21 | TEST_RES_RE = re.compile('^Metrics: Accuracy: (0.\d+)') 22 | VALID_RES_RE = re.compile('Best validation results: Accuracy: (0.\d+)') 23 | MODEL_FILE_RE = re.compile('^Loading model from file (.+)\.') 24 | 25 | 26 | def run(args): 27 | target_dir = args['LOG_TARGET_DIR'] 28 | os.makedirs(target_dir, exist_ok=True) 29 | print("Starting VarMisuse experiments, will write logfiles for runs into %s." 
% target_dir) 30 | num_seeds = int(args.get('--num-runs')) 31 | print("| %- 14s | %- 17s | %- 17s | %- 17s |" % ("Model", 32 | "Valid Acc", 33 | "Test Acc", 34 | "TestOnly Acc")) 35 | print("|" + "-" * 16 + "|" + "-" * 19 + "|" + "-" * 19 + "|" + "-" * 19 + "|") 36 | for model in MODEL_TYPES: 37 | valid_accs, test_accs, testonly_accs = [], [], [] 38 | for seed in range(1, 1 + num_seeds): 39 | logfile = os.path.join(target_dir, "%s_seed%i.txt" % (model.lower(), seed)) 40 | test_logfile = os.path.join(target_dir, "%s_seed%i-testonly.txt" % (model.lower(), seed)) 41 | with open(logfile, "w") as log_fh: 42 | subprocess.check_call(["python", 43 | "train.py", 44 | "--quiet", 45 | "--run-test", 46 | model, 47 | "VarMisuse", 48 | "--model-param-overrides", 49 | "{\"random_seed\": %i}" % seed, 50 | ], 51 | stdout=log_fh, 52 | stderr=log_fh) 53 | model_file = None 54 | with open(logfile, "r") as log_fh: 55 | for line in log_fh.readlines(): 56 | valid_res_match = VALID_RES_RE.search(line) 57 | test_res_match = TEST_RES_RE.search(line) 58 | model_file_match = MODEL_FILE_RE.search(line) 59 | if valid_res_match is not None: 60 | valid_accs.append(float(valid_res_match.groups()[0])) 61 | elif test_res_match is not None: 62 | test_accs.append(float(test_res_match.groups()[0])) 63 | elif model_file_match is not None: 64 | model_file = model_file_match.groups()[0] 65 | 66 | # Run TestOnly 67 | assert model_file is not None, "Could not find saved model file" 68 | with open(test_logfile, "w") as log_fh: 69 | subprocess.check_call(["python", 70 | "test.py", 71 | "--quiet", 72 | model_file, 73 | "data/varmisuse/graphs-testonly", 74 | ], 75 | stdout=log_fh, 76 | stderr=log_fh) 77 | with open(test_logfile, "r") as log_fh: 78 | for line in log_fh.readlines(): 79 | test_res_match = TEST_RES_RE.search(line) 80 | if test_res_match is not None: 81 | testonly_accs.append(float(test_res_match.groups()[0])) 82 | 83 | print("| %- 14s | %.3f (+/- %.3f) | %.3f (+/- %.3f) | %.3f (+/- %.3f) |" 84 | % (model, 85 | np.mean(valid_accs), 86 | np.std(valid_accs), 87 | np.mean(test_accs), 88 | np.std(test_accs), 89 | np.mean(testonly_accs), 90 | np.std(testonly_accs), 91 | )) 92 | 93 | 94 | if __name__ == "__main__": 95 | args = docopt(__doc__) 96 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 97 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_varmisuse_benchs_fa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_varmisuse_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | #MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | MODEL_TYPES = ["RGAT"] 21 | 22 | TEST_RES_RE = re.compile('^Metrics: Accuracy: (0.\d+)') 23 | VALID_RES_RE = re.compile('Best validation results: Accuracy: (0.\d+)') 24 | MODEL_FILE_RE = re.compile('^Loading model from file (.+)\.') 25 | 26 | 27 | def run(args): 28 | target_dir = args['LOG_TARGET_DIR'] 29 | os.makedirs(target_dir, exist_ok=True) 30 | print("Starting VarMisuse experiments, will write logfiles for runs into %s." 
% target_dir) 31 | num_seeds = int(args.get('--num-runs')) 32 | print("| %- 14s | %- 17s | %- 17s | %- 17s |" % ("Model", 33 | "Valid Acc", 34 | "Test Acc", 35 | "TestOnly Acc")) 36 | print("|" + "-" * 16 + "|" + "-" * 19 + "|" + "-" * 19 + "|" + "-" * 19 + "|") 37 | for model in MODEL_TYPES: 38 | valid_accs, test_accs, testonly_accs = [], [], [] 39 | for seed in range(1, 1 + num_seeds): 40 | logfile = os.path.join(target_dir, "%s_seed%i.txt" % (model.lower(), seed)) 41 | test_logfile = os.path.join(target_dir, "%s_seed%i-testonly.txt" % (model.lower(), seed)) 42 | with open(logfile, "w") as log_fh: 43 | subprocess.check_call(["python", 44 | "train.py", 45 | "--quiet", 46 | "--run-test", 47 | model, 48 | "VarMisuse", 49 | "--model-param-overrides", 50 | "{\"random_seed\": %i,\"last_layer_fa\":true}" % seed, 51 | ], 52 | stdout=log_fh, 53 | stderr=log_fh) 54 | model_file = None 55 | with open(logfile, "r") as log_fh: 56 | for line in log_fh.readlines(): 57 | valid_res_match = VALID_RES_RE.search(line) 58 | test_res_match = TEST_RES_RE.search(line) 59 | model_file_match = MODEL_FILE_RE.search(line) 60 | if valid_res_match is not None: 61 | valid_accs.append(float(valid_res_match.groups()[0])) 62 | elif test_res_match is not None: 63 | test_accs.append(float(test_res_match.groups()[0])) 64 | elif model_file_match is not None: 65 | model_file = model_file_match.groups()[0] 66 | 67 | # Run TestOnly 68 | assert model_file is not None, "Could not find saved model file" 69 | with open(test_logfile, "w") as log_fh: 70 | subprocess.check_call(["python", 71 | "test.py", 72 | "--quiet", 73 | model_file, 74 | "data/varmisuse/graphs-testonly", 75 | ], 76 | stdout=log_fh, 77 | stderr=log_fh) 78 | with open(test_logfile, "r") as log_fh: 79 | for line in log_fh.readlines(): 80 | test_res_match = TEST_RES_RE.search(line) 81 | if test_res_match is not None: 82 | testonly_accs.append(float(test_res_match.groups()[0])) 83 | 84 | print("| %- 14s | %.3f (+/- %.3f) | %.3f (+/- %.3f) | %.3f (+/- %.3f) |" 85 | % (model, 86 | np.mean(valid_accs), 87 | np.std(valid_accs), 88 | np.mean(test_accs), 89 | np.std(test_accs), 90 | np.mean(testonly_accs), 91 | np.std(testonly_accs), 92 | )) 93 | 94 | 95 | if __name__ == "__main__": 96 | args = docopt(__doc__) 97 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 98 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_graph_task import Sparse_Graph_Task, DataFold 2 | from .qm9_task import QM9_Task 3 | from .citation_network_task import Citation_Network_Task 4 | from .ppi_task import PPI_Task 5 | from .varmisuse_task import VarMisuse_Task 6 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GGNN.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 3, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 12500, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GNN-Edge-MLP0.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 5, 3 | "hidden_size": 256, 4 | "max_nodes_in_batch": 6000, 5 | 
"graph_layer_input_dropout_keep_prob": 0.8 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GNN-Edge-MLP1.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 4, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 6000, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GNN-FiLM.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 4, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 6000, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_RGAT.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 3, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 11000, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_RGCN.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 4, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 12500, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_RGIN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "patience": 25, 7 | "graph_num_layers": 5, 8 | "hidden_size": 256, 9 | "max_nodes_in_batch": 8000, 10 | "graph_num_edge_MLP_hidden_layers": 1, 11 | "graph_num_aggr_MLP_hidden_layers": null, 12 | "graph_layer_input_dropout_keep_prob": 0.8 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GGNN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "graph_dense_between_every_num_gnn_layers": 32, 5 | "learning_rate": 0.0008471209461829375, 6 | "graph_inter_layer_norm": true, 7 | "graph_activation_function": "relu", 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_model_activation_function": "tanh", 10 | "momentum": 0.85, 11 | "optimizer": "RMSProp", 12 | "clamp_gradient_norm": 1.0, 13 | "patience": 25, 14 | "max_epochs": 10000, 15 | "graph_rnn_cell": "RNN", 16 | "graph_layer_input_dropout_keep_prob": 1.0, 17 | "graph_num_layers": 6, 18 | "message_aggregation_function": "sum", 19 | "graph_residual_connection_every_num_layers": 2, 20 | "hidden_size": 128, 21 | "max_nodes_in_batch": 50000, 22 | "learning_rate_decay": 0.98 23 | } 24 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GNN-Edge-MLP0.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "max_nodes_in_batch": 50000, 5 | "graph_num_layers": 8, 6 | 
"graph_num_timesteps_per_layer": 1, 7 | "graph_layer_input_dropout_keep_prob": 0.9, 8 | "graph_dense_between_every_num_gnn_layers": 32, 9 | "graph_model_activation_function": "tanh", 10 | "graph_residual_connection_every_num_layers": 2, 11 | "graph_inter_layer_norm": true, 12 | "max_epochs": 10000, 13 | "patience": 25, 14 | "optimizer": "RMSProp", 15 | "learning_rate": 0.0005072060718321982, 16 | "learning_rate_decay": 0.98, 17 | "lr_for_num_graphs_per_batch": null, 18 | "momentum": 0.85, 19 | "clamp_gradient_norm": 1.0, 20 | "hidden_size": 128, 21 | "graph_activation_function": "relu", 22 | "message_aggregation_function": "sum", 23 | "graph_message_weights_dropout_ratio": 0.0, 24 | "use_target_state_as_input": true, 25 | "num_edge_hidden_layers": 0 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GNN-Edge-MLP1.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "max_nodes_in_batch": 50000, 5 | "graph_num_layers": 8, 6 | "graph_num_timesteps_per_layer": 1, 7 | "graph_layer_input_dropout_keep_prob": 0.9, 8 | "graph_dense_between_every_num_gnn_layers": 32, 9 | "graph_model_activation_function": "tanh", 10 | "graph_residual_connection_every_num_layers": 2, 11 | "graph_inter_layer_norm": false, 12 | "max_epochs": 10000, 13 | "patience": 25, 14 | "optimizer": "Adam", 15 | "learning_rate": 0.0006482335154980316, 16 | "learning_rate_decay": 0.98, 17 | "lr_for_num_graphs_per_batch": null, 18 | "momentum": 0.85, 19 | "clamp_gradient_norm": 1.0, 20 | "hidden_size": 128, 21 | "graph_activation_function": "gelu", 22 | "message_aggregation_function": "sum", 23 | "graph_message_weights_dropout_ratio": 0.0, 24 | "use_target_state_as_input": true, 25 | "num_edge_hidden_layers": 1 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GNN-FiLM.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "message_aggregation_function": "sum", 5 | "graph_activation_function": "elu", 6 | "momentum": 0.85, 7 | "learning_rate_decay": 0.98, 8 | "patience": 25, 9 | "normalize_messages_by_num_incoming": false, 10 | "max_epochs": 10000, 11 | "graph_num_timesteps_per_layer": 1, 12 | "optimizer": "RMSProp", 13 | "hidden_size": 128, 14 | "graph_num_layers": 8, 15 | "graph_residual_connection_every_num_layers": 2, 16 | "graph_layer_input_dropout_keep_prob": 0.9, 17 | "learning_rate": 0.0006654723503723253, 18 | "graph_inter_layer_norm": true, 19 | "graph_dense_between_every_num_gnn_layers": 32, 20 | "max_nodes_in_batch": 50000, 21 | "graph_model_activation_function": "tanh", 22 | "clamp_gradient_norm": 1.0 23 | } 24 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_NoStruct-MLP0.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "max_nodes_in_batch": 50000, 5 | "graph_num_layers": 8, 6 | "graph_num_timesteps_per_layer": 1, 7 | "graph_layer_input_dropout_keep_prob": 0.9, 8 | "graph_dense_between_every_num_gnn_layers": 32, 9 | "graph_model_activation_function": "tanh", 10 | "graph_residual_connection_every_num_layers": 2, 11 | "graph_inter_layer_norm": true, 12 | "max_epochs": 10000, 13 | "patience": 25, 14 | "optimizer": 
"RMSProp", 15 | "learning_rate": 0.0005072060718321982, 16 | "learning_rate_decay": 0.98, 17 | "lr_for_num_graphs_per_batch": null, 18 | "momentum": 0.85, 19 | "clamp_gradient_norm": 1.0, 20 | "hidden_size": 128, 21 | "graph_activation_function": "relu", 22 | "message_aggregation_function": "sum", 23 | "graph_message_weights_dropout_ratio": 0.0, 24 | "use_target_state_as_input": true, 25 | "num_edge_hidden_layers": 0 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_RGAT.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "graph_model_activation_function": "tanh", 5 | "patience": 25, 6 | "optimizer": "RMSProp", 7 | "graph_activation_function": "elu", 8 | "learning_rate_decay": 0.98, 9 | "max_nodes_in_batch": 50000, 10 | "graph_layer_input_dropout_keep_prob": 0.9, 11 | "graph_inter_layer_norm": false, 12 | "clamp_gradient_norm": 1.0, 13 | "graph_num_layers": 8, 14 | "momentum": 0.85, 15 | "graph_dense_between_every_num_gnn_layers": 32, 16 | "hidden_size": 128, 17 | "graph_residual_connection_every_num_layers": 2, 18 | "num_heads": 8, 19 | "learning_rate": 0.0005800837190772856, 20 | "graph_num_timesteps_per_layer": 1, 21 | "max_epochs": 10000 22 | } 23 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_RGCN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "graph_residual_connection_every_num_layers": 2, 5 | "max_nodes_in_batch": 50000, 6 | "graph_num_layers": 8, 7 | "graph_model_activation_function": "tanh", 8 | "graph_layer_input_dropout_keep_prob": 1.0, 9 | "graph_activation_function": "leaky_relu", 10 | "graph_num_timesteps_per_layer": 1, 11 | "learning_rate_decay": 0.98, 12 | "max_epochs": 10000, 13 | "momentum": 0.85, 14 | "message_aggregation_function": "sum", 15 | "graph_dense_between_every_num_gnn_layers": 32, 16 | "learning_rate": 0.0005720408870458782, 17 | "graph_inter_layer_norm": true, 18 | "hidden_size": 128, 19 | "clamp_gradient_norm": 1.0, 20 | "patience": 25, 21 | "optimizer": "RMSProp" 22 | } 23 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_RGIN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 6, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 32, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": false, 14 | "max_epochs": 10000, 15 | "patience": 25, 16 | "optimizer": "RMSProp", 17 | "learning_rate": 0.000700776770702023, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": null, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "hidden_size": 128, 23 | "graph_activation_function": "elu", 24 | "message_aggregation_function": "sum", 25 | "use_target_state_as_input": false, 26 | "graph_num_edge_MLP_hidden_layers": 1, 27 | "graph_num_aggr_MLP_hidden_layers": null 28 | } 29 | } -------------------------------------------------------------------------------- 
/tf-gnn-samples/tasks/default_hypers/VarMisuse_GGNN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 100000, 7 | "graph_num_layers": 6, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.8, 10 | "graph_message_weights_dropout_ratio": 0.0, 11 | "graph_dense_between_every_num_gnn_layers": 10000, 12 | "graph_model_activation_function": "tanh", 13 | "graph_residual_connection_every_num_layers": 10000, 14 | "graph_inter_layer_norm": false, 15 | "max_epochs": 10000, 16 | "patience": 5, 17 | "optimizer": "Adam", 18 | "learning_rate": 0.00015, 19 | "learning_rate_decay": 0.98, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 96, 24 | "graph_rnn_cell": "GRU", 25 | "graph_activation_function": "tanh", 26 | "message_aggregation_function": "sum" 27 | } 28 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_GNN-Edge-MLP0.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 8, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "gelu", 25 | "message_aggregation_function": "sum", 26 | "graph_message_weights_dropout_ratio": 0.0, 27 | "use_target_state_as_input": true, 28 | "num_edge_hidden_layers": 0 29 | } 30 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_GNN-Edge-MLP1.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 45000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "gelu", 25 | "message_aggregation_function": "sum", 26 | "graph_message_weights_dropout_ratio": 0.0, 27 | "use_target_state_as_input": true, 28 | "num_edge_hidden_layers": 1 29 | } 30 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_GNN-FiLM.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 60000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.8, 10 | "graph_message_weights_dropout_ratio": 0.0, 11 | "graph_dense_between_every_num_gnn_layers": 1, 12 | "graph_model_activation_function": "tanh", 13 | "graph_residual_connection_every_num_layers": 2, 14 | "graph_inter_layer_norm": false, 15 | "max_epochs": 10000, 16 | "patience": 5, 17 | "optimizer": "Adam", 18 | "learning_rate": 0.00015, 19 | "learning_rate_decay": 0.98, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "ReLU", 25 | "message_aggregation_function": "sum", 26 | "normalize_messages_by_num_incoming": false 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_NoStruct-MLP1.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 45000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "gelu", 25 | "message_aggregation_function": "sum", 26 | "graph_message_weights_dropout_ratio": 0.0, 27 | "use_target_state_as_input": true, 28 | "num_edge_hidden_layers": 1 29 | } 30 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_RGAT.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 8, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 10000, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 10000, 13 | "graph_inter_layer_norm": false, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 96, 24 | "num_heads": 8, 25 | "graph_activation_function": "tanh" 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_RGCN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 75000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 10000, 11 | 
"graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 10000, 13 | "graph_inter_layer_norm": false, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "momentum": 0.85, 20 | "clamp_gradient_norm": 1.0, 21 | "random_seed": 0, 22 | "hidden_size": 128, 23 | "graph_activation_function": "ReLU", 24 | "message_aggregation_function": "sum", 25 | "min_epochs": 8 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_RGIN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 6, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.8, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "hidden_size": 128, 23 | "graph_activation_function": "ReLU", 24 | "message_aggregation_function": "sum", 25 | "use_target_state_as_input": false, 26 | "graph_num_edge_MLP_hidden_layers": 1, 27 | "graph_num_aggr_MLP_hidden_layers": null 28 | } 29 | } -------------------------------------------------------------------------------- /tf-gnn-samples/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | test.py [options] STORED_MODEL_PATH [DATA_PATH] 5 | 6 | STORED_MODEL is the path of a model snapshot created by train.py. 7 | DATA_PATH is the location of the data to test on. 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --result-dir DIR Directory to store logfiles and trained models. [default: trained_models] 12 | --azure-info PATH Azure authentication information file (JSON). [default: azure_auth.json] 13 | --quiet Show less output. 14 | --debug Turn on debugger. 
15 | """ 16 | import json 17 | from typing import Optional 18 | 19 | from docopt import docopt 20 | from dpu_utils.utils import run_and_debug, RichPath 21 | 22 | from utils.model_utils import restore 23 | 24 | 25 | def test(model_path: str, test_data_path: Optional[RichPath], result_dir: str, quiet: bool = False, run_id: str = None): 26 | model = restore(model_path, result_dir, run_id) 27 | model.params['max_nodes_in_batch'] = 2 * model.params['max_nodes_in_batch'] # We can process larger batches if we don't do training 28 | test_data_path = test_data_path or RichPath.create(model.task.default_data_path()) 29 | model.log_line(" Using the following task params: %s" % json.dumps(model.task.params)) 30 | model.log_line(" Using the following model params: %s" % json.dumps(model.params)) 31 | model.test(test_data_path) 32 | 33 | 34 | def run(args): 35 | azure_info_path = args.get('--azure-info', None) 36 | model_path = args['STORED_MODEL_PATH'] 37 | test_data_path = args.get('DATA_PATH') 38 | if test_data_path is not None: 39 | test_data_path = RichPath.create(test_data_path, azure_info_path) 40 | result_dir = args.get('--result-dir', 'trained_models') 41 | test(model_path, test_data_path, result_dir, quiet=args.get('--quiet')) 42 | 43 | 44 | if __name__ == "__main__": 45 | args = docopt(__doc__) 46 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 47 | -------------------------------------------------------------------------------- /tf-gnn-samples/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | train.py [options] MODEL_NAME TASK_NAME 5 | 6 | MODEL_NAME has to be one of the supported models, which currently are 7 | GGNN, GNN-Edge-MLP, GNN-FiLM, RGAT, RGCN, RGDCN 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --data-path PATH Path to load data from, has task-specific defaults under data/. 12 | --result-dir DIR Directory to store logfiles and trained models. [default: trained_models] 13 | --run-test Indicate if the task's test should be run. 14 | --model-param-overrides PARAMS Parameter settings overriding model defaults (in JSON format). 15 | --task-param-overrides PARAMS Parameter settings overriding task defaults (in JSON format). 16 | --quiet Show less output. 17 | --tensorboard DIR Dump tensorboard event files to DIR. 18 | --azure-info= Azure authentication information file (JSON). [default: azure_auth.json] 19 | --debug Turn on debugger. 
20 | """ 21 | import json 22 | import os 23 | import sys 24 | import time 25 | 26 | from docopt import docopt 27 | from dpu_utils.utils import run_and_debug, RichPath, git_tag_run 28 | 29 | from utils.model_utils import name_to_model_class, name_to_task_class 30 | from test import test 31 | 32 | 33 | def run(args): 34 | azure_info_path = args.get('--azure-info', None) 35 | model_cls, additional_model_params = name_to_model_class(args['MODEL_NAME']) 36 | task_cls, additional_task_params = name_to_task_class(args['TASK_NAME']) 37 | 38 | # Collect parameters first from the class defaults, then any task-specific defaults, and finally the CLI: 39 | task_params = task_cls.default_params() 40 | task_params.update(additional_task_params) 41 | model_params = model_cls.default_params() 42 | model_params.update(additional_model_params) 43 | 44 | # Load potential task-specific defaults: 45 | task_model_default_hypers_file = \ 46 | os.path.join(os.path.dirname(__file__), 47 | "tasks", 48 | "default_hypers", 49 | "%s_%s.json" % (task_cls.name(), model_cls.name(model_params))) 50 | if os.path.exists(task_model_default_hypers_file): 51 | print("Loading task/model-specific default parameters from %s." % task_model_default_hypers_file) 52 | with open(task_model_default_hypers_file, "rt") as f: 53 | default_task_model_hypers = json.load(f) 54 | task_params.update(default_task_model_hypers['task_params']) 55 | model_params.update(default_task_model_hypers['model_params']) 56 | 57 | # Load overrides from command line: 58 | task_params.update(json.loads(args.get('--task-param-overrides') or '{}')) 59 | model_params.update(json.loads(args.get('--model-param-overrides') or '{}')) 60 | 61 | # Finally, upgrade every parameter that is a path to a RichPath: 62 | task_params_orig = dict(task_params) 63 | for (param_name, param_value) in task_params.items(): 64 | if param_name.endswith("_path"): 65 | task_params[param_name] = RichPath.create(param_value, azure_info_path) 66 | 67 | # Now prepare to actually run by setting up directories, creating object instances and running: 68 | result_dir = args.get('--result-dir', 'trained_models') 69 | os.makedirs(result_dir, exist_ok=True) 70 | task = task_cls(task_params) 71 | data_path = args.get('--data-path') or task.default_data_path() 72 | data_path = RichPath.create(data_path, azure_info_path) 73 | task.load_data(data_path) 74 | 75 | random_seeds = model_params['random_seed'] 76 | if not isinstance(random_seeds, list): 77 | random_seeds = [random_seeds] 78 | 79 | for random_seed in random_seeds: 80 | model_params['random_seed'] = random_seed 81 | run_id = "_".join([task_cls.name(), model_cls.name(model_params), time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())]) 82 | 83 | model = model_cls(model_params, task, run_id, result_dir) 84 | model.log_line("Run %s starting."
% run_id) 85 | model.log_line(" Using the following task params: %s" % json.dumps(task_params_orig)) 86 | model.log_line(" Using the following model params: %s" % json.dumps(model_params)) 87 | 88 | if sys.stdin.isatty(): 89 | try: 90 | git_sha = git_tag_run(run_id) 91 | model.log_line(" git tagged as %s" % git_sha) 92 | except: 93 | print(" Tried tagging run in git, but failed.") 94 | pass 95 | 96 | model.initialize_model( ) 97 | model.train(quiet=args.get('--quiet'), tf_summary_path=args.get('--tensorboard')) 98 | 99 | if args.get('--run-test'): 100 | test(model.best_model_file, data_path, result_dir, quiet=args.get('--quiet'), run_id=run_id) 101 | 102 | 103 | if __name__ == "__main__": 104 | args = docopt(__doc__) 105 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 106 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import SMALL_NUMBER, BIG_NUMBER, get_gated_unit, get_aggregation_function, get_activation, MLP, micro_f1 2 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/add_child_ids.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from argparse import ArgumentParser 3 | 4 | raw_keys = ['Child', 'NextToken', 'ComputedFrom', 'LastUse', 'LastWrite', 'LastLexicalUse', 'FormalArgName', 'GuardedBy', 'GuardedByNegation', 'UsesSubtoken'] 5 | 6 | if __name__ == '__main__': 7 | parser = ArgumentParser() 8 | parser.add_argument("--edges", dest="edges", required=True) 9 | args = parser.parse_args() 10 | 11 | with open(args.edges, 'rb') as file: 12 | raw_edges = pickle.load(file) 13 | 14 | parent_to_children = {} 15 | child_to_parent = {} 16 | for s, t in raw_edges['Child']: 17 | if not s in parent_to_children: 18 | parent_to_children[s] = [] 19 | parent_to_children[s].append(t) 20 | child_to_parent[t] = s 21 | 22 | cur = 0 23 | next_map = {} 24 | for s, t in raw_edges['NextToken']: 25 | next_map[s] = t 26 | prev_map = {t:s for s,t in next_map.items()} 27 | 28 | def get_all_next(n): 29 | result = [] 30 | cur = n 31 | while cur in next_map: 32 | next_item = next_map[cur] 33 | result.append(next_item) 34 | cur = next_item 35 | return result 36 | 37 | def get_all_prev(n): 38 | result = [] 39 | cur = n 40 | while cur in prev_map: 41 | prev_item = prev_map[cur] 42 | result.append(prev_item) 43 | cur = prev_item 44 | return result 45 | 46 | 47 | nodes = child_to_parent.keys() 48 | left_nodes = list(nodes) 49 | 50 | parent_to_descendants = {} 51 | def get_parent_to_descendants(p): 52 | desc = set() 53 | for c in parent_to_children[p]: 54 | if c in parent_to_children: # if c is a parent itself 55 | desc.update(get_parent_to_descendants(c)) 56 | else: 57 | desc.add(c) 58 | return desc 59 | 60 | for p in parent_to_children.keys(): 61 | desc = get_parent_to_descendants(p) 62 | parent_to_descendants[p] = desc 63 | 64 | roots = set() 65 | for n in nodes: 66 | cur = n 67 | while cur in child_to_parent: 68 | cur = child_to_parent[cur] 69 | roots.add(cur) 70 | 71 | print(raw_edges) 72 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/citation_network_utils.py: -------------------------------------------------------------------------------- 1 | # This is largely copied from https://raw.githubusercontent.com/tkipf/gcn/master/gcn/utils.py 2 | # It is Copyright (c) 
2016 Thomas Kipf, under the MIT license (see LICENSE for a copy) 3 | 4 | import numpy as np 5 | import pickle as pkl 6 | import scipy.sparse as sp 7 | import sys 8 | 9 | 10 | def parse_index_file(filename): 11 | """Parse index file.""" 12 | index = [] 13 | for line in open(filename): 14 | index.append(int(line.strip())) 15 | return index 16 | 17 | 18 | def sample_mask(idx, l): 19 | """Create mask.""" 20 | mask = np.zeros(l) 21 | mask[idx] = 1 22 | return np.array(mask, dtype=np.bool) 23 | 24 | 25 | def load_data(directory: str, dataset_str: str): 26 | """ 27 | Loads input data from gcn/data directory 28 | 29 | ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; 30 | ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; 31 | ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances 32 | (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; 33 | ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object; 34 | ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object; 35 | ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; 36 | ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict 37 | object; 38 | ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object. 39 | 40 | All objects above must be saved using python pickle module. 41 | 42 | :param dataset_str: Dataset name 43 | :return: All data input files loaded (as well the training/test data). 44 | """ 45 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 46 | objects = [] 47 | for i in range(len(names)): 48 | with open("{}/ind.{}.{}".format(directory, dataset_str, names[i]), 'rb') as f: 49 | if sys.version_info > (3, 0): 50 | objects.append(pkl.load(f, encoding='latin1')) 51 | else: 52 | objects.append(pkl.load(f)) 53 | 54 | x, y, tx, ty, allx, ally, graph = tuple(objects) 55 | test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(directory, dataset_str)) 56 | test_idx_range = np.sort(test_idx_reorder) 57 | 58 | if dataset_str == 'citeseer': 59 | # Fix citeseer dataset (there are some isolated nodes in the graph) 60 | # Find isolated nodes, add them as zero-vecs into the right position 61 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 62 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 63 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 64 | tx = tx_extended 65 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 66 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 67 | ty = ty_extended 68 | 69 | features = sp.vstack((allx, tx)).tolil() 70 | features[test_idx_reorder, :] = features[test_idx_range, :] 71 | 72 | labels = np.vstack((ally, ty)) 73 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 74 | 75 | idx_test = test_idx_range.tolist() 76 | idx_train = range(len(y)) 77 | idx_val = range(len(y), len(y)+500) 78 | 79 | train_mask = sample_mask(idx_train, labels.shape[0]) 80 | val_mask = sample_mask(idx_val, labels.shape[0]) 81 | test_mask = sample_mask(idx_test, labels.shape[0]) 82 | 83 | y_train = np.zeros(labels.shape) 84 | y_val = np.zeros(labels.shape) 85 | y_test = np.zeros(labels.shape) 86 | y_train[train_mask, :] = labels[train_mask, :] 87 | y_val[val_mask, :] 
= labels[val_mask, :] 88 | y_test[test_mask, :] = labels[test_mask, :] 89 | 90 | return graph, features, y_train, y_val, y_test, train_mask, val_mask, test_mask 91 | 92 | 93 | def sparse_to_tuple(sparse_mx): 94 | """Convert sparse matrix to tuple representation.""" 95 | def to_tuple(mx): 96 | if not sp.isspmatrix_coo(mx): 97 | mx = mx.tocoo() 98 | coords = np.vstack((mx.row, mx.col)).transpose() 99 | values = mx.data 100 | shape = mx.shape 101 | # All of these will need to be sorted: 102 | sort_indices = np.lexsort(np.rot90(coords)) 103 | return coords[sort_indices], values[sort_indices], shape 104 | 105 | if isinstance(sparse_mx, list): 106 | for i in range(len(sparse_mx)): 107 | sparse_mx[i] = to_tuple(sparse_mx[i]) 108 | else: 109 | sparse_mx = to_tuple(sparse_mx) 110 | 111 | return sparse_mx 112 | 113 | 114 | def preprocess_features(features): 115 | """Row-normalize feature matrix and convert to tuple representation""" 116 | rowsum = np.array(features.sum(1)) 117 | r_inv = np.power(rowsum, -1).flatten() 118 | r_inv[np.isinf(r_inv)] = 0. 119 | r_mat_inv = sp.diags(r_inv) 120 | features = r_mat_inv.dot(features) 121 | return features.toarray() # densify -- these are tiny and we don't care 122 | 123 | 124 | def normalize_adj(adj): 125 | """Symmetrically normalize adjacency matrix.""" 126 | adj = sp.coo_matrix(adj) 127 | rowsum = np.array(adj.sum(1)) 128 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 129 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 130 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 131 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() 132 | 133 | 134 | def preprocess_adj(adj): 135 | """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation.""" 136 | adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) 137 | return sparse_to_tuple(adj_normalized) 138 | 139 | 140 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/compute_diameters.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import json 3 | import networkx as nx 4 | import gzip 5 | import numpy as np 6 | import statistics 7 | 8 | def compute_diameter(adjacency_list): 9 | # graph is a list of edges 10 | # every edge is a list: [source, type, target] 11 | g = nx.Graph() 12 | for edge_source, _, edge_target in adjacency_list: 13 | g.add_edge(edge_source, edge_target) 14 | return nx.diameter(g) 15 | 16 | if __name__ == '__main__': 17 | parser = ArgumentParser() 18 | parser.add_argument("--data", dest="data", required=True) 19 | args = parser.parse_args() 20 | 21 | with gzip.open(args.data, 'r') as file: 22 | lines = file.readlines() 23 | 24 | objs = [json.loads(line) for line in lines] 25 | graphs = [o['graph'] for o in objs] 26 | 27 | diameters = [compute_diameter(graph) for graph in graphs] 28 | print('Max diameter: ', max(diameters)) 29 | print('Mean diameter: ', np.mean(diameters)) 30 | print('stddev: ', statistics.stdev(diameters)) 31 | 32 | percentiles = range(10, 110, 10) 33 | percentile_results = np.percentile(diameters, percentiles) 34 | for i, res in zip(percentiles, percentile_results): 35 | print('Diameters - {} percentile: {}'.format(i, res)) -------------------------------------------------------------------------------- /tf-gnn-samples/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Tuple, Type, Dict, Any 4 | 5 | 
import pickle 6 | 7 | from models import (Sparse_Graph_Model, GGNN_Model, GNN_FiLM_Model, GNN_Edge_MLP_Model, 8 | RGAT_Model, RGCN_Model, RGDCN_Model, RGIN_Model) 9 | from models.no_struct_mlp_model import No_Struct_MLP_Model 10 | from tasks import Sparse_Graph_Task, QM9_Task, Citation_Network_Task, PPI_Task, VarMisuse_Task 11 | 12 | 13 | def name_to_task_class(name: str) -> Tuple[Type[Sparse_Graph_Task], Dict[str, Any]]: 14 | name = name.lower() 15 | if name == "qm9": 16 | return QM9_Task, {} 17 | if name == "cora": 18 | return Citation_Network_Task, {"data_kind": "cora"} 19 | if name == "citeseer": 20 | return Citation_Network_Task, {"data_kind": "citeseer"} 21 | if name == "pubmed": 22 | return Citation_Network_Task, {"data_kind": "pubmed"} 23 | if name == "citationnetwork": 24 | return Citation_Network_Task, {} 25 | if name == "ppi": 26 | return PPI_Task, {} 27 | if name == "varmisuse": 28 | return VarMisuse_Task, {} 29 | 30 | raise ValueError("Unknown task type '%s'" % name) 31 | 32 | 33 | def name_to_model_class(name: str) -> Tuple[Type[Sparse_Graph_Model], Dict[str, Any]]: 34 | name = name.lower() 35 | if name in ["ggnn", "ggnn_model"]: 36 | return GGNN_Model, {} 37 | if name in ["gnn_edge_mlp", "gnn-edge-mlp", "gnn_edge_mlp_model"]: 38 | return GNN_Edge_MLP_Model, {} 39 | if name in ["gnn_edge_mlp0", "gnn-edge-mlp0", "gnn_edge_mlp0_model"]: 40 | return GNN_Edge_MLP_Model, {'num_edge_hidden_layers': 0} 41 | if name in ["gnn_edge_mlp1", "gnn-edge-mlp1", "gnn_edge_mlp1_model"]: 42 | return GNN_Edge_MLP_Model, {'num_edge_hidden_layers': 1} 43 | if name in ["gnn_edge_mlp", "gnn-edge-mlp"]: 44 | return GNN_Edge_MLP_Model, {} 45 | if name in ["gnn_film", "gnn-film", "gnn_film_model"]: 46 | return GNN_FiLM_Model, {} 47 | if name in ["rgat", "rgat_model"]: 48 | return RGAT_Model, {} 49 | if name in ["rgcn", "rgcn_model"]: 50 | return RGCN_Model, {} 51 | if name in ["rgdcn", "rgdcn_model"]: 52 | return RGDCN_Model, {} 53 | if name in ["rgin", "rgin_model"]: 54 | return RGIN_Model, {} 55 | if name in ['nostruct', 'no_struct', 'no-struct', 'nostruct-mlp1']: 56 | return No_Struct_MLP_Model, {'num_edge_hidden_layers': 1} 57 | 58 | raise ValueError("Unknown model type '%s'" % name) 59 | 60 | 61 | def restore(saved_model_path: str, result_dir: str, run_id: str = None) -> Sparse_Graph_Model: 62 | print("Loading model from file %s." % saved_model_path) 63 | with open(saved_model_path, 'rb') as in_file: 64 | data_to_load = pickle.load(in_file) 65 | 66 | model_cls, _ = name_to_model_class(data_to_load['model_class']) 67 | task_cls, additional_task_params = name_to_task_class(data_to_load['task_class']) 68 | 69 | if run_id is None: 70 | run_id = "_".join([task_cls.name(), model_cls.name(data_to_load['model_params']), time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())]) 71 | 72 | task = task_cls(data_to_load['task_params']) 73 | task.restore_from_metadata(data_to_load['task_metadata']) 74 | 75 | model = model_cls(data_to_load['model_params'], task, run_id, result_dir) 76 | model.load_weights(data_to_load['weights']) 77 | 78 | model.log_line("Loaded model from snapshot %s." 
% saved_model_path) 79 | 80 | return model 81 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/prep_baseline.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import pickle 4 | import sys 5 | from argparse import ArgumentParser 6 | import gzip 7 | import re 8 | import os 9 | import random 10 | import multiprocessing as mp 11 | 12 | ignore_list = ['openlivewriter', 'botbuilder'] 13 | CANDIDATE_BEGIN = '' 14 | CANDIDATE_END = '' 15 | SLOT = '' 16 | project_name_map = { 17 | 'akka.net': 'akka' 18 | } 19 | filename_mapping = { 20 | 'C:\\Users\\t-mialla\\Documents\\sampleProjects\\SignalR\\src\\Microsoft.AspNet.SignalR.Core\\Messaging\\Cursor.cs': 'Core\\Messaging\\Cursor.cs' 21 | } 22 | 23 | RE_WORDS = re.compile(r''' 24 | # Find words in a string. Order matters! 25 | [A-Z]+(?=[A-Z][a-z]) | # All upper case before a capitalized word 26 | [A-Z]?[a-z]+ | # Capitalized words / all lower case 27 | [A-Z]+ | # All upper case 28 | \d+ | # Numbers 29 | _ | 30 | \" | 31 | .+ 32 | ''', re.VERBOSE) 33 | 34 | def split_subtokens(str): 35 | return [subtok for subtok in RE_WORDS.findall(str) if not subtok == '_'] 36 | 37 | def get_immediate_subdirectories(a_dir): 38 | return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) 39 | if os.path.isdir(os.path.join(a_dir, name))] 40 | 41 | def get_immediate_files(dir): 42 | return [(os.path.join(dir, name)) for name in os.listdir(dir) 43 | if os.path.isfile(os.path.join(dir, name))] 44 | 45 | def collect_tokens(path): 46 | projects = get_immediate_subdirectories(path) 47 | 48 | tokens_dict = {} 49 | for proj in projects: 50 | proj_name = proj.split('/')[-1] 51 | if proj_name in project_name_map: 52 | proj_name = project_name_map[proj_name] 53 | tokens_file_name = f'{proj}/{proj_name}-tokens.json.gz' 54 | if proj_name in ignore_list: 55 | continue 56 | #if os.path.isfile(tokens_file_name): 57 | with gzip.open(tokens_file_name, 'r') as file: 58 | lines = file.readlines() 59 | objs = json.loads(lines[0]) 60 | for o in objs: 61 | tokens_dict[o['Provenance']] = o['Tokens'] 62 | return tokens_dict 63 | 64 | def create_sequences(path, tokens_dict, out_path): 65 | subsets = get_immediate_subdirectories(path) # train, valid, test, testonly 66 | process_gz_file_func = functools.partial(process_gz_file, tokens_dict) 67 | for dir in subsets: 68 | dir_name = dir.split('/')[-1] 69 | out_dir_path = f'{out_path}/{dir_name}' 70 | if os.path.isdir(out_dir_path): 71 | raise ValueError(f'{out_path}/{dir_name} already exists') 72 | os.mkdir(out_dir_path) 73 | files = get_immediate_files(dir) 74 | with open(f'{out_dir_path}/source.txt', 'w') as out_source_file, \ 75 | open(f'{out_dir_path}/target.txt', 'w') as out_target_file: 76 | with mp.Pool(64) as pool: 77 | #results = [process_gz_file(file, tokens_dict) for file in files] 78 | results = pool.imap_unordered(process_gz_file_func, files) 79 | for example in results: 80 | for source, target in zip(*example): 81 | out_source_file.write(source) 82 | out_target_file.write(target) 83 | 84 | def process_gz_file(tokens_dict, gz_file_name): 85 | sources, targets = [], [] 86 | with gzip.open(gz_file_name, 'r') as gz_file: 87 | lines = gz_file.readlines() 88 | objs = [json.loads(l) for l in lines] 89 | for o in objs: 90 | filename = o['filename'] 91 | if filename in tokens_dict: 92 | tokens = tokens_dict[filename] 93 | elif filename in filename_mapping: 94 | tokens = 
tokens_dict[filename_mapping[filename]] 95 | else: 96 | found_filenames = [name for name in tokens_dict.keys() if filename.endswith(name)] 97 | if len(found_filenames) != 1: 98 | found_filenames = [name for name in found_filenames if name != 's'] 99 | if len(found_filenames) != 1: 100 | raise ValueError( 101 | f'Looking for filename: {filename}, but found in tokens_dict: {found_filenames}') 102 | 103 | tokens = tokens_dict[found_filenames[0]] 104 | print(f'Taking {found_filenames[0]} instead of {filename}') 105 | slot_token_index = o['slotTokenIdx'] 106 | tokens[slot_token_index] = SLOT 107 | subtokens = [' '.join(split_subtokens(tok)) for tok in tokens] 108 | candidates = [' '.join(split_subtokens(candi['SymbolName'])) for candi in o['SymbolCandidates']] 109 | # Important to shuffle, because the first one is always the correct one 110 | random.shuffle(candidates) 111 | label = [' '.join(split_subtokens(candi['SymbolName'])) for candi in o['SymbolCandidates'] if 112 | candi['IsCorrect']] 113 | if len(label) != 1: 114 | raise ValueError(f'Found {len(label)} correct labels in {gz_file_name}, example {o["filename"]}') 115 | label = label[0] 116 | outline = ' '.join(subtokens) + ' ' + ' '.join( 117 | [CANDIDATE_BEGIN + ' ' + candi + ' ' + CANDIDATE_END for candi in candidates]) + '\n' 118 | sources.append(outline) 119 | targets.append(label + '\n') 120 | return sources, targets 121 | 122 | if __name__ == '__main__': 123 | parser = ArgumentParser() 124 | parser.add_argument("--raw", dest="raw_path", required=True) 125 | parser.add_argument("--reorg", dest="reorg_path", required=True) 126 | parser.add_argument("--out", dest="out_path", required=True) 127 | 128 | args = parser.parse_args() 129 | 130 | #tokens_dict = collect_tokens(args.raw_path) 131 | with open('tokens.pkl', 'rb') as file: 132 | tokens_dict = pickle.load(file) 133 | sequences = create_sequences(args.reorg_path, tokens_dict, args.out_path) 134 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, Union, List 2 | 3 | import tensorflow as tf 4 | 5 | 6 | BIG_NUMBER = 1e7 7 | SMALL_NUMBER = 1e-7 8 | 9 | 10 | def get_gated_unit(units: int, gated_unit: str, activation_function: str): 11 | activation_fn = get_activation(activation_function) 12 | gated_unit_name = gated_unit.lower() 13 | if gated_unit_name == 'rnn': 14 | return tf.keras.layers.SimpleRNNCell(units, activation=activation_fn) 15 | if gated_unit_name == 'gru': 16 | return tf.keras.layers.GRUCell(units, activation=activation_fn) 17 | if gated_unit_name == 'lstm': 18 | return tf.keras.layers.LSTMCell(units, activation=activation_fn) 19 | else: 20 | raise Exception("Unknown RNN cell type '%s'." % gated_unit) 21 | 22 | 23 | def get_aggregation_function(aggregation_fun: Optional[str]): 24 | if aggregation_fun in ['sum', 'unsorted_segment_sum']: 25 | return tf.unsorted_segment_sum 26 | if aggregation_fun in ['max', 'unsorted_segment_max']: 27 | return tf.unsorted_segment_max 28 | if aggregation_fun in ['mean', 'unsorted_segment_mean']: 29 | return tf.unsorted_segment_mean 30 | if aggregation_fun in ['sqrt_n', 'unsorted_segment_sqrt_n']: 31 | return tf.unsorted_segment_sqrt_n 32 | else: 33 | raise ValueError("Unknown aggregation function '%s'!"
% aggregation_fun) 34 | 35 | 36 | def get_activation(activation_fun: Optional[str]): 37 | if activation_fun is None: 38 | return None 39 | activation_fun = activation_fun.lower() 40 | if activation_fun == 'linear': 41 | return None 42 | if activation_fun == 'tanh': 43 | return tf.tanh 44 | if activation_fun == 'relu': 45 | return tf.nn.relu 46 | if activation_fun == 'leaky_relu': 47 | return tf.nn.leaky_relu 48 | if activation_fun == 'elu': 49 | return tf.nn.elu 50 | if activation_fun == 'selu': 51 | return tf.nn.selu 52 | if activation_fun == 'gelu': 53 | def gelu(input_tensor): 54 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 55 | return input_tensor * cdf 56 | return gelu 57 | else: 58 | raise ValueError("Unknown activation function '%s'!" % activation_fun) 59 | 60 | 61 | def micro_f1(logits, labels): 62 | # Everything on int, because who trusts float anyway? 63 | predicted = tf.round(tf.nn.sigmoid(logits)) 64 | predicted = tf.cast(predicted, dtype=tf.int32) 65 | labels = tf.cast(labels, dtype=tf.int32) 66 | 67 | true_pos = tf.count_nonzero(predicted * labels) 68 | false_pos = tf.count_nonzero(predicted * (labels - 1)) 69 | false_neg = tf.count_nonzero((predicted - 1) * labels) 70 | 71 | precision = true_pos / (true_pos + false_pos) 72 | recall = true_pos / (true_pos + false_neg) 73 | fmeasure = (2 * precision * recall) / (precision + recall) 74 | return tf.cast(fmeasure, tf.float32) 75 | 76 | 77 | class MLP(object): 78 | def __init__(self, 79 | out_size: int, 80 | hidden_layers: Union[List[int], int] = 1, 81 | use_biases: bool = False, 82 | activation_fun: Optional[Callable[[tf.Tensor], tf.Tensor]] = tf.nn.relu, 83 | dropout_rate: Union[float, tf.Tensor] = 0.0, 84 | name: Optional[str] = "MLP", 85 | ): 86 | """ 87 | Create a new MLP with the given number of hidden layers. 88 | 89 | Arguments: 90 | out_size: Dimensionality of output. 91 | hidden_layers: Either an integer determining the number of hidden layers, which will have out_size units each; 92 | or a list of integers whose length determines the number of hidden layers and whose contents give the 93 | number of units in each layer. 94 | use_biases: Flag indicating use of bias in fully connected layers. 95 | activation_fun: Activation function applied between hidden layers (NB: the output of the MLP 96 | is always the direct result of a linear transformation) 97 | dropout_rate: Dropout applied to inputs of each MLP layer.
98 | """ 99 | if isinstance(hidden_layers, int): 100 | hidden_layer_sizes = [out_size] * hidden_layers 101 | else: 102 | hidden_layer_sizes = hidden_layers 103 | 104 | if len(hidden_layer_sizes) > 1: 105 | assert activation_fun is not None, "Multiple linear layers without an activation" 106 | 107 | self.__dropout_rate = dropout_rate 108 | self.__name = name 109 | with tf.variable_scope(self.__name): 110 | self.__layers = [] # type: List[tf.layers.Dense] 111 | for hidden_layer_size in hidden_layer_sizes: 112 | self.__layers.append(tf.layers.Dense(units=hidden_layer_size, 113 | use_bias=use_biases, 114 | activation=activation_fun)) 115 | # Output layer: 116 | self.__layers.append(tf.layers.Dense(units=out_size, 117 | use_bias=use_biases, 118 | activation=None)) 119 | 120 | def __call__(self, input: tf.Tensor) -> tf.Tensor: 121 | with tf.variable_scope(self.__name): 122 | activations = input 123 | for layer in self.__layers[:-1]: 124 | activations = tf.nn.dropout(activations, rate=self.__dropout_rate) 125 | activations = layer(activations) 126 | return self.__layers[-1](activations) 127 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/varmisuse_data_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | varmisuse_data_splitter.py [options] RAW_DATA_DIR OUT_DIR 5 | 6 | Reads in datapoints from a set of files and creates smaller files mixing these, in a format 7 | suitable for streaming them into the training process. 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --chunk-size NUM Number of samples per output file. [default: 100] 12 | --num-workers NUM Number of worker processes. Defaults to number of CPU cores. 13 | --window-size NUM Number of samples to load before mixing and writing things out. [default: 5000] 14 | --azure-info= Azure authentication information file (JSON). [default: azure_auth.json] 15 | --debug Turn on debugger. 
16 | """ 17 | from typing import List, Any 18 | 19 | import numpy as np 20 | from more_itertools import chunked 21 | from docopt import docopt 22 | from dpu_utils.utils import run_and_debug, RichPath 23 | from multiprocessing import Process, Queue, cpu_count 24 | 25 | 26 | def _data_loading_worker(file_queue: Queue, result_queue: Queue) -> None: 27 | while True: 28 | next_path = file_queue.get() 29 | if next_path is None: # Our signal that all files have been processed 30 | file_queue.put(None) # Signal to the other workers 31 | result_queue.put(None) # Signal to the controller that we are done 32 | break 33 | 34 | # Read the file and push examples out as soon as we get them: 35 | for raw_sample in next_path.read_by_file_suffix(): 36 | result_queue.put(raw_sample) 37 | 38 | 39 | def _write_data(out_dir: RichPath, window_idx: int, chunk_size: int, data_window: List[Any]): 40 | np.random.shuffle(data_window) 41 | for chunk_idx, data_chunk in enumerate(chunked(data_window, chunk_size)): 42 | out_file = out_dir.join('chunk_%i-%i.jsonl.gz' % (window_idx, chunk_idx)) 43 | out_file.save_as_compressed_file(data_chunk) 44 | 45 | 46 | def run(args): 47 | azure_info_path = args.get('--azure-info', None) 48 | in_dir = RichPath.create(args['RAW_DATA_DIR'], azure_info_path) 49 | out_dir = RichPath.create(args['OUT_DIR'], azure_info_path) 50 | out_dir.make_as_dir() 51 | 52 | num_workers = int(args.get('--num-workers') or cpu_count()) 53 | chunk_size = int(args['--chunk-size']) 54 | window_size = int(args['--window-size']) 55 | 56 | files_to_load = list(in_dir.iterate_filtered_files_in_dir("*.gz")) 57 | path_queue = Queue(maxsize=len(files_to_load) + 1) 58 | result_queue = Queue(1000) 59 | 60 | # Set up list of work to do: 61 | for path in files_to_load: 62 | path_queue.put(path) 63 | path_queue.put(None) # Signal for the end of the queue 64 | 65 | # Set up workers: 66 | workers = [] 67 | for _ in range(num_workers): 68 | workers.append(Process(target=_data_loading_worker, 69 | args=(path_queue, result_queue,))) 70 | workers[-1].start() 71 | 72 | # Consume the data: 73 | num_workers_terminated = 0 74 | data_window = [] 75 | window_idx = 0 76 | while num_workers_terminated < len(workers): 77 | parsed_sample = result_queue.get() 78 | if parsed_sample is None: 79 | num_workers_terminated += 1 # Worker signaled that it's done 80 | else: 81 | data_window.append(parsed_sample) 82 | if len(data_window) >= window_size: 83 | _write_data(out_dir, window_idx, chunk_size, data_window) 84 | data_window = [] 85 | window_idx += 1 86 | 87 | # Write out the remainder of the data: 88 | _write_data(out_dir, window_idx, chunk_size, data_window) 89 | 90 | # Clean up the workers: 91 | for worker in workers: 92 | worker.join() 93 | 94 | 95 | if __name__ == "__main__": 96 | args = docopt(__doc__) 97 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 98 | --------------------------------------------------------------------------------
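A note on how the default_hypers files above are consumed: train.py layers parameters in a fixed order, starting from the task/model class defaults, then applying the matching tasks/default_hypers/<Task>_<Model>.json file (its "task_params" and "model_params" sections), and finally any --task-param-overrides / --model-param-overrides JSON given on the command line, so later sources win. The sketch below illustrates that merge order for the model parameters in isolation; it is a minimal, hypothetical helper written for this note (merge_model_params and the example class defaults are not part of the repository), and the printed values assume it is run from the tf-gnn-samples directory so that the VarMisuse_RGCN.json shown above (hidden_size 128) is found.

import json
import os


def merge_model_params(class_defaults, default_hypers_path, cli_overrides_json=None):
    # Lowest priority: the model class defaults.
    params = dict(class_defaults)
    # Next: the task/model-specific defaults file, if one exists for this combination.
    if os.path.exists(default_hypers_path):
        with open(default_hypers_path, "rt") as f:
            params.update(json.load(f)["model_params"])
    # Highest priority: JSON overrides from the command line (mirrors --model-param-overrides).
    params.update(json.loads(cli_overrides_json or "{}"))
    return params


# Example: start from illustrative class defaults, apply the VarMisuse RGCN defaults,
# and lower the learning rate via a CLI-style override.
hypers = merge_model_params(
    class_defaults={"hidden_size": 256, "learning_rate": 0.001},
    default_hypers_path="tasks/default_hypers/VarMisuse_RGCN.json",
    cli_overrides_json='{"learning_rate": 1e-4}',
)
print(hypers["hidden_size"], hypers["learning_rate"])  # 128 (from the JSON), 0.0001 (from the override)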