├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── common.py ├── experiment.py ├── gnn-comparison ├── .gitignore ├── Check isomorphisms and noisy labels.ipynb ├── Conda_Info ├── EndToEnd_Evaluation.py ├── LICENSE ├── Launch_Experiments.py ├── PrepareDatasets.py ├── README.md ├── config │ ├── __init__.py │ ├── base.py │ └── utils.py ├── config_BaselineChemical.yml ├── config_BaselineENZYMES.yml ├── config_BaselineIMDB.yml ├── config_BaselineSocial.yml ├── config_DGCNN.yml ├── config_DiffPool.yml ├── config_ECC.yml ├── config_GIN.yml ├── config_GraphSAGE.yml ├── evaluation │ ├── dataset_getter.py │ ├── model_selection │ │ ├── HoldOutSelector.py │ │ └── K_Fold_Selection.py │ └── risk_assessment │ │ ├── HoldOutAssessment.py │ │ └── K_Fold_Assessment.py ├── experiments │ ├── EndToEndExperiment.py │ └── Experiment.py ├── log │ ├── Logger.py │ └── __init__.py ├── models │ ├── __init__.py │ ├── gnn_wrapper │ │ ├── NetWrapper.py │ │ └── __init__.py │ ├── graph_classifiers │ │ ├── DGCNN.py │ │ ├── DeepMultisets.py │ │ ├── DiffPool.py │ │ ├── ECC.py │ │ ├── GIN.py │ │ ├── GraphSAGE.py │ │ ├── MLP_Classifier.py │ │ ├── MolecularFingerprint.py │ │ ├── __init__.py │ │ └── self_attention.py │ ├── modules.py │ ├── schedulers │ │ └── ECCScheduler.py │ └── utils │ │ ├── EarlyStopper.py │ │ └── __init__.py ├── requirements.txt └── utils │ ├── batch_utils.py │ ├── eval_across_folds.py │ └── utils.py ├── images ├── fig3.png └── fig5.png ├── main.py ├── models └── graph_model.py ├── requirements.txt ├── run-gat-2-8.py ├── run-gcn-2-8.py ├── run-ggnn-2-8.py ├── run-gin-2-8.py ├── tasks ├── dictionary_lookup.py └── tree_dataset.py └── tf-gnn-samples ├── .flake8 ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── gnns ├── __init__.py ├── ggnn.py ├── gnn_edge_mlp.py ├── gnn_film.py ├── rgat.py ├── rgcn.py ├── rgdcn.py └── rgin.py ├── models ├── __init__.py ├── ggnn_model.py ├── gnn_edge_mlp_model.py ├── gnn_film_model.py ├── no_struct_mlp_model.py ├── rgat_model.py ├── rgcn_model.py ├── rgdcn_model.py ├── rgin_model.py ├── self_attention.py └── sparse_graph_model.py ├── requirements.txt ├── run_ppi_benchs.py ├── run_qm9_benchs.py ├── run_qm9_benchs_fa.py ├── run_varmisuse_benchs.py ├── run_varmisuse_benchs_fa.py ├── tasks ├── __init__.py ├── citation_network_task.py ├── default_hypers │ ├── PPI_GGNN.json │ ├── PPI_GNN-Edge-MLP0.json │ ├── PPI_GNN-Edge-MLP1.json │ ├── PPI_GNN-FiLM.json │ ├── PPI_RGAT.json │ ├── PPI_RGCN.json │ ├── PPI_RGIN.json │ ├── QM9_GGNN.json │ ├── QM9_GNN-Edge-MLP0.json │ ├── QM9_GNN-Edge-MLP1.json │ ├── QM9_GNN-FiLM.json │ ├── QM9_NoStruct-MLP0.json │ ├── QM9_RGAT.json │ ├── QM9_RGCN.json │ ├── QM9_RGIN.json │ ├── VarMisuse_GGNN.json │ ├── VarMisuse_GNN-Edge-MLP0.json │ ├── VarMisuse_GNN-Edge-MLP1.json │ ├── VarMisuse_GNN-FiLM.json │ ├── VarMisuse_NoStruct-MLP1.json │ ├── VarMisuse_RGAT.json │ ├── VarMisuse_RGCN.json │ └── VarMisuse_RGIN.json ├── ppi_task.py ├── qm9_task.py ├── sparse_graph_task.py └── varmisuse_task.py ├── test.py ├── train.py └── utils ├── __init__.py ├── add_child_ids.py ├── citation_network_utils.py ├── compute_diameters.py ├── model_utils.py ├── prep_baseline.py ├── utils.py └── varmisuse_data_splitter.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 
| lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .idea 131 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | @inproceedings{ 2 | alon2021on, 3 | title={On the Bottleneck of Graph Neural Networks and its Practical Implications}, 4 | author={Uri Alon and Eran Yahav}, 5 | booktitle={International Conference on Learning Representations}, 6 | year={2021}, 7 | url={https://openreview.net/forum?id=i80OPhOCVH2} 8 | } 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 tech-srl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | 3 | from tasks.dictionary_lookup import DictionaryLookupDataset 4 | 5 | from torch import nn 6 | from torch_geometric.nn import GCNConv, GatedGraphConv, GINConv, GATConv 7 | 8 | 9 | class Task(Enum): 10 | NEIGHBORS_MATCH = auto() 11 | 12 | @staticmethod 13 | def from_string(s): 14 | try: 15 | return Task[s] 16 | except KeyError: 17 | raise ValueError() 18 | 19 | def get_dataset(self, depth, train_fraction): 20 | if self is Task.NEIGHBORS_MATCH: 21 | dataset = DictionaryLookupDataset(depth) 22 | else: 23 | dataset = None 24 | 25 | return dataset.generate_data(train_fraction) 26 | 27 | 28 | class GNN_TYPE(Enum): 29 | GCN = auto() 30 | GGNN = auto() 31 | GIN = auto() 32 | GAT = auto() 33 | 34 | @staticmethod 35 | def from_string(s): 36 | try: 37 | return GNN_TYPE[s] 38 | except KeyError: 39 | raise ValueError() 40 | 41 | def get_layer(self, in_dim, out_dim): 42 | if self is GNN_TYPE.GCN: 43 | return GCNConv( 44 | in_channels=in_dim, 45 | out_channels=out_dim) 46 | elif self is GNN_TYPE.GGNN: 47 | return GatedGraphConv(out_channels=out_dim, num_layers=1) 48 | elif self is GNN_TYPE.GIN: 49 | return GINConv(nn.Sequential(nn.Linear(in_dim, out_dim), nn.BatchNorm1d(out_dim), nn.ReLU(), 50 | nn.Linear(out_dim, out_dim), nn.BatchNorm1d(out_dim), nn.ReLU())) 51 | elif self is GNN_TYPE.GAT: 52 | # 4-heads, although the paper by Velickovic et al. had used 6-8 heads. 53 | # The output will be the concatenation of the heads, yielding a vector of size out_dim 54 | num_heads = 4 55 | return GATConv(in_dim, out_dim // num_heads, heads=num_heads) 56 | 57 | 58 | class STOP(Enum): 59 | TRAIN = auto() 60 | TEST = auto() 61 | 62 | @staticmethod 63 | def from_string(s): 64 | try: 65 | return STOP[s] 66 | except KeyError: 67 | raise ValueError() 68 | 69 | 70 | def one_hot(key, depth): 71 | return [1 if i == key else 0 for i in range(depth)] 72 | -------------------------------------------------------------------------------- /gnn-comparison/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # pylint 104 | .pylintrc 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # vscode 110 | .vscode 111 | 112 | # PROJECT 113 | *DATA*/** 114 | Old_Baselines/** 115 | *RESULTS*/** 116 | 117 | 118 | .idea/inspectionProfiles/Project_Default.xml 119 | *.xml 120 | models/.DS_Store 121 | .DS_Store 122 | -------------------------------------------------------------------------------- /gnn-comparison/Check isomorphisms and noisy labels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import itertools\n", 10 | "import networkx as nx\n", 11 | "from torch_geometric.datasets import TUDataset\n", 12 | "from torch_geometric.utils import to_networkx\n", 13 | "\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "def convert(data):\n", 24 | " G = to_networkx(data)\n", 25 | " G.graph['label'] = data.y.item()\n", 26 | " return nx.to_undirected(G)\n", 27 | "\n", 28 | "\n", 29 | "def dataset_to_graphs(dataset):\n", 30 | " graphs = []\n", 31 | " for data in dataset:\n", 32 | " graphs.append(convert(data))\n", 33 | " return graphs\n", 34 | " \n", 35 | "\n", 36 | "def check(graphs):\n", 37 | " num_iso_pairs = 0\n", 38 | " num_inconsistent_labels = 0\n", 39 | " num_graphs = len(graphs)\n", 40 | " combinations = itertools.combinations(range(num_graphs), 2)\n", 41 | " \n", 42 | " for (i1, i2) in combinations:\n", 43 | " G1, G2 = graphs[i1], graphs[i2]\n", 44 | " label1, label2 = G1.graph['label'], G2.graph['label']\n", 45 | " \n", 46 | " if nx.is_isomorphic(G1, G2):\n", 47 | " num_iso_pairs += 1\n", 48 | " if label1 != label2:\n", 49 | " num_inconsistent_labels += 1\n", 50 | " \n", 51 | " print(f\"number of isomorphic pairs: {num_iso_pairs}\") \n", 52 | " print(f\"number of isomorphic pairs with inconsistent labels: {num_inconsistent_labels}\")\n", 53 | " print(f\"ratio of inconsistently labelled isomorphic pairs vs. 
isomorphic pairs: {num_inconsistent_labels / num_iso_pairs:.4f}\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "scrolled": false 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "############## Checking dataset IMDB-BINARY ##############\n", 68 | "number of isomorphic pairs: 3356\n", 69 | "number of isomorphic pairs with inconsistent labels: 1119\n", 70 | "ratio of inconsistently labelled isomorphic pairs vs. isomorphic pairs: 0.3334\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "dataset_name = \"IMDB-BINARY\"\n", 76 | "\n", 77 | "print(f\"############## Checking dataset {dataset_name} ##############\")\n", 78 | "dataset = TUDataset(f'tmp/{dataset_name}', dataset_name)\n", 79 | "graphs = dataset_to_graphs(dataset)\n", 80 | "\n", 81 | "# WARNING: this might take several minutes depending on your hardware\n", 82 | "check(graphs)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "scrolled": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "def bfs_seq(G, start_id):\n", 94 | " \"\"\" taken from https://github.com/JiaxuanYou/graph-generation/blob/master/data.py \"\"\"\n", 95 | " dictionary = dict(nx.bfs_successors(G, start_id))\n", 96 | " start = [start_id]\n", 97 | " output = [start_id]\n", 98 | " while len(start) > 0:\n", 99 | " next = []\n", 100 | " while len(start) > 0:\n", 101 | " current = start.pop(0)\n", 102 | " neighbor = dictionary.get(current)\n", 103 | " if neighbor is not None:\n", 104 | " next = next + neighbor\n", 105 | " output = output + next\n", 106 | " start = next\n", 107 | " return output\n", 108 | "\n", 109 | "# 10 and 710 have different labels, but are isomorphic\n", 110 | "G1, G2 = graphs[10], graphs[710]\n", 111 | "\n", 112 | "# reorder nodes\n", 113 | "seq1, seq2 = bfs_seq(G1, 0), bfs_seq(G2, 0)\n", 114 | "G2 = nx.relabel_nodes(G2, {n:m for n, m in zip(seq2, seq1)})\n", 115 | "print(f\"G1 label: {G1.graph['label']} - G2 label: {G2.graph['label']}\")\n", 116 | "\n", 117 | "fig, axs = plt.subplots(1, 2)\n", 118 | "pos = nx.random_layout(G1, seed=42)\n", 119 | "nx.draw_networkx(G1, pos=pos, ax=axs.flat[0])\n", 120 | "nx.draw_networkx(G2, pos=pos, ax=axs.flat[1])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.7.4" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /gnn-comparison/Conda_Info: -------------------------------------------------------------------------------- 1 | Install gcc_linux-64 and gxx_linux-64 (or similar names) with conda 2 | Use export CUDA_VISIBLE_DEVICES="" if Graph_Comparison_Models crashes when launched in parallel 3 | Use export OMP_NUM_THREADS=1 in case of issues with MultiProcessing 4 | 5 | -------------------------------------------------------------------------------- /gnn-comparison/EndToEnd_Evaluation.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | from config.base import Grid, Config 5 | 6 | from evaluation.model_selection.HoldOutSelector import HoldOutSelector 7 | from evaluation.risk_assessment.K_Fold_Assessment import KFoldAssessment 8 | from experiments.EndToEndExperiment import EndToEndExperiment 9 | 10 | 11 | def main(config_file, dataset_name, 12 | outer_k, outer_processes, inner_k, inner_processes, result_folder, debug=False): 13 | 14 | # Needed to avoid thread spawning, conflicts with multi-processing. You may set a number > 1 but take into account 15 | # the number of processes on the machine 16 | torch.set_num_threads(1) 17 | 18 | experiment_class = EndToEndExperiment 19 | 20 | model_configurations = Grid(config_file, dataset_name) 21 | model_configuration = Config(**model_configurations[0]) 22 | 23 | exp_path = os.path.join(result_folder, f'{model_configuration.exp_name}_assessment') 24 | 25 | model_selector = HoldOutSelector(max_processes=inner_processes) 26 | risk_assesser = KFoldAssessment(outer_k, model_selector, exp_path, model_configurations, 27 | outer_processes=outer_processes) 28 | 29 | risk_assesser.risk_assessment(experiment_class, debug=debug) 30 | -------------------------------------------------------------------------------- /gnn-comparison/Launch_Experiments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from EndToEnd_Evaluation import main as endtoend 3 | 4 | 5 | def get_args(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--config-file', dest='config_file') 8 | parser.add_argument('--experiment', dest='experiment', default='endtoend') 9 | parser.add_argument('--result-folder', dest='result_folder', default='RESULTS') 10 | parser.add_argument('--dataset-name', dest='dataset_name', default='none') 11 | parser.add_argument('--outer-folds', dest='outer_folds', default=10) 12 | parser.add_argument('--outer-processes', dest='outer_processes', default=2) 13 | parser.add_argument('--inner-folds', dest='inner_folds', default=5) 14 | parser.add_argument('--inner-processes', dest='inner_processes', default=1) 15 | parser.add_argument('--debug', action="store_true", dest='debug') 16 | return parser.parse_args() 17 | 18 | 19 | if __name__ == "__main__": 20 | args = get_args() 21 | 22 | if args.dataset_name != 'none': 23 | datasets = [args.dataset_name] 24 | else: 25 | datasets = ['IMDB-MULTI', 'IMDB-BINARY', 'PROTEINS', 'NCI1', 'ENZYMES', 'DD', 26 | 'REDDIT-BINARY', 'REDDIT-MULTI-5K', 'COLLAB', 'REDDIT-MULTI-12K'] 27 | 28 | config_file = args.config_file 29 | experiment = args.experiment 30 | 31 | for dataset_name in datasets: 32 | try: 33 | endtoend(config_file, dataset_name, 34 | outer_k=int(args.outer_folds), outer_processes=int(args.outer_processes), 35 | inner_k=int(args.inner_folds), inner_processes=int(args.inner_processes), 36 | result_folder=args.result_folder, debug=args.debug) 37 | 38 | except Exception as e: 39 | raise e # print(e) -------------------------------------------------------------------------------- /gnn-comparison/PrepareDatasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from datasets import * 4 | 5 | 6 | DATASETS = { 7 | 'REDDIT-BINARY': RedditBinary, 8 | 'REDDIT-MULTI-5K': Reddit5K, 9 | 'COLLAB': Collab, 10 | 'IMDB-BINARY': IMDBBinary, 11 | 'IMDB-MULTI': IMDBMulti, 12 | 'NCI1': NCI1, 13 | 'ENZYMES': Enzymes, 14 | 
'PROTEINS': Proteins, 15 | 'DD': DD 16 | } 17 | 18 | 19 | def get_args_dict(): 20 | parser = argparse.ArgumentParser() 21 | 22 | parser.add_argument('DATA_DIR', 23 | help='where to save the datasets') 24 | parser.add_argument('--dataset-name', dest='dataset_name', 25 | choices=DATASETS.keys(), default='all', help='dataset name [Default: \'all\']') 26 | parser.add_argument('--outer-k', dest='outer_k', type=int, 27 | default=10, help='evaluation folds [Default: 10]') 28 | parser.add_argument('--inner-k', dest='inner_k', type=int, 29 | default=None, help='model selection folds [Default: None]') 30 | parser.add_argument('--use-one', action='store_true', 31 | default=False, help='use 1 as feature') 32 | parser.add_argument('--use-degree', dest='use_node_degree', action='store_true', 33 | default=False, help='use degree as feature') 34 | parser.add_argument('--no-kron', dest='precompute_kron_indices', action='store_false', 35 | default=True, help='don\'t precompute kron reductions') 36 | 37 | return vars(parser.parse_args()) 38 | 39 | 40 | def preprocess_dataset(name, args_dict): 41 | dataset_class = DATASETS[name] 42 | if name == 'ENZYMES': 43 | args_dict.update(use_node_attrs=True) 44 | dataset_class(**args_dict) 45 | 46 | 47 | if __name__ == "__main__": 48 | args_dict = get_args_dict() 49 | 50 | print(args_dict) 51 | 52 | dataset_name = args_dict.pop('dataset_name') 53 | if dataset_name == 'all': 54 | for name in DATASETS: 55 | preprocess_dataset(name, args_dict) 56 | else: 57 | preprocess_dataset(dataset_name, args_dict) -------------------------------------------------------------------------------- /gnn-comparison/README.md: -------------------------------------------------------------------------------- 1 | # A Fair Comparison of Graph Neural Networks for Graph Classification 2 | 3 | ## Summary 4 | 5 | The library includes data and scripts to reproduce the experiments reported in the paper. 6 | 7 | This research software is provided as is. If you happen to use or modify this code, please remember to cite the paper: 8 | 9 | [*Federico Errica and Marco Podda, Davide Bacciu, Alessio Micheli: A Fair Comparison of Graph Neural Networks for Graph Classification. Proceedings of the 8th International Conference on Learning Representations (ICLR 2020).*](https://openreview.net/pdf?id=HygDF6NFPB) 10 | 11 | ### Instructions 12 | 13 | To reproduce the experiments, first preprocess datasets as follows: 14 | 15 | `python PrepareDatasets.py DATA/CHEMICAL --dataset-name --outer-k 10` 16 | 17 | `python PrepareDatasets.py DATA/SOCIAL_1 --dataset-name --use-one --outer-k 10` 18 | 19 | `python PrepareDatasets.py DATA/SOCIAL_DEGREE --dataset-name --use-degree --outer-k 10` 20 | 21 | Where `` is the name of the dataset. Then, substitute the split (json) files with the ones provided in the `data_splits` folder. 22 | 23 | Please note that dataset folders should be organized as follows: 24 | 25 | CHEMICAL: 26 | NCI1 27 | DD 28 | ENZYMES 29 | PROTEINS 30 | SOCIAL[_1 | _DEGREE]: 31 | IMDB-BINARY 32 | IMDB-MULTI 33 | REDDIT-BINARY 34 | REDDIT-MULTI-5K 35 | COLLAB 36 | 37 | Then, you can launch experiments by typing: 38 | 39 | `cp -r DATA/[CHEMICAL|SOCIAL_1|SOCIAL_DEGREE]/ DATA` 40 | `python Launch_Experiments.py --config-file --dataset-name --result-folder --debug` 41 | 42 | Where `` is your config file (e.g. config_BaselineChemical.yml), and `` is the dataset name chosen as before. 
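For example, an end-to-end run on the PROTEINS dataset with the GIN configuration could look like this (the dataset and config file chosen here are only illustrative; any of the supported dataset names and provided config_*.yml files can be substituted):

`python PrepareDatasets.py DATA/CHEMICAL --dataset-name PROTEINS --outer-k 10`

`cp -r DATA/CHEMICAL/ DATA`

`python Launch_Experiments.py --config-file config_GIN.yml --dataset-name PROTEINS --result-folder RESULTS`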
43 | 44 | ### Troubleshooting 45 | 46 | The installation of Pytorch Geometric depends on other libraries (torch_scatter, torch_cluster, torch_sparse) that have to be installed separately and before torch_geometric. Do not use pip install -r requirements.txt because it will not work. Please refer to the [official instructions](https://github.com/rusty1s/pytorch_geometric) to install the required libraries. 47 | 48 | -------------------------------------------------------------------------------- /gnn-comparison/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/config/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/config/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import json 3 | import yaml 4 | import pickle 5 | 6 | 7 | def read_config_file(dict_or_filelike): 8 | if isinstance(dict_or_filelike, dict): 9 | return dict_or_filelike 10 | 11 | path = Path(dict_or_filelike) 12 | if path.suffix == ".json": 13 | return json.load(open(path, "r")) 14 | elif path.suffix in [".yaml", ".yml"]: 15 | return yaml.load(open(path, "r"), Loader=yaml.FullLoader) 16 | elif path.suffix in [".pkl", ".pickle"]: 17 | return pickle.load(open(path, "rb")) 18 | 19 | raise ValueError("Only JSON, YaML and pickle files supported.") 20 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineChemical.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - MolecularFingerprint 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | - 128 8 | hidden_units: 9 | - 32 10 | - 128 11 | - 256 12 | learning_rate: 13 | - 0.000001 14 | - 0.001 15 | - 0.1 16 | l2: 17 | - 0.0001 18 | - 0.001 19 | - 0.01 20 | classifier_epochs: 21 | - 5000 22 | optimizer: 23 | - Adam 24 | scheduler: 25 | - null 26 | loss: 27 | - MulticlassClassificationLoss 28 | gradient_clipping: 29 | - null 30 | early_stopper: 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: False 36 | - 37 | class: Patience 38 | args: 39 | patience: 500 40 | use_loss: True 41 | shuffle: 42 | - True 43 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineENZYMES.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DeepMultisets 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | hidden_units: 8 | - 32 9 | - 64 10 | - 128 11 | - 256 12 | learning_rate: 13 | - 0.00001 14 | - 0.0001 15 | - 0.001 16 | l2: 17 | - 0.01 18 | - 0.0001 19 | - 0.00001 20 | classifier_epochs: 21 | - 5000 22 | optimizer: 23 | - Adam 24 | scheduler: 25 | - null 26 | loss: 27 | - MulticlassClassificationLoss 28 | gradient_clipping: 29 | - null 30 | early_stopper: 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: False 36 | - 37 | class: Patience 38 | args: 39 | patience: 500 40 | use_loss: True 41 | shuffle: 42 | - True 43 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineIMDB.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DeepMultisets 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | - 128 8 | hidden_units: 9 | - 32 10 | - 128 11 | - 256 12 | 
learning_rate: 13 | - 0.000001 14 | - 0.001 15 | - 0.1 16 | l2: 17 | - 0.0001 18 | - 0.001 19 | - 0.01 20 | classifier_epochs: 21 | - 3000 22 | optimizer: 23 | - Adam 24 | scheduler: 25 | - null 26 | loss: 27 | - MulticlassClassificationLoss 28 | gradient_clipping: 29 | - null 30 | early_stopper: 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: False 36 | - 37 | class: Patience 38 | args: 39 | patience: 500 40 | use_loss: True 41 | shuffle: 42 | - True 43 | -------------------------------------------------------------------------------- /gnn-comparison/config_BaselineSocial.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DeepMultisets 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | hidden_units: 8 | - 32 9 | - 128 10 | learning_rate: 11 | - 0.001 12 | - 0.1 13 | l2: 14 | - 0.0001 15 | - 0.001 16 | - 0.01 17 | classifier_epochs: 18 | - 3000 19 | optimizer: 20 | - Adam 21 | scheduler: 22 | - null 23 | loss: 24 | - MulticlassClassificationLoss 25 | gradient_clipping: 26 | - null 27 | early_stopper: 28 | - 29 | class: Patience 30 | args: 31 | patience: 500 32 | use_loss: False 33 | - 34 | class: Patience 35 | args: 36 | patience: 500 37 | use_loss: True 38 | shuffle: 39 | - True 40 | -------------------------------------------------------------------------------- /gnn-comparison/config_DGCNN.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DGCNN 3 | device: 4 | - cpu 5 | batch_size: 6 | - 50 7 | last_layer_fa: 8 | - false 9 | dense_dim: 10 | - 128 11 | k: 12 | - 0.9 13 | - 0.6 14 | learning_rate: 15 | - 0.0001 16 | - 0.00001 17 | classifier_epochs: 18 | - 1000 19 | optimizer: 20 | - Adam 21 | scheduler: 22 | - null 23 | loss: 24 | - MulticlassClassificationLoss 25 | gradient_clipping: 26 | - null 27 | early_stopper: 28 | - 29 | class: Patience 30 | args: 31 | patience: 500 32 | use_loss: False 33 | - 34 | class: Patience 35 | args: 36 | patience: 500 37 | use_loss: True 38 | shuffle: 39 | - True 40 | l2: 41 | - 0. 42 | embedding_dim: 43 | - 32 44 | - 64 45 | num_layers: 46 | - 2 47 | - 3 48 | - 4 49 | -------------------------------------------------------------------------------- /gnn-comparison/config_DiffPool.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - DiffPool 3 | device: 4 | - cpu 5 | batch_size: 6 | - 8 7 | last_layer_fa: 8 | - false 9 | learning_rate: 10 | - 0.001 11 | - 0.0001 12 | - 0.00001 13 | classifier_epochs: 14 | - 3000 15 | optimizer: 16 | - Adam 17 | scheduler: 18 | - null 19 | loss: 20 | - DiffPoolMulticlassClassificationLoss 21 | l2: 22 | - 0. 
23 | gradient_clipping: 24 | - 2.0 25 | early_stopper: 26 | - 27 | class: Patience 28 | args: 29 | patience: 500 30 | use_loss: False 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: True 36 | shuffle: 37 | - True 38 | num_layers: 39 | # - 1 40 | - 2 41 | gnn_dim_hidden: 42 | # - 32 43 | - 64 44 | dim_embedding: 45 | # - 64 46 | - 128 47 | dim_embedding_MLP: 48 | - 50 49 | 50 | 51 | -------------------------------------------------------------------------------- /gnn-comparison/config_ECC.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - ECC 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | last_layer_fa: 8 | - false 9 | learning_rate: 10 | - 0.1 11 | - 0.01 12 | classifier_epochs: 13 | - 1000 14 | optimizer: 15 | - SGD 16 | scheduler: 17 | - 18 | class: ECCLR 19 | args: 20 | gamma: 0.1 21 | step_size: 10 22 | loss: 23 | - MulticlassClassificationLoss 24 | gradient_clipping: 25 | - null 26 | early_stopper: 27 | - 28 | class: Patience 29 | args: 30 | patience: 500 31 | use_loss: False 32 | - 33 | class: Patience 34 | args: 35 | patience: 500 36 | use_loss: True 37 | shuffle: 38 | - True 39 | l2: 40 | - 0. 41 | dropout: 42 | - 0.05 43 | - 0.25 44 | dropout_final: 45 | - 0.1 46 | num_layers: 47 | # - 1 48 | - 2 49 | dim_embedding: 50 | - 32 51 | - 64 52 | -------------------------------------------------------------------------------- /gnn-comparison/config_GIN.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - GIN 3 | device: 4 | - cpu 5 | last_layer_fa: 6 | - false 7 | batch_size: 8 | - 32 9 | - 128 10 | learning_rate: 11 | - 0.01 12 | classifier_epochs: 13 | - 1000 14 | hidden_units: # Note: GIN add a first layer that simply adds up all node features 15 | - [64, 64, 64, 64] 16 | - [32, 32, 32, 32] 17 | - [64] 18 | - [32, 32] 19 | optimizer: 20 | - Adam 21 | scheduler: 22 | - 23 | class: StepLR 24 | args: 25 | step_size: 50 26 | gamma: 0.5 27 | loss: 28 | - MulticlassClassificationLoss 29 | train_eps: 30 | - true 31 | - false 32 | l2: 33 | - 0. 34 | aggregation: 35 | - mean 36 | - sum 37 | gradient_clipping: 38 | - null 39 | dropout: 40 | - 0.5 41 | - 0. 42 | early_stopper: 43 | - 44 | class: Patience 45 | args: 46 | patience: 500 47 | use_loss: False 48 | - 49 | class: Patience 50 | args: 51 | patience: 500 52 | use_loss: True 53 | shuffle: 54 | - True 55 | resume: 56 | - False 57 | -------------------------------------------------------------------------------- /gnn-comparison/config_GraphSAGE.yml: -------------------------------------------------------------------------------- 1 | model: 2 | - GraphSAGE 3 | device: 4 | - cpu 5 | batch_size: 6 | - 32 7 | last_layer_fa: 8 | - false 9 | learning_rate: 10 | - 0.0001 11 | - 0.01 12 | - 0.001 13 | l2: 14 | - 0. 
15 | classifier_epochs: 16 | - 1000 17 | optimizer: 18 | - Adam 19 | scheduler: 20 | - null 21 | loss: 22 | - MulticlassClassificationLoss 23 | gradient_clipping: 24 | - null 25 | early_stopper: 26 | - 27 | class: Patience 28 | args: 29 | patience: 500 30 | use_loss: False 31 | - 32 | class: Patience 33 | args: 34 | patience: 500 35 | use_loss: True 36 | shuffle: 37 | - True 38 | dim_embedding: 39 | - 32 40 | - 64 41 | num_layers: 42 | - 3 43 | - 5 44 | aggregation: 45 | - add 46 | - max 47 | - mean 48 | -------------------------------------------------------------------------------- /gnn-comparison/evaluation/dataset_getter.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class DatasetGetter: 4 | 5 | def __init__(self, outer_k=None, inner_k=None): 6 | self.outer_k = outer_k 7 | self.inner_k = inner_k 8 | 9 | def set_inner_k(self, k): 10 | self.inner_k = k 11 | 12 | def get_train_val(self, dataset, batch_size, shuffle=True): 13 | return dataset.get_model_selection_fold(self.outer_k, self.inner_k, batch_size, shuffle) 14 | 15 | def get_test(self, dataset, batch_size, shuffle=True): 16 | return dataset.get_test_fold(self.outer_k, batch_size, shuffle) -------------------------------------------------------------------------------- /gnn-comparison/evaluation/model_selection/HoldOutSelector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import concurrent.futures 4 | 5 | from log.Logger import Logger 6 | 7 | 8 | class HoldOutSelector: 9 | """ 10 | Class implementing a sufficiently general framework to do model selection 11 | """ 12 | 13 | def __init__(self, max_processes): 14 | self.max_processes = max_processes 15 | 16 | # Create the experiments folder straight away 17 | self._CONFIG_BASE = 'config_' 18 | self._CONFIG_FILENAME = 'config_results.json' 19 | self.WINNER_CONFIG_FILENAME = 'winner_config.json' 20 | 21 | def process_results(self, HOLDOUT_MS_FOLDER, no_configurations): 22 | 23 | best_vl = 0. 24 | 25 | for i in range(1, no_configurations+1): 26 | try: 27 | config_filename = os.path.join(HOLDOUT_MS_FOLDER, self._CONFIG_BASE + str(i), 28 | self._CONFIG_FILENAME) 29 | 30 | with open(config_filename, 'r') as fp: 31 | config_dict = json.load(fp) 32 | 33 | vl = config_dict['VL_score'] 34 | 35 | if best_vl <= vl: 36 | best_i = i 37 | best_vl = vl 38 | best_config = config_dict 39 | 40 | except Exception as e: 41 | print(e) 42 | 43 | print('Model selection winner for experiment', HOLDOUT_MS_FOLDER, 'is config ', best_i, ':') 44 | for k in best_config.keys(): 45 | print('\t', k, ':', best_config[k]) 46 | 47 | return best_config 48 | 49 | def model_selection(self, dataset_getter, experiment_class, exp_path, model_configs, debug=False, other=None): 50 | """ 51 | :param experiment_class: the kind of experiment used 52 | :param debug: 53 | :return: the best performing configuration on average over the k folds. TL;DR RETURNS A MODEL, NOT AN ESTIMATE! 
54 | """ 55 | HOLDOUT_MS_FOLDER = os.path.join(exp_path, 'HOLDOUT_MS') 56 | 57 | if not os.path.exists(HOLDOUT_MS_FOLDER): 58 | os.makedirs(HOLDOUT_MS_FOLDER) 59 | 60 | config_id = 0 61 | 62 | pool = concurrent.futures.ProcessPoolExecutor(max_workers=self.max_processes) 63 | 64 | for config in model_configs: # generate_grid(model_configs): 65 | 66 | # Create a separate folder for each experiment 67 | exp_config_name = os.path.join(HOLDOUT_MS_FOLDER, self._CONFIG_BASE + str(config_id + 1)) 68 | if not os.path.exists(exp_config_name): 69 | os.makedirs(exp_config_name) 70 | 71 | json_config = os.path.join(exp_config_name, self._CONFIG_FILENAME) 72 | if not os.path.exists(json_config): 73 | if not debug: 74 | pool.submit(self._model_selection_helper, dataset_getter, experiment_class, config, 75 | exp_config_name, other) 76 | else: # DEBUG 77 | self._model_selection_helper(dataset_getter, experiment_class, config, exp_config_name, 78 | other) 79 | else: 80 | # Do not recompute experiments for this fold. 81 | print(f"Config {json_config} already present! Shutting down to prevent loss of previous experiments") 82 | continue 83 | 84 | config_id += 1 85 | 86 | pool.shutdown() # wait the batch of configs to terminate 87 | 88 | best_config = self.process_results(HOLDOUT_MS_FOLDER, config_id) 89 | 90 | with open(os.path.join(HOLDOUT_MS_FOLDER, self.WINNER_CONFIG_FILENAME), 'w') as fp: 91 | json.dump(best_config, fp) 92 | 93 | return best_config 94 | 95 | def _model_selection_helper(self, dataset_getter, experiment_class, config, exp_config_name, 96 | other=None): 97 | """ 98 | :param dataset_getter: 99 | :param experiment_class: 100 | :param config: 101 | :param exp_config_name: 102 | :param other: 103 | :return: 104 | """ 105 | 106 | # Create the experiment object which will be responsible for running a specific experiment 107 | experiment = experiment_class(config, exp_config_name) 108 | 109 | # Set up a log file for this experiment (run in a separate process) 110 | logger = Logger(str(os.path.join(experiment.exp_path, 'experiment.log')), mode='a') 111 | logger.log('Configuration: ' + str(experiment.model_config)) 112 | 113 | config_filename = os.path.join(experiment.exp_path, self._CONFIG_FILENAME) 114 | 115 | # ------------- PREPARE DICTIONARY TO STORE RESULTS -------------- # 116 | 117 | selection_dict = { 118 | 'config': experiment.model_config.config_dict, 119 | 'TR_score': 0., 120 | 'VL_score': 0., 121 | } 122 | 123 | dataset_getter.set_inner_k(None) # need to stay this way 124 | 125 | training_score, validation_score = experiment.run_valid(dataset_getter, logger, other) 126 | 127 | selection_dict['TR_score'] = float(training_score) 128 | selection_dict['VL_score'] = float(validation_score) 129 | 130 | logger.log('TR Accuracy: ' + str(training_score) + ' VL Accuracy: ' + str(validation_score)) 131 | 132 | with open(config_filename, 'w') as fp: 133 | json.dump(selection_dict, fp) 134 | -------------------------------------------------------------------------------- /gnn-comparison/evaluation/model_selection/K_Fold_Selection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import numpy as np 5 | import concurrent.futures 6 | from copy import deepcopy 7 | 8 | from log.Logger import Logger 9 | 10 | 11 | class KFoldSelection: 12 | """ 13 | Class implementing a sufficiently general framework to do model selection 14 | """ 15 | 16 | def __init__(self, folds, max_processes): 17 | self.folds = folds 18 | self.max_processes = 
max_processes 19 | 20 | # Create the experiments folder straight away 21 | self._CONFIG_BASE = 'config_' 22 | self._CONFIG_FILENAME = 'config_results.json' 23 | self.WINNER_CONFIG_FILENAME = 'winner_config.json' 24 | 25 | def process_results(self, KFOLD_FOLDER, no_configurations): 26 | 27 | best_avg_vl = 0. 28 | best_std_vl = 100. 29 | 30 | for i in range(1, no_configurations+1): 31 | try: 32 | config_filename = os.path.join(KFOLD_FOLDER, self._CONFIG_BASE + str(i), self._CONFIG_FILENAME) 33 | 34 | with open(config_filename, 'r') as fp: 35 | config_dict = json.load(fp) 36 | 37 | avg_vl = config_dict['avg_VL_score'] 38 | std_vl = config_dict['std_VL_score'] 39 | 40 | if (best_avg_vl < avg_vl) or (best_avg_vl == avg_vl and best_std_vl > std_vl): 41 | best_i = i 42 | best_avg_vl = avg_vl 43 | best_config = config_dict 44 | 45 | except Exception as e: 46 | print(e) 47 | 48 | print('Model selection winner for experiment', KFOLD_FOLDER, 'is config ', best_i, ':') 49 | for k in best_config.keys(): 50 | print('\t', k, ':', best_config[k]) 51 | 52 | return best_config 53 | 54 | def model_selection(self, dataset_getter, experiment_class, exp_path, model_configs, debug=False, other=None): 55 | """ 56 | :param experiment_class: the kind of experiment used 57 | :param debug: 58 | :return: the best performing configuration on average over the k folds. TL;DR RETURNS A MODEL, NOT AN ESTIMATE! 59 | """ 60 | 61 | exp_path = exp_path 62 | KFOLD_FOLDER = os.path.join(exp_path, str(self.folds) + '_FOLD_MS') 63 | 64 | if not os.path.exists(KFOLD_FOLDER): 65 | os.makedirs(KFOLD_FOLDER) 66 | 67 | config_id = 0 68 | 69 | pool = concurrent.futures.ProcessPoolExecutor(max_workers=self.max_processes) 70 | for config in model_configs: 71 | 72 | # I need to make a copy of this dictionary 73 | # It seems it gets shared between processes! 74 | cfg = deepcopy(config) 75 | 76 | # Create a separate folder for each experiment 77 | exp_config_name = os.path.join(KFOLD_FOLDER, self._CONFIG_BASE + str(config_id + 1)) 78 | if not os.path.exists(exp_config_name): 79 | os.makedirs(exp_config_name) 80 | 81 | if not debug: 82 | pool.submit(self._model_selection_helper, dataset_getter, experiment_class, cfg, 83 | exp_config_name, other) 84 | else: # DEBUG 85 | self._model_selection_helper(dataset_getter, experiment_class, cfg, 86 | exp_config_name, other) 87 | 88 | config_id += 1 89 | 90 | pool.shutdown() 91 | 92 | best_config = self.process_results(KFOLD_FOLDER, config_id) 93 | 94 | with open(os.path.join(KFOLD_FOLDER, self.WINNER_CONFIG_FILENAME), 'w') as fp: 95 | json.dump(best_config, fp) 96 | 97 | return best_config 98 | 99 | def _model_selection_helper(self, dataset_getter, experiment_class, config, exp_config_name, 100 | other=None): 101 | 102 | # Set up a log file for this experiment (run in a separate process) 103 | logger = Logger(str(os.path.join(exp_config_name, 'experiment.log')), mode='a') 104 | 105 | logger.log('Configuration: ' + str(config)) 106 | 107 | config_filename = os.path.join(exp_config_name, self._CONFIG_FILENAME) 108 | 109 | # ------------- PREPARE DICTIONARY TO STORE RESULTS -------------- # 110 | 111 | k_fold_dict = { 112 | 'config': config, 113 | 'folds': [{} for _ in range(self.folds)], 114 | 'avg_TR_score': 0., 115 | 'avg_VL_score': 0., 116 | 'std_TR_score': 0., 117 | 'std_VL_score': 0. 
118 | } 119 | 120 | for k in range(self.folds): 121 | 122 | dataset_getter.set_inner_k(k) 123 | 124 | fold_exp_folder = os.path.join(exp_config_name, 'FOLD_' + str(k + 1)) 125 | # Create the experiment object which will be responsible for running a specific experiment 126 | experiment = experiment_class(config, fold_exp_folder) 127 | 128 | training_score, validation_score = experiment.run_valid(dataset_getter, logger, other) 129 | 130 | logger.log(str(k+1) + ' split, TR Accuracy: ' + str(training_score) + 131 | ' VL Accuracy: ' + str(validation_score)) 132 | 133 | k_fold_dict['folds'][k]['TR_score'] = training_score 134 | k_fold_dict['folds'][k]['VL_score'] = validation_score 135 | 136 | tr_scores = np.array([k_fold_dict['folds'][k]['TR_score'] for k in range(self.folds)]) 137 | vl_scores = np.array([k_fold_dict['folds'][k]['VL_score'] for k in range(self.folds)]) 138 | 139 | k_fold_dict['avg_TR_score'] = tr_scores.mean() 140 | k_fold_dict['std_TR_score'] = tr_scores.std() 141 | k_fold_dict['avg_VL_score'] = vl_scores.mean() 142 | k_fold_dict['std_VL_score'] = vl_scores.std() 143 | 144 | logger.log('TR avg is ' + str(k_fold_dict['avg_TR_score']) + ' std is ' + str(k_fold_dict['std_TR_score']) + 145 | ' VL avg is ' + str(k_fold_dict['avg_VL_score']) + ' std is ' + str(k_fold_dict['std_VL_score'])) 146 | 147 | with open(config_filename, 'w') as fp: 148 | json.dump(k_fold_dict, fp) 149 | -------------------------------------------------------------------------------- /gnn-comparison/evaluation/risk_assessment/HoldOutAssessment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | from config.base import Config 5 | from evaluation.dataset_getter import DatasetGetter 6 | from log.Logger import Logger 7 | 8 | 9 | class HoldOutAssessment: 10 | """ 11 | Class implementing a sufficiently general framework to do model ASSESSMENT 12 | """ 13 | 14 | def __init__(self, model_selector, exp_path, model_configs, max_processes=2): 15 | self.max_processes = max_processes 16 | self.model_configs = model_configs # Dictionary with key:list of possible values 17 | self.model_selector = model_selector 18 | 19 | # Create the experiments folder straight away 20 | self.exp_path = exp_path 21 | self._HOLDOUT_FOLDER = os.path.join(exp_path, 'HOLDOUT_ASS') 22 | self._ASSESSMENT_FILENAME = 'assessment_results.json' 23 | 24 | def risk_assessment(self, experiment_class, debug=False, other=None): 25 | """ 26 | :param experiment_class: the kind of experiment used 27 | :param debug: 28 | :return: An average over the outer test folds. RETURNS AN ESTIMATE, NOT A MODEL!!! 29 | """ 30 | if not os.path.exists(self._HOLDOUT_FOLDER): 31 | os.makedirs(self._HOLDOUT_FOLDER) 32 | else: 33 | print("Folder already present! 
Shutting down to prevent loss of previous experiments") 34 | return 35 | 36 | self._risk_assessment_helper(experiment_class, self._HOLDOUT_FOLDER, debug, other) 37 | 38 | def _risk_assessment_helper(self, experiment_class, exp_path, debug=False, other=None): 39 | 40 | dataset_getter = DatasetGetter(None) 41 | 42 | best_config = self.model_selector.model_selection(dataset_getter, experiment_class, exp_path, 43 | self.model_configs, debug, other) 44 | 45 | # Retrain with the best configuration and test 46 | experiment = experiment_class(best_config['config'], exp_path) 47 | 48 | # Set up a log file for this experiment (I am in a forked process) 49 | logger = Logger(str(os.path.join(experiment.exp_path, 'experiment.log')), mode='a') 50 | 51 | dataset_getter.set_inner_k(None) 52 | 53 | training_scores, test_scores = [], [] 54 | 55 | # Mitigate bad random initializations 56 | for i in range(3): 57 | training_score, test_score = experiment.run_test(dataset_getter, logger, other) 58 | print(f'Final training run {i + 1}: {training_score}, {test_score}') 59 | 60 | training_scores.append(training_score) 61 | test_scores.append(test_score) 62 | 63 | training_score = sum(training_scores)/3 64 | test_score = sum(test_scores)/3 65 | 66 | logger.log('TR score: ' + str(training_score) + ' TS score: ' + str(test_score)) 67 | 68 | with open(os.path.join(self._HOLDOUT_FOLDER, self._ASSESSMENT_FILENAME), 'w') as fp: 69 | json.dump({'best_config': best_config, 'HOLDOUT_TR': training_score, 'HOLDOUT_TS': test_score}, fp) 70 | -------------------------------------------------------------------------------- /gnn-comparison/experiments/EndToEndExperiment.py: -------------------------------------------------------------------------------- 1 | from models.gnn_wrapper.NetWrapper import NetWrapper 2 | 3 | from experiments.Experiment import Experiment 4 | 5 | 6 | class EndToEndExperiment(Experiment): 7 | 8 | def __init__(self, model_configuration, exp_path): 9 | super(EndToEndExperiment, self).__init__(model_configuration, exp_path) 10 | 11 | def run_valid(self, dataset_getter, logger, other=None): 12 | """ 13 | This function returns the training and validation or test accuracy 14 | :return: (training accuracy, validation/test accuracy) 15 | """ 16 | 17 | # print(self.model_config, dataset_getter.outer_k, dataset_getter.inner_k) 18 | 19 | dataset_class = self.model_config.dataset # dataset_class() 20 | 21 | if 'dense' in self.model_config: 22 | dataset = dataset_class(dense=self.model_config.dense) 23 | else: 24 | dataset = dataset_class() 25 | 26 | model_class = self.model_config.model 27 | loss_class = self.model_config.loss 28 | optim_class = self.model_config.optimizer 29 | sched_class = self.model_config.scheduler 30 | stopper_class = self.model_config.early_stopper 31 | clipping = self.model_config.gradient_clipping 32 | 33 | shuffle = self.model_config['shuffle'] if 'shuffle' in self.model_config else True 34 | 35 | train_loader, val_loader = dataset_getter.get_train_val(dataset, self.model_config['batch_size'], 36 | shuffle=shuffle) 37 | 38 | model = model_class(dim_features=dataset.dim_features, dim_target=dataset.dim_target, config=self.model_config) 39 | net = NetWrapper(model, loss_function=loss_class(), device=self.model_config['device']) 40 | 41 | optimizer = optim_class(model.parameters(), 42 | lr=self.model_config['learning_rate'], weight_decay=self.model_config['l2']) 43 | 44 | if sched_class is not None: 45 | scheduler = sched_class(optimizer) 46 | else: 47 | scheduler = None 48 | 49 | train_loss, 
train_acc, val_loss, val_acc, _, _, _ = net.train(train_loader=train_loader, 50 | max_epochs=self.model_config['classifier_epochs'], 51 | optimizer=optimizer, scheduler=scheduler, 52 | clipping=clipping, 53 | validation_loader=val_loader, 54 | early_stopping=stopper_class, 55 | logger=logger) 56 | return train_acc, val_acc 57 | 58 | def run_test(self, dataset_getter, logger, other=None): 59 | """ 60 | This function returns the training and test accuracy. DO NOT USE THE TEST FOR TRAINING OR EARLY STOPPING! 61 | :return: (training accuracy, test accuracy) 62 | """ 63 | 64 | dataset_class = self.model_config.dataset # dataset_class() 65 | 66 | if 'dense' in self.model_config: 67 | dataset = dataset_class(dense=self.model_config.dense) 68 | else: 69 | dataset = dataset_class() 70 | 71 | shuffle = self.model_config['shuffle'] if 'shuffle' in self.model_config else True 72 | 73 | model_class = self.model_config.model 74 | loss_class = self.model_config.loss 75 | optim_class = self.model_config.optimizer 76 | sched_class = self.model_config.scheduler 77 | stopper_class = self.model_config.early_stopper 78 | clipping = self.model_config.gradient_clipping 79 | 80 | train_loader, val_loader = dataset_getter.get_train_val(dataset, self.model_config['batch_size'], 81 | shuffle=shuffle) 82 | test_loader = dataset_getter.get_test(dataset, self.model_config['batch_size'], shuffle=shuffle) 83 | 84 | model = model_class(dim_features=dataset.dim_features, dim_target=dataset.dim_target, 85 | config=self.model_config) 86 | net = NetWrapper(model, loss_function=loss_class(), device=self.model_config['device']) 87 | 88 | optimizer = optim_class(model.parameters(), 89 | lr=self.model_config['learning_rate'], weight_decay=self.model_config['l2']) 90 | 91 | if sched_class is not None: 92 | scheduler = sched_class(optimizer) 93 | else: 94 | scheduler = None 95 | 96 | train_loss, train_acc, val_loss, val_acc, test_loss, test_acc, _ = \ 97 | net.train(train_loader=train_loader, max_epochs=self.model_config['classifier_epochs'], 98 | optimizer=optimizer, scheduler=scheduler, clipping=clipping, 99 | validation_loader=val_loader, test_loader=test_loader, early_stopping=stopper_class, 100 | logger=logger) 101 | 102 | return train_acc, test_acc 103 | -------------------------------------------------------------------------------- /gnn-comparison/experiments/Experiment.py: -------------------------------------------------------------------------------- 1 | import random 2 | from config.base import Config 3 | 4 | 5 | class Experiment: 6 | """ 7 | Experiment provides a layer of abstraction to avoid that all models implement the same interface 8 | """ 9 | 10 | def __init__(self, model_configuration, exp_path): 11 | self.model_config = Config.from_dict(model_configuration) 12 | self.exp_path = exp_path 13 | 14 | def run_valid(self, get_train_val, logger, other=None): 15 | """ 16 | This function returns the training and validation accuracy. 
DO WHATEVER YOU WANT WITH VL SET, 17 | BECAUSE YOU WILL MAKE PERFORMANCE ASSESSMENT ON A TEST SET 18 | :return: (training accuracy, validation accuracy) 19 | """ 20 | raise NotImplementedError('You must implement this function!') 21 | 22 | def run_test(self, get_train_val, get_test, logger, other=None): 23 | """ 24 | This function returns the training and test accuracy 25 | :return: (training accuracy, test accuracy) 26 | """ 27 | raise NotImplementedError('You must implement this function!') 28 | 29 | 30 | class ToyExperiment(Experiment): 31 | 32 | def __init__(self, model_configuration, exp_path): 33 | super(ToyExperiment, self).__init__(model_configuration, exp_path) 34 | 35 | def run_valid(self, get_train_val, logger, other=None): 36 | """ 37 | This function returns the training and validation or test accuracy 38 | :return: (training accuracy, validation/test accuracy) 39 | """ 40 | return random.uniform(0, 100), random.uniform(0, 100) 41 | 42 | def run_test(self, get_train_val, logger, get_test, other=None): 43 | """ 44 | This function returns the training and test accuracy. DO NOT USE THE TEST FOR ANY REASON 45 | :return: (training accuracy, test accuracy) 46 | """ 47 | return random.uniform(0, 100), random.uniform(0, 100) 48 | -------------------------------------------------------------------------------- /gnn-comparison/log/Logger.py: -------------------------------------------------------------------------------- 1 | class Logger: 2 | def __init__(self, filepath, mode, lock=None): 3 | """ 4 | Implements write routine 5 | :param filepath: the file where to write 6 | :param mode: can be 'w' or 'a' 7 | :param lock: pass a shared lock for multi process write access 8 | """ 9 | self.filepath = filepath 10 | if mode not in ['w', 'a']: 11 | assert False, 'Mode must be one of w, r or a' 12 | else: 13 | self.mode = mode 14 | self.lock = lock 15 | 16 | def log(self, str): 17 | if self.lock: 18 | self.lock.acquire() 19 | 20 | try: 21 | with open(self.filepath, self.mode) as f: 22 | f.write(str + '\n') 23 | except Exception as e: 24 | print(e) 25 | 26 | if self.lock: 27 | self.lock.release() 28 | 29 | 30 | -------------------------------------------------------------------------------- /gnn-comparison/log/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/log/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/gnn_wrapper/NetWrapper.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import timedelta 3 | import torch 4 | from torch import optim 5 | 6 | 7 | def format_time(avg_time): 8 | avg_time = timedelta(seconds=avg_time) 9 | total_seconds = int(avg_time.total_seconds()) 10 | hours, remainder = divmod(total_seconds, 3600) 11 | minutes, seconds = divmod(remainder, 60) 12 | return f"{hours:02d}:{minutes:02d}:{int(seconds):02d}.{str(avg_time.microseconds)[:3]}" 13 | 14 | 15 | class NetWrapper: 16 | 17 | def __init__(self, model, loss_function, device='cpu', classification=True): 18 | 
self.model = model 19 | self.loss_fun = loss_function 20 | self.device = torch.device(device) 21 | self.classification = classification 22 | 23 | def _train(self, train_loader, optimizer, clipping=None): 24 | model = self.model.to(self.device) 25 | 26 | model.train() 27 | 28 | loss_all = 0 29 | acc_all = 0 30 | for data in train_loader: 31 | 32 | data = data.to(self.device) 33 | optimizer.zero_grad() 34 | output = model(data) 35 | 36 | if not isinstance(output, tuple): 37 | output = (output,) 38 | 39 | if self.classification: 40 | loss, acc = self.loss_fun(data.y, *output) 41 | loss.backward() 42 | 43 | try: 44 | num_graphs = data.num_graphs 45 | except TypeError: 46 | num_graphs = data.adj.size(0) 47 | 48 | loss_all += loss.item() * num_graphs 49 | acc_all += acc.item() * num_graphs 50 | else: 51 | loss = self.loss_fun(data.y, *output) 52 | loss.backward() 53 | loss_all += loss.item() 54 | 55 | if clipping is not None: # Clip gradient before updating weights 56 | torch.nn.utils.clip_grad_norm_(model.parameters(), clipping) 57 | optimizer.step() 58 | 59 | if self.classification: 60 | return acc_all / len(train_loader.dataset), loss_all / len(train_loader.dataset) 61 | else: 62 | return None, loss_all / len(train_loader.dataset) 63 | 64 | def classify_graphs(self, loader): 65 | model = self.model.to(self.device) 66 | model.eval() 67 | 68 | loss_all = 0 69 | acc_all = 0 70 | for data in loader: 71 | data = data.to(self.device) 72 | output = model(data) 73 | 74 | if not isinstance(output, tuple): 75 | output = (output,) 76 | 77 | if self.classification: 78 | loss, acc = self.loss_fun(data.y, *output) 79 | 80 | try: 81 | num_graphs = data.num_graphs 82 | except TypeError: 83 | num_graphs = data.adj.size(0) 84 | 85 | loss_all += loss.item() * num_graphs 86 | acc_all += acc.item() * num_graphs 87 | else: 88 | loss = self.loss_fun(data.y, *output) 89 | loss_all += loss.item() 90 | 91 | if self.classification: 92 | return acc_all / len(loader.dataset), loss_all / len(loader.dataset) 93 | else: 94 | return None, loss_all / len(loader.dataset) 95 | 96 | def train(self, train_loader, max_epochs=100, optimizer=torch.optim.Adam, scheduler=None, clipping=None, 97 | validation_loader=None, test_loader=None, early_stopping=None, logger=None, log_every=10): 98 | 99 | early_stopper = early_stopping() if early_stopping is not None else None 100 | 101 | val_loss, val_acc = -1, -1 102 | test_loss, test_acc = None, None 103 | 104 | time_per_epoch = [] 105 | 106 | for epoch in range(1, max_epochs+1): 107 | 108 | start = time.time() 109 | train_acc, train_loss = self._train(train_loader, optimizer, clipping) 110 | end = time.time() - start 111 | time_per_epoch.append(end) 112 | 113 | if scheduler is not None: 114 | scheduler.step(epoch) 115 | 116 | if test_loader is not None: 117 | test_acc, test_loss = self.classify_graphs(test_loader) 118 | 119 | if validation_loader is not None: 120 | val_acc, val_loss = self.classify_graphs(validation_loader) 121 | 122 | # Early stopping (lazy if evaluation) 123 | if early_stopper is not None and early_stopper.stop(epoch, val_loss, val_acc, 124 | test_loss, test_acc, 125 | train_loss, train_acc): 126 | msg = f'Stopping at epoch {epoch}, best is {early_stopper.get_best_vl_metrics()}' 127 | if logger is not None: 128 | logger.log(msg) 129 | print(msg) 130 | else: 131 | print(msg) 132 | break 133 | 134 | if epoch % log_every == 0 or epoch == 1: 135 | msg = f'Epoch: {epoch}, TR loss: {train_loss} TR acc: {train_acc}, VL loss: {val_loss} VL acc: {val_acc} ' \ 136 | f'TE loss: 
{test_loss} TE acc: {test_acc}' 137 | if logger is not None: 138 | logger.log(msg) 139 | print(msg) 140 | else: 141 | print(msg) 142 | 143 | time_per_epoch = torch.tensor(time_per_epoch) 144 | avg_time_per_epoch = float(time_per_epoch.mean()) 145 | 146 | elapsed = format_time(avg_time_per_epoch) 147 | 148 | if early_stopper is not None: 149 | train_loss, train_acc, val_loss, val_acc, test_loss, test_acc, best_epoch = early_stopper.get_best_vl_metrics() 150 | 151 | return train_loss, train_acc, val_loss, val_acc, test_loss, test_acc, elapsed 152 | -------------------------------------------------------------------------------- /gnn-comparison/models/gnn_wrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/gnn_wrapper/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/DGCNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_geometric 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch_geometric.nn import MessagePassing, global_sort_pool 6 | from torch_geometric.utils import add_self_loops, degree 7 | 8 | 9 | class DGCNN(nn.Module): 10 | """ 11 | Uses fixed architecture 12 | """ 13 | 14 | def __init__(self, dim_features, dim_target, config): 15 | super(DGCNN, self).__init__() 16 | 17 | self.ks = {'NCI1': { '0.6': 30, '0.9': 46 }, 18 | 'PROTEINS_full': { '0.6': 32, '0.9': 81 }, 19 | 'DD': {'0.6': 291, '0.9': 503 }, 20 | 'ENZYMES': { '0.6': 36, '0.9': 48 }, 21 | 'IMDB-BINARY': { '0.6': 18, '0.9': 31 }, 22 | 'IMDB-MULTI': { '0.6': 11, '0.9': 22 }, 23 | 'REDDIT-BINARY': { '0.6': 370, '0.9': 1002 }, 24 | 'REDDIT-MULTI-5K': { '0.6': 469, '0.9': 1081 }, 25 | 'COLLAB': { '0.6': 61, '0.9': 130 }, 26 | } 27 | 28 | self.k = self.ks[config.dataset.name][str(config['k'])] 29 | self.embedding_dim = config['embedding_dim'] 30 | self.num_layers = config['num_layers'] 31 | self.last_layer_fa = config['last_layer_fa'] 32 | if self.last_layer_fa: 33 | print('Using LastLayerFA') 34 | 35 | self.convs = [] 36 | for layer in range(self.num_layers): 37 | input_dim = dim_features if layer == 0 else self.embedding_dim 38 | self.convs.append(DGCNNConv(input_dim, self.embedding_dim)) 39 | self.total_latent_dim = self.num_layers * self.embedding_dim 40 | 41 | # Add last embedding 42 | self.convs.append(DGCNNConv(self.embedding_dim, 1)) 43 | self.total_latent_dim += 1 44 | 45 | self.convs = nn.ModuleList(self.convs) 46 | 47 | # should we leave this fixed? 48 | self.conv1d_params1 = nn.Conv1d(1, 16, self.total_latent_dim, self.total_latent_dim) 49 | self.maxpool1d = nn.MaxPool1d(2, 2) 50 | self.conv1d_params2 = nn.Conv1d(16, 32, 5, 1) 51 | 52 | dense_dim = int((self.k - 2) / 2 + 1) 53 | self.input_dense_dim = (dense_dim - 5 + 1) * 32 54 | 55 | self.hidden_dense_dim = config['dense_dim'] 56 | self.dense_layer = nn.Sequential(nn.Linear(self.input_dense_dim, self.hidden_dense_dim), 57 | nn.ReLU(), 58 | nn.Dropout(p=0.5), 59 | nn.Linear(self.hidden_dense_dim, dim_target)) 60 | 61 | def forward(self, data): 62 | # Implement Equation 4.2 of the paper i.e. 
concat all layers' graph representations and apply linear model 63 | # note: this can be decomposed into one smaller linear model per layer 64 | x, edge_index, batch = data.x, data.edge_index, data.batch 65 | 66 | hidden_repres = [] 67 | 68 | for i, conv in enumerate(self.convs): 69 | edges = edge_index 70 | if self.last_layer_fa and i == len(self.convs) - 1: 71 | block_map = torch.eq(batch.unsqueeze(0), batch.unsqueeze(-1)).int() 72 | edges, _ = torch_geometric.utils.dense_to_sparse(block_map) 73 | x = torch.tanh(conv(x, edges)) 74 | hidden_repres.append(x) 75 | 76 | # apply sortpool 77 | x_to_sortpool = torch.cat(hidden_repres, dim=1) 78 | x_1d = global_sort_pool(x_to_sortpool, batch, self.k) # in the code the authors sort the last channel only 79 | 80 | # apply 1D convolutional layers 81 | x_1d = torch.unsqueeze(x_1d, dim=1) 82 | conv1d_res = F.relu(self.conv1d_params1(x_1d)) 83 | conv1d_res = self.maxpool1d(conv1d_res) 84 | conv1d_res = F.relu(self.conv1d_params2(conv1d_res)) 85 | conv1d_res = conv1d_res.reshape(conv1d_res.shape[0], -1) 86 | 87 | # apply dense layer 88 | out_dense = self.dense_layer(conv1d_res) 89 | return out_dense 90 | 91 | 92 | class DGCNNConv(MessagePassing): 93 | """ 94 | Extended from the tutorial on GCNs of PyTorch Geometric 95 | """ 96 | 97 | def __init__(self, in_channels, out_channels): 98 | super(DGCNNConv, self).__init__(aggr='add') # "Add" aggregation. 99 | self.lin = nn.Linear(in_channels, out_channels) 100 | 101 | def forward(self, x, edge_index): 102 | # x has shape [N, in_channels] 103 | # edge_index has shape [2, E] 104 | 105 | # Step 1: Add self-loops to the adjacency matrix. 106 | edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0)) 107 | 108 | # Step 2: Linearly transform node feature matrix. 109 | x = self.lin(x) 110 | 111 | # Step 3-5: Start propagating messages. 112 | return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x) 113 | 114 | def message(self, x_j, edge_index, size): 115 | # x_j has shape [E, out_channels] 116 | 117 | # Step 3: Normalize node features. 118 | src, dst = edge_index # we assume source_to_target message passing 119 | deg = degree(src, size[0], dtype=x_j.dtype) 120 | deg = deg.pow(-1) 121 | norm = deg[dst] 122 | 123 | return norm.view(-1, 1) * x_j # broadcasting the normalization term to all out_channels === hidden features 124 | 125 | def update(self, aggr_out): 126 | # aggr_out has shape [N, out_channels] 127 | 128 | # Step 5: Return new node embeddings.
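# Note: each message was already scaled by 1/deg(target) in message(), so for the
# undirected graphs used here the 'add' aggregation effectively yields a mean over a
# node's incoming messages (including the self-loop added in forward()); nothing else
# is needed before returning.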
129 | return aggr_out 130 | 131 | def __repr__(self): 132 | return '{}({}, {})'.format(self.__class__.__name__, self.in_channels, 133 | self.out_channels) 134 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/DeepMultisets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Linear 4 | from torch_geometric.nn import global_add_pool 5 | 6 | 7 | class DeepMultisets(torch.nn.Module): 8 | 9 | def __init__(self, dim_features, dim_target, config): 10 | super(DeepMultisets, self).__init__() 11 | 12 | hidden_units = config['hidden_units'] 13 | 14 | self.fc_vertex = Linear(dim_features, hidden_units) 15 | self.fc_global1 = Linear(hidden_units, hidden_units) 16 | self.fc_global2 = Linear(hidden_units, dim_target) 17 | 18 | def forward(self, data): 19 | x, batch = data.x, data.batch 20 | 21 | x = F.relu(self.fc_vertex(x)) 22 | x = global_add_pool(x, batch) # sums all vertex embeddings belonging to the same graph! 23 | x = F.relu(self.fc_global1(x)) 24 | x = self.fc_global2(x) 25 | return x 26 | 27 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/GIN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import BatchNorm1d 4 | from torch.nn import Sequential, Linear, ReLU 5 | import torch_geometric 6 | from torch_geometric.nn import GINConv, global_add_pool, global_mean_pool 7 | 8 | from models.graph_classifiers.self_attention import SelfAttention 9 | 10 | 11 | class GIN(torch.nn.Module): 12 | 13 | def __init__(self, dim_features, dim_target, config): 14 | super(GIN, self).__init__() 15 | 16 | self.config = config 17 | self.dropout = config['dropout'] 18 | self.embeddings_dim = [config['hidden_units'][0]] + config['hidden_units'] 19 | self.no_layers = len(self.embeddings_dim) 20 | self.first_h = [] 21 | self.nns = [] 22 | self.convs = [] 23 | self.linears = [] 24 | self.last_layer_fa = config['last_layer_fa'] 25 | if self.last_layer_fa: 26 | print('Using LastLayerFA') 27 | 28 | train_eps = config['train_eps'] 29 | if config['aggregation'] == 'sum': 30 | self.pooling = global_add_pool 31 | elif config['aggregation'] == 'mean': 32 | self.pooling = global_mean_pool 33 | 34 | for layer, out_emb_dim in enumerate(self.embeddings_dim): 35 | 36 | if layer == 0: 37 | self.first_h = Sequential(Linear(dim_features, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU(), 38 | Linear(out_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU()) 39 | self.linears.append(Linear(out_emb_dim, dim_target)) 40 | else: 41 | input_emb_dim = self.embeddings_dim[layer-1] 42 | self.nns.append(Sequential(Linear(input_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU(), 43 | Linear(out_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU())) 44 | self.convs.append(GINConv(self.nns[-1], train_eps=train_eps)) # Eq. 
4.2 45 | 46 | self.linears.append(Linear(out_emb_dim, dim_target)) 47 | 48 | 49 | self.nns = torch.nn.ModuleList(self.nns) 50 | self.convs = torch.nn.ModuleList(self.convs) 51 | self.linears = torch.nn.ModuleList(self.linears) # has got one more for initial input 52 | 53 | def forward(self, data): 54 | x, edge_index, batch = data.x, data.edge_index, data.batch 55 | 56 | out = 0 57 | 58 | for layer in range(self.no_layers): 59 | if layer == 0: 60 | x = self.first_h(x) 61 | 62 | out += F.dropout(self.pooling(self.linears[layer](x), batch), p=self.dropout) 63 | else: 64 | # Layer l ("convolution" layer) 65 | edges = edge_index 66 | if self.last_layer_fa and layer == self.no_layers - 1: 67 | block_map = torch.eq(batch.unsqueeze(0), batch.unsqueeze(-1)).int() 68 | edges, _ = torch_geometric.utils.dense_to_sparse(block_map) 69 | x = self.convs[layer-1](x, edges) 70 | out += F.dropout(self.linears[layer](self.pooling(x, batch)), p=self.dropout, training=self.training) 71 | 72 | return out 73 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/GraphSAGE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_geometric 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from torch_geometric.nn import SAGEConv, global_max_pool 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class GraphSAGE(nn.Module): 13 | def __init__(self, dim_features, dim_target, config): 14 | super().__init__() 15 | 16 | num_layers = config['num_layers'] 17 | dim_embedding = config['dim_embedding'] 18 | self.aggregation = config['aggregation'] # can be mean or max 19 | self.last_layer_fa = config['last_layer_fa'] 20 | if self.last_layer_fa: 21 | print('Using LastLayerFA') 22 | 23 | if self.aggregation == 'max': 24 | self.fc_max = nn.Linear(dim_embedding, dim_embedding) 25 | 26 | self.layers = nn.ModuleList([]) 27 | for i in range(num_layers): 28 | dim_input = dim_features if i == 0 else dim_embedding 29 | 30 | conv = SAGEConv(dim_input, dim_embedding) 31 | # Overwrite aggregation method (default is set to mean 32 | conv.aggr = self.aggregation 33 | 34 | self.layers.append(conv) 35 | 36 | # For graph classification 37 | self.fc1 = nn.Linear(num_layers * dim_embedding, dim_embedding) 38 | self.fc2 = nn.Linear(dim_embedding, dim_target) 39 | 40 | def forward(self, data): 41 | x, edge_index, batch = data.x, data.edge_index, data.batch 42 | 43 | x_all = [] 44 | 45 | for i, layer in enumerate(self.layers): 46 | edges = edge_index 47 | if self.last_layer_fa and i == len(self.layers) - 1: 48 | block_map = torch.eq(batch.unsqueeze(0), batch.unsqueeze(-1)).int() 49 | edges, _ = torch_geometric.utils.dense_to_sparse(block_map) 50 | x = layer(x, edges) 51 | if self.aggregation == 'max': 52 | x = torch.relu(self.fc_max(x)) 53 | x_all.append(x) 54 | 55 | x = torch.cat(x_all, dim=1) 56 | x = global_max_pool(x, batch) 57 | 58 | x = F.relu(self.fc1(x)) 59 | x = self.fc2(x) 60 | return x 61 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/MLP_Classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch.nn import Linear 4 | 5 | 6 | class MLPClassifier(torch.nn.Module): 7 | 8 | def __init__(self, dim_features, dim_target, config): 9 | super(MLPClassifier, self).__init__() 10 | 11 | hidden_units 
= config['hidden_units'] 12 | 13 | self.fc_global = Linear(dim_features, hidden_units) 14 | self.out = Linear(hidden_units, dim_target) 15 | 16 | def forward(self, x, batch): 17 | return self.out(F.relu(self.fc_global(x))) 18 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/MolecularFingerprint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import ReLU 3 | from torch_geometric.nn import global_add_pool 4 | 5 | 6 | class MolecularFingerprint(torch.nn.Module): 7 | 8 | def __init__(self, dim_features, dim_target, config): 9 | super(MolecularFingerprint, self).__init__() 10 | hidden_dim = config['hidden_units'] 11 | 12 | self.mlp = torch.nn.Sequential(torch.nn.Linear(dim_features, hidden_dim), ReLU(), 13 | torch.nn.Linear(hidden_dim, dim_target), ReLU()) 14 | 15 | def forward(self, data): 16 | return self.mlp(global_add_pool(data.x, data.batch)) 17 | -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/graph_classifiers/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/models/graph_classifiers/self_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SelfAttention(torch.nn.Module): 4 | def __init__(self, 5 | num_heads, 6 | model_dim, 7 | dropout_keep_prob): 8 | super(SelfAttention, self).__init__() 9 | 10 | self.num_heads = num_heads 11 | self.model_dim = model_dim 12 | self.dropout_keep_prob = dropout_keep_prob 13 | self.q_layer = torch.nn.Linear(model_dim, model_dim * self.num_heads, bias=False) 14 | self.out_layer = torch.nn.Linear(model_dim * self.num_heads, model_dim, bias=False) 15 | self.out_layer2 = torch.nn.Linear(model_dim * 2, model_dim, bias=False) 16 | self.relu = torch.nn.ReLU() 17 | self.softmax = torch.nn.Softmax(dim=-1) 18 | self.dropout = torch.nn.Dropout(1- dropout_keep_prob) 19 | 20 | def forward(self, batched_inputs, attn_mask=None): 21 | q = self._linear_projection(batched_inputs) 22 | qs = self._split_heads(q) 23 | tiled_inputs = batched_inputs.unsqueeze(1).repeat(1, self.num_heads, 1, 1) 24 | outputs = self._scaled_dot_product(qs, tiled_inputs, tiled_inputs, attn_mask) # (batch, num_heads, max_contexts, value_dim) 25 | outputs = self._concat_heads(outputs) # (batch, max_contexts, value_dim * num_heads) 26 | if self.num_heads > 1: 27 | outputs = self.out_layer(outputs) # (batch, max_contexts, model_dim) 28 | outputs = self.relu(outputs) # (batch, max_contexts, model_dim) 29 | #outputs = self.dropout(outputs) 30 | outputs = torch.cat([outputs, batched_inputs], dim=-1) # (batch, max_contexts, 2 * model_dim) 31 | outputs = self.out_layer2(outputs) # (batch, max_contexts, model_dim)c 32 | outputs = self.relu(outputs) # (batch, max_contexts, model_dim) 33 | return outputs 34 | 35 | def _linear_projection(self, batched_inputs): 36 | q = self.q_layer(batched_inputs) # (batch, max_contexts, key_dim * num_heads) 37 | # k = tf.layers.dense(batched_inputs, units=self.model_dim, 38 | # use_bias=False) # (batch, max_contexts, key_dim * num_heads) 39 | return q 40 | 41 | def _split_heads(self, q): 42 | 43 | def 
split_last_dimension_then_transpose(tensor, num_heads, dim): 44 | tensor = tensor.view([-1, tensor.size()[1], num_heads, 45 | dim]) # (batch, max_contexts, num_heads, dim) 46 | return tensor.transpose(1,2) # (batch, num_heads, max_contexts, dim) 47 | 48 | qs = split_last_dimension_then_transpose(q, self.num_heads, 49 | self.model_dim) # (batch, num_heads, max_contexts, key_dim) 50 | # ks = split_last_dimension_then_transpose(k, self.num_heads, 51 | # self.model_dim) # (batch, num_heads, max_contexts, key_dim) 52 | return qs 53 | 54 | def _scaled_dot_product(self, qs, ks, tiled_inputs, valid_mask): 55 | queries_dot_keys = torch.matmul(qs, ks.transpose(2,3)) # (batch, num_heads, max_contexts, max_contexts) 56 | scaled_scores = queries_dot_keys #/ ((self.model_dim // self.num_heads) ** 0.5) # (batch, num_heads, max_contexts, max_contexts) 57 | 58 | if valid_mask is not None: 59 | mask = torch.log(valid_mask.view(valid_mask.size()[0], 1, 1, valid_mask.size()[1])) # (batch, 1, 1, max_contexts) 60 | scaled_scores += mask 61 | 62 | attention_weights = self.softmax(scaled_scores) # (batch, num_heads, max_contexts, max_contexts) 63 | return torch.matmul(attention_weights, tiled_inputs) # (batch, num_heads, max_contexts, value_dim) 64 | 65 | def _concat_heads(self, outputs): 66 | # outputs: (batch, num_heads, max_contexts, value_dim) 67 | max_contexts = outputs.size()[2] 68 | tensor = outputs.transpose(1, 2) # [batch, max_contexts, num_heads, value_dim] 69 | return tensor.contiguous().view([-1, max_contexts, self.model_dim * self.num_heads]) 70 | -------------------------------------------------------------------------------- /gnn-comparison/models/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class ClassificationLoss(nn.Module): 6 | def __init__(self): 7 | super().__init__() 8 | self.loss = None 9 | 10 | def forward(self, targets, *outputs): 11 | """ 12 | :param targets: 13 | :param outputs: 14 | :return: loss and accuracy values 15 | """ 16 | outputs = outputs[0] 17 | loss = self.loss(outputs, targets) 18 | accuracy = self._calculate_accuracy(outputs, targets) 19 | return loss, accuracy 20 | 21 | def _get_correct(self, outputs): 22 | raise NotImplementedError() 23 | 24 | def _calculate_accuracy(self, outputs, targets): 25 | correct = self._get_correct(outputs) 26 | return 100. 
* (correct == targets).sum().float() / targets.size(0) 27 | 28 | 29 | class BinaryClassificationLoss(ClassificationLoss): 30 | def __init__(self, reduction=None): 31 | super().__init__() 32 | if reduction is not None: 33 | self.loss = nn.BCEWithLogitsLoss(reduction=reduction) 34 | else: 35 | self.loss = nn.BCEWithLogitsLoss() 36 | 37 | def _get_correct(self, outputs): 38 | return outputs > 0.5 39 | 40 | 41 | class MulticlassClassificationLoss(ClassificationLoss): 42 | def __init__(self, reduction=None): 43 | super().__init__() 44 | if reduction is not None: 45 | self.loss = nn.CrossEntropyLoss(reduction=reduction) 46 | else: 47 | self.loss = nn.CrossEntropyLoss() 48 | 49 | def _get_correct(self, outputs): 50 | return torch.argmax(outputs, dim=1) 51 | 52 | 53 | class RegressionLoss(nn.Module): 54 | def __init__(self): 55 | super().__init__() 56 | self.loss = None 57 | 58 | def forward(self, targets, *outputs): 59 | """ 60 | 61 | :param targets: 62 | :param outputs: 63 | :return: a loss value 64 | """ 65 | raise NotImplementedError() 66 | 67 | 68 | class CovarianceResidualError(RegressionLoss): # For Cascade "Correlation" 69 | def __init__(self): 70 | super().__init__() 71 | 72 | def forward(self, targets, *outputs): 73 | _, _, graph_emb, errors = outputs 74 | 75 | errors_minus_mean = errors - torch.mean(errors, dim=0) 76 | activations_minus_mean = graph_emb - torch.mean(graph_emb, dim=0) 77 | 78 | # todo check against commented code 79 | cov_per_pattern = torch.zeros(errors.shape) 80 | 81 | cov_error = 0. 82 | for o in range(errors.shape[1]): # for each output unit 83 | for i in range(errors.shape[0]): # for each pattern 84 | cov_per_pattern[i, o] = errors_minus_mean[i, o]*activations_minus_mean[i, 0] 85 | 86 | cov_error = cov_error + torch.abs(torch.sum(cov_per_pattern[:, o])) 87 | 88 | #print(torch.mean(cov_per_pattern, dim=0), torch.mean(errors_minus_mean), torch.mean(graph_emb)) 89 | 90 | ''' 91 | activations_minus_mean = torch.sum(activations_minus_mean, dim=1) 92 | activations_minus_mean = torch.unsqueeze(activations_minus_mean, dim=1) 93 | 94 | activations_minus_mean = torch.t(activations_minus_mean) 95 | 96 | cov_per_pattern = torch.mm(activations_minus_mean, errors_minus_mean) 97 | 98 | cov_abs = torch.abs(cov_per_pattern) 99 | 100 | # sum over output "units" 101 | cov_error = torch.sum(cov_abs) 102 | ''' 103 | 104 | # Minus --> maximization problem! 105 | return - cov_error 106 | 107 | 108 | class NN4GMulticlassClassificationLoss(MulticlassClassificationLoss): 109 | 110 | def mse(self, ts, ys, return_sum): 111 | 112 | targets_oh = torch.zeros(ys.shape) 113 | ts = ts.unsqueeze(1) 114 | targets_oh.scatter_(1, ts, value=1.) 
# src must not be specified 115 | ts = targets_oh 116 | 117 | if return_sum == True: 118 | return torch.sum(0.5 * (ts - ys) ** 2) / len(ts) 119 | else: 120 | return 0.5 * (ts - ys) ** 2 / len(ts) 121 | 122 | def forward(self, targets, *outputs): 123 | 124 | preds, _, _, _ = outputs 125 | 126 | # Try MSE 127 | loss = self.mse(targets, preds, return_sum=True) 128 | 129 | #loss = self.loss(preds, targets) 130 | 131 | accuracy = self._calculate_accuracy(preds, targets) 132 | return loss, accuracy 133 | 134 | 135 | class DiffPoolMulticlassClassificationLoss(MulticlassClassificationLoss): 136 | """ 137 | DiffPool - No Link Prediction Loss 138 | """ 139 | 140 | def forward(self, targets, *outputs): 141 | preds, lp_loss, ent_loss = outputs 142 | 143 | if targets.dim() > 1 and targets.size(1) == 1: 144 | targets = targets.squeeze(1) 145 | 146 | loss = self.loss(preds, targets) 147 | accuracy = self._calculate_accuracy(preds, targets) 148 | return loss + lp_loss + ent_loss, accuracy 149 | -------------------------------------------------------------------------------- /gnn-comparison/models/schedulers/ECCScheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import StepLR 2 | 3 | 4 | class ECCLR(StepLR): 5 | 6 | def __init__(self, optimizer, step_size=1, gamma=0.1, last_epoch=-1): 7 | self.step_size = step_size # does not matter 8 | self.gamma = gamma 9 | super(ECCLR, self).__init__(optimizer, step_size=step_size, gamma=gamma, last_epoch=last_epoch) 10 | 11 | def get_lr(self): 12 | if self.last_epoch in [25, 35, 45]: 13 | return [group['lr'] * self.gamma 14 | for group in self.optimizer.param_groups] 15 | else: 16 | return [group['lr'] for group in self.optimizer.param_groups] 17 | -------------------------------------------------------------------------------- /gnn-comparison/models/utils/EarlyStopper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | class EarlyStopper: 5 | 6 | def stop(self, epoch, val_loss, val_acc=None, test_loss=None, test_acc=None, train_loss=None, train_acc=None): 7 | raise NotImplementedError("Implement this method!") 8 | 9 | def get_best_vl_metrics(self): 10 | return self.train_loss, self.train_acc, self.val_loss, self.val_acc, self.test_loss, self.test_acc, self.best_epoch 11 | 12 | 13 | class GLStopper(EarlyStopper): 14 | 15 | ''' 16 | Implement Generalization Loss technique (Prechelt 1997) 17 | ''' 18 | 19 | def __init__(self, starting_epoch, alpha=5, use_loss=True): 20 | self.local_optimum = float("inf") if use_loss else -float("inf") 21 | self.use_loss = use_loss 22 | self.alpha = alpha 23 | self.best_epoch = -1 24 | self.counter = None 25 | self.starting_epoch = starting_epoch 26 | 27 | self.train_loss, self.train_acc = None, None 28 | self.val_loss, self.val_acc = None, None 29 | self.test_loss, self.test_acc = None, None 30 | 31 | def stop(self, epoch, val_loss, val_acc=None, test_loss=None, test_acc=None, train_loss=None, train_acc=None): 32 | 33 | if epoch <= self.starting_epoch: 34 | return False 35 | 36 | if self.use_loss: 37 | if val_loss <= self.local_optimum: 38 | self.local_optimum = val_loss 39 | self.best_epoch = epoch 40 | self.train_loss, self.train_acc = train_loss, train_acc 41 | self.val_loss, self.val_acc = val_loss, val_acc 42 | self.test_loss, self.test_acc = test_loss, test_acc 43 | return False 44 | else: 45 | return 100*(val_loss/self.local_optimum - 1) > self.alpha 46 | else: 47 | if val_acc >= 
self.local_optimum: 48 | self.local_optimum = val_acc 49 | self.best_epoch = epoch 50 | self.train_loss, self.train_acc = train_loss, train_acc 51 | self.val_loss, self.val_acc = val_loss, val_acc 52 | self.test_loss, self.test_acc = test_loss, test_acc 53 | return False 54 | else: 55 | return (self.local_optimum/val_acc - 1) > self.alpha 56 | 57 | 58 | class Patience(EarlyStopper): 59 | 60 | ''' 61 | Implement common "patience" technique 62 | ''' 63 | 64 | def __init__(self, patience=20, use_loss=True): 65 | self.local_val_optimum = float("inf") if use_loss else -float("inf") 66 | self.use_loss = use_loss 67 | self.patience = patience 68 | self.best_epoch = -1 69 | self.counter = -1 70 | 71 | self.train_loss, self.train_acc = None, None 72 | self.val_loss, self.val_acc = None, None 73 | self.test_loss, self.test_acc = None, None 74 | 75 | def stop(self, epoch, val_loss, val_acc=None, test_loss=None, test_acc=None, train_loss=None, train_acc=None): 76 | if self.use_loss: 77 | if val_loss <= self.local_val_optimum: 78 | self.counter = 0 79 | self.local_val_optimum = val_loss 80 | self.best_epoch = epoch 81 | self.train_loss, self.train_acc = train_loss, train_acc 82 | self.val_loss, self.val_acc = val_loss, val_acc 83 | self.test_loss, self.test_acc = test_loss, test_acc 84 | return False 85 | else: 86 | self.counter += 1 87 | return self.counter >= self.patience 88 | else: 89 | if val_acc >= self.local_val_optimum: 90 | self.counter = 0 91 | self.local_val_optimum = val_acc 92 | self.best_epoch = epoch 93 | self.train_loss, self.train_acc = train_loss, train_acc 94 | self.val_loss, self.val_acc = val_loss, val_acc 95 | self.test_loss, self.test_acc = test_loss, test_acc 96 | return False 97 | else: 98 | self.counter += 1 99 | return self.counter >= self.patience 100 | 101 | -------------------------------------------------------------------------------- /gnn-comparison/models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/gnn-comparison/models/utils/__init__.py -------------------------------------------------------------------------------- /gnn-comparison/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx 2 | requests 3 | pyyaml 4 | torch 5 | torch_scatter 6 | torch_sparse 7 | torch_cluster 8 | torch_geometric 9 | -------------------------------------------------------------------------------- /gnn-comparison/utils/batch_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import networkx as nx 4 | 5 | from torch_geometric.data import Data, Batch 6 | from torch_geometric.utils import dense_to_sparse, scatter_, to_dense_adj 7 | 8 | 9 | def construct_mask_indices(sizes): 10 | num_rows, num_cols = sum(sizes), len(sizes) 11 | 12 | indices = [] 13 | for i, size in enumerate(sizes): 14 | cum_sum = sum(sizes[:i]) 15 | indices.append((cum_sum, cum_sum + size)) 16 | return indices 17 | 18 | 19 | def _make_block_diag(mats, mat_sizes): 20 | block_diag = torch.zeros(sum(mat_sizes), sum(mat_sizes)) 21 | 22 | for i, (mat, size) in enumerate(zip(mats, mat_sizes)): 23 | cum_size = sum(mat_sizes[:i]) 24 | block_diag[cum_size:cum_size+size,cum_size:cum_size+size] = mat 25 | 26 | return block_diag 27 | 28 | 29 | def make_block_diag(data): 30 | data = data.to_data_list() 31 | adjs = [to_dense_adj(d.edge_index).squeeze(0) 
for d in data] 32 | adj_sizes = [a.size(0) for a in adjs] 33 | bd_mat = _make_block_diag(adjs, adj_sizes) 34 | mask_indices = construct_mask_indices(adj_sizes) 35 | return bd_mat, mask_indices 36 | 37 | 38 | def get_adj(block_diag, index): 39 | from_i, to_i = index 40 | return block_diag[from_i:to_i, from_i:to_i] 41 | 42 | 43 | def mock_batch(batch_size): 44 | """construct pyG batch""" 45 | graphs = [] 46 | while len(graphs) < batch_size: 47 | G = nx.erdos_renyi_graph(np.random.choice([300, 500]), 0.5) 48 | if G.number_of_edges() > 1: 49 | graphs.append(G) 50 | 51 | adjs = [torch.from_numpy(nx.to_numpy_array(G)) for G in graphs] 52 | graph_data = [dense_to_sparse(A) for A in adjs] 53 | data_list = [Data(x=x, edge_index=e) for (e, x) in graph_data] 54 | return Batch.from_data_list(data_list) 55 | 56 | 57 | def test(): 58 | batch_size = 3 59 | data = mock_batch(batch_size=batch_size) 60 | 61 | # create block diagonal matrix of batch 62 | # block size: [nodes_in_batch] x [nodes_in_batch] 63 | block_diag, indices = make_block_diag(data) 64 | for i in range(batch_size): 65 | graph_adj = get_adj(block_diag, indices[i]) 66 | print(graph_adj) -------------------------------------------------------------------------------- /gnn-comparison/utils/eval_across_folds.py: -------------------------------------------------------------------------------- 1 | import json 2 | from argparse import ArgumentParser 3 | import glob 4 | import numpy as np 5 | 6 | if __name__ == '__main__': 7 | parser = ArgumentParser() 8 | parser.add_argument("--dir", dest="results_dir", required=True) 9 | 10 | args = parser.parse_args() 11 | results_dir = args.results_dir 12 | num_folds = len(glob.glob(f'{results_dir}/GIN_NCI1_assessment/*/*/')) 13 | num_config = len(glob.glob(f'{results_dir}/GIN_NCI1_assessment/*/OUTER_FOLD_1/HOLDOUT_MS/*/')) 14 | 15 | results = {} 16 | config_values = {} 17 | for config_id in range(1, num_config + 1): 18 | config_result_jsons = glob.glob(f'{results_dir}/GIN_NCI1_assessment/*/*/HOLDOUT_MS/config_{config_id}/config_results.json') 19 | count = 0 20 | values = [] 21 | for json_path in config_result_jsons: 22 | with open(json_path, 'r') as file: 23 | obj = json.load(file) 24 | count += 1 25 | values.append(obj['VL_score']) 26 | if count > 0: 27 | config_values[config_id] = obj['config'] 28 | results[config_id] = (np.mean(values), np.std(values), count) 29 | 30 | sorted_configs = [(k, v) for k, v in sorted(results.items(), key=lambda item: item[1][0], reverse=True)] 31 | for config_id, results in sorted_configs: 32 | print(f'Config id: {config_id}: {results[0]} std: {results[1]} (count: {results[2]})') 33 | print(config_values[config_id]) 34 | print() -------------------------------------------------------------------------------- /gnn-comparison/utils/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | 5 | class NumpyEncoder(json.JSONEncoder): 6 | def default(self, obj): 7 | if isinstance(obj, np.ndarray): 8 | return obj.tolist() 9 | return json.JSONEncoder.default(self, obj) 10 | 11 | 12 | def one_hot(value, num_classes): 13 | vec = np.zeros(num_classes) 14 | vec[value - 1] = 1 15 | return vec 16 | 17 | 18 | def get_max_num_nodes(dataset_str): 19 | import datasets 20 | dataset = getattr(datasets, dataset_str)() 21 | 22 | max_num_nodes = -1 23 | for d in dataset.dataset: 24 | max_num_nodes = max(max_num_nodes, d.num_nodes) 25 | return max_num_nodes 26 | 
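A minimal usage sketch of the helpers above (illustrative only, not a file of this repository; it assumes one_hot and NumpyEncoder from utils.py are in scope, and the values are made up):

import json
import numpy as np

vec = one_hot(3, num_classes=5)   # labels are 1-based above, so this sets index 2 -> [0., 0., 1., 0., 0.]

# NumpyEncoder lets json.dumps serialize numpy arrays by converting them to plain lists
fold_scores = {'VL_score': 81.25, 'per_fold': np.array([80.0, 82.5])}
print(json.dumps(fold_scores, cls=NumpyEncoder))   # {"VL_score": 81.25, "per_fold": [80.0, 82.5]}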
-------------------------------------------------------------------------------- /images/fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/images/fig3.png -------------------------------------------------------------------------------- /images/fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tech-srl/bottleneck/bfe83b4a6dd7939ddb19cabea4f1e072f3c35432/images/fig5.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from attrdict import AttrDict 3 | 4 | from experiment import Experiment 5 | from common import Task, GNN_TYPE, STOP 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser() 9 | parser.add_argument("--task", dest="task", default=Task.NEIGHBORS_MATCH, type=Task.from_string, choices=list(Task), 10 | required=False) 11 | parser.add_argument("--type", dest="type", default=GNN_TYPE.GCN, type=GNN_TYPE.from_string, choices=list(GNN_TYPE), 12 | required=False) 13 | parser.add_argument("--dim", dest="dim", default=32, type=int, required=False) 14 | parser.add_argument("--depth", dest="depth", default=3, type=int, required=False) 15 | parser.add_argument("--num_layers", dest="num_layers", default=None, type=int, required=False) 16 | parser.add_argument("--train_fraction", dest="train_fraction", default=0.8, type=float, required=False) 17 | parser.add_argument("--max_epochs", dest="max_epochs", default=50000, type=int, required=False) 18 | parser.add_argument("--eval_every", dest="eval_every", default=100, type=int, required=False) 19 | parser.add_argument("--batch_size", dest="batch_size", default=1024, type=int, required=False) 20 | parser.add_argument("--accum_grad", dest="accum_grad", default=1, type=int, required=False) 21 | parser.add_argument("--stop", dest="stop", default=STOP.TRAIN, type=STOP.from_string, choices=list(STOP), 22 | required=False) 23 | parser.add_argument("--patience", dest="patience", default=20, type=int, required=False) 24 | parser.add_argument("--loader_workers", dest="loader_workers", default=0, type=int, required=False) 25 | parser.add_argument('--last_layer_fully_adjacent', action='store_true') 26 | parser.add_argument('--no_layer_norm', action='store_true') 27 | parser.add_argument('--no_activation', action='store_true') 28 | parser.add_argument('--no_residual', action='store_true') 29 | parser.add_argument('--unroll', action='store_true', help='use the same weights across GNN layers') 30 | 31 | args = parser.parse_args() 32 | Experiment(args).run() 33 | 34 | 35 | def get_fake_args( 36 | task=Task.NEIGHBORS_MATCH, 37 | type=GNN_TYPE.GCN, 38 | dim=32, 39 | depth=3, 40 | num_layers=None, 41 | train_fraction=0.8, 42 | max_epochs=50000, 43 | eval_every=100, 44 | batch_size=1024, 45 | accum_grad=1, 46 | patience=20, 47 | stop=STOP.TRAIN, 48 | loader_workers=0, 49 | last_layer_fully_adjacent=False, 50 | no_layer_norm=False, 51 | no_activation=False, 52 | no_residual=False, 53 | unroll=False, 54 | ): 55 | return AttrDict({ 56 | 'task': task, 57 | 'type': type, 58 | 'dim': dim, 59 | 'depth': depth, 60 | 'num_layers': num_layers, 61 | 'train_fraction': train_fraction, 62 | 'max_epochs': max_epochs, 63 | 'eval_every': eval_every, 64 | 'batch_size': batch_size, 65 | 'accum_grad': 
accum_grad, 66 | 'stop': stop, 67 | 'patience': patience, 68 | 'loader_workers': loader_workers, 69 | 'last_layer_fully_adjacent': last_layer_fully_adjacent, 70 | 'no_layer_norm': no_layer_norm, 71 | 'no_activation': no_activation, 72 | 'no_residual': no_residual, 73 | 'unroll': unroll, 74 | }) 75 | -------------------------------------------------------------------------------- /models/graph_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class GraphModel(torch.nn.Module): 7 | def __init__(self, gnn_type, num_layers, dim0, h_dim, out_dim, last_layer_fully_adjacent, 8 | unroll, layer_norm, use_activation, use_residual): 9 | super(GraphModel, self).__init__() 10 | self.gnn_type = gnn_type 11 | self.unroll = unroll 12 | self.last_layer_fully_adjacent = last_layer_fully_adjacent 13 | self.use_layer_norm = layer_norm 14 | self.use_activation = use_activation 15 | self.use_residual = use_residual 16 | self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 17 | 18 | self.num_layers = num_layers 19 | self.layer0_keys = nn.Embedding(num_embeddings=dim0 + 1, embedding_dim=h_dim) 20 | self.layer0_values = nn.Embedding(num_embeddings=dim0 + 1, embedding_dim=h_dim) 21 | self.layers = nn.ModuleList() 22 | self.layer_norms = nn.ModuleList() 23 | if unroll: 24 | self.layers.append(gnn_type.get_layer( 25 | in_dim=h_dim, 26 | out_dim=h_dim)) 27 | else: 28 | for i in range(num_layers): 29 | self.layers.append(gnn_type.get_layer( 30 | in_dim=h_dim, 31 | out_dim=h_dim)) 32 | if self.use_layer_norm: 33 | for i in range(num_layers): 34 | self.layer_norms.append(nn.LayerNorm(h_dim)) 35 | 36 | self.out_dim = out_dim 37 | # self.out_layer = nn.Linear(in_features=h_dim, out_features=out_dim, bias=False) 38 | self.out_layer = nn.Linear(in_features=h_dim, out_features=out_dim + 1, bias=False) 39 | 40 | def forward(self, data): 41 | x, edge_index, batch, roots = data.x, data.edge_index, data.batch, data.root_mask 42 | 43 | x_key, x_val = x[:, 0], x[:, 1] 44 | x_key_embed = self.layer0_keys(x_key) 45 | x_val_embed = self.layer0_values(x_val) 46 | x = x_key_embed + x_val_embed 47 | 48 | for i in range(self.num_layers): 49 | if self.unroll: 50 | layer = self.layers[0] 51 | else: 52 | layer = self.layers[i] 53 | new_x = x 54 | if self.last_layer_fully_adjacent and i == self.num_layers - 1: 55 | root_indices = torch.nonzero(roots, as_tuple=False).squeeze(-1) 56 | target_roots = root_indices.index_select(dim=0, index=batch) 57 | source_nodes = torch.arange(0, data.num_nodes).to(self.device) 58 | edges = torch.stack([source_nodes, target_roots], dim=0) 59 | 60 | else: 61 | edges = edge_index 62 | new_x = layer(new_x, edges) 63 | if self.use_activation: 64 | new_x = F.relu(new_x) 65 | if self.use_residual: 66 | x = x + new_x 67 | else: 68 | x = new_x 69 | if self.use_layer_norm: 70 | x = self.layer_norms[i](x) 71 | 72 | root_nodes = x[roots] 73 | logits = self.out_layer(root_nodes) 74 | # logits = F.linear(root_nodes, self.layer0_values.weight) 75 | return logits 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict==2.0.1 2 | torch>=1.4.0 3 | torch-geometric>=1.4.2 4 | torch-scatter>=2.0.4 5 | torch-sparse>=0.6.0 6 | torchvision>=0.5.0 7 | sklearn 8 | 9 | 
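To make the last_layer_fully_adjacent option in models/graph_model.py above concrete, here is a minimal standalone sketch (toy tensors for illustration, not a file of this repository) of how the final layer's edge set is rewired so that every node sends a message directly to the root of its own graph:

import torch

# Toy batch of two trees: nodes 0-2 belong to graph 0, nodes 3-4 to graph 1.
batch = torch.tensor([0, 0, 0, 1, 1])                      # graph id per node, as in data.batch
roots = torch.tensor([True, False, False, True, False])    # root mask, as in data.root_mask

root_indices = torch.nonzero(roots, as_tuple=False).squeeze(-1)  # tensor([0, 3])
target_roots = root_indices.index_select(dim=0, index=batch)     # root node of each node's graph
source_nodes = torch.arange(0, batch.size(0))
edges = torch.stack([source_nodes, target_roots], dim=0)
# edges == tensor([[0, 1, 2, 3, 4],
#                  [0, 0, 0, 3, 3]])  -> every node is connected directly to its graph's root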
-------------------------------------------------------------------------------- /run-gat-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 2048}, 14 | 8: {'batch_size': 1024, 'accum_grad': 2}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GAT 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion, 38 | no_activation=True, no_residual=False) 39 | if depth in override_params: 40 | for key, value in AttrDict(override_params[depth]).items(): 41 | args[key] = value 42 | train_acc, test_acc, epoch = Experiment(args).run() 43 | torch.cuda.empty_cache() 44 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 45 | print() 46 | 47 | print(f'Task: {task}') 48 | print('depth, train_acc, test_acc, epoch, train_acc, test_acc, epoch,') 49 | for depth in range(min_depth, max_depth + 1): 50 | res = results_all_depths[depth] 51 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 52 | -------------------------------------------------------------------------------- /run-gcn-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 2048}, 14 | 8: {'batch_size': 1024, 'accum_grad': 2}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GCN 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion) 38 | if depth in override_params: 39 | for key, value in AttrDict(override_params[depth]).items(): 40 | args[key] = value 41 | train_acc, test_acc, epoch = Experiment(args).run() 42 | torch.cuda.empty_cache() 43 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 44 | print() 45 | 46 | print(f'Task: {task}') 47 | print('depth, train_acc, test_acc, 
epoch, train_acc, test_acc, epoch,') 48 | for depth in range(min_depth, max_depth + 1): 49 | res = results_all_depths[depth] 50 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 51 | -------------------------------------------------------------------------------- /run-ggnn-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 1024, 'accum_grad': 2}, 14 | 8: {'batch_size': 512, 'accum_grad': 4}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GGNN 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion, 38 | no_layer_norm=True, no_activation=True, no_residual=True) 39 | if depth in override_params: 40 | for key, value in AttrDict(override_params[depth]).items(): 41 | args[key] = value 42 | train_acc, test_acc, epoch = Experiment(args).run() 43 | torch.cuda.empty_cache() 44 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 45 | print() 46 | 47 | print(f'Task: {task}') 48 | print('depth, train_acc, test_acc, epoch, train_acc, test_acc, epoch,') 49 | for depth in range(min_depth, max_depth + 1): 50 | res = results_all_depths[depth] 51 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 52 | -------------------------------------------------------------------------------- /run-gin-2-8.py: -------------------------------------------------------------------------------- 1 | import main 2 | from common import Task, STOP, GNN_TYPE 3 | from attrdict import AttrDict 4 | from experiment import Experiment 5 | import torch 6 | 7 | override_params = { 8 | 2: {'batch_size': 64, 'eval_every': 1000}, 9 | 3: {'batch_size': 64}, 10 | 4: {'batch_size': 1024}, 11 | 5: {'batch_size': 1024}, 12 | 6: {'batch_size': 1024}, 13 | 7: {'batch_size': 1024, 'accum_grad': 2}, 14 | 8: {'batch_size': 512, 'accum_grad': 4}, # effective batch size of 2048, with less GPU memory 15 | } 16 | 17 | 18 | class Results: 19 | def __init__(self, train_acc, test_acc, epoch): 20 | self.train_acc = train_acc 21 | self.test_acc = test_acc 22 | self.epoch = epoch 23 | 24 | 25 | if __name__ == '__main__': 26 | 27 | task = Task.NEIGHBORS_MATCH 28 | gnn_type = GNN_TYPE.GIN 29 | stopping_criterion = STOP.TRAIN 30 | min_depth = 2 31 | max_depth = 8 32 | 33 | results_all_depths = {} 34 | for depth in range(min_depth, max_depth + 1): 35 | num_layers = depth + 1 36 | args = main.get_fake_args(task=task, depth=depth, num_layers=num_layers, loader_workers=7, 37 | type=gnn_type, stop=stopping_criterion) 38 | if depth in override_params: 39 | for key, value in AttrDict(override_params[depth]).items(): 40 | args[key] = value 41 | train_acc, 
test_acc, epoch = Experiment(args).run() 42 | torch.cuda.empty_cache() 43 | results_all_depths[depth] = Results(train_acc=train_acc, test_acc=test_acc, epoch=epoch) 44 | print() 45 | 46 | print(f'Task: {task}') 47 | print('depth, train_acc, test_acc, epoch, train_acc, test_acc, epoch,') 48 | for depth in range(min_depth, max_depth + 1): 49 | res = results_all_depths[depth] 50 | print(f'{depth}, {res.train_acc}, {res.test_acc}, {res.epoch}') 51 | -------------------------------------------------------------------------------- /tasks/dictionary_lookup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import itertools 3 | import random 4 | import math 5 | 6 | from tasks.tree_dataset import TreeDataset 7 | import common 8 | 9 | 10 | class DictionaryLookupDataset(TreeDataset): 11 | def __init__(self, depth): 12 | super(DictionaryLookupDataset, self).__init__(depth) 13 | 14 | def get_combinations(self): 15 | # returns: an iterable of [key, permutation(leaves)] 16 | # number of combinations: (num_leaves!)*num_choices 17 | num_leaves = len(self.leaf_indices) 18 | num_permutations = 1000 19 | max_examples = 32000 20 | 21 | if self.depth > 3: 22 | per_depth_num_permutations = min(num_permutations, math.factorial(num_leaves), max_examples // num_leaves) 23 | permutations = [np.random.permutation(range(1, num_leaves + 1)) for _ in 24 | range(per_depth_num_permutations)] 25 | else: 26 | permutations = random.sample(list(itertools.permutations(range(1, num_leaves + 1))), 27 | min(num_permutations, math.factorial(num_leaves))) 28 | 29 | return itertools.chain.from_iterable( 30 | 31 | zip(range(1, num_leaves + 1), itertools.repeat(perm)) 32 | for perm in permutations) 33 | 34 | def get_nodes_features(self, combination): 35 | # combination: a list of indices 36 | # Each leaf contains a one-hot encoding of a key, and a one-hot encoding of the value 37 | # Every other node is empty, for now 38 | selected_key, values = combination 39 | 40 | # The root is [one-hot selected key] + [0 ... 
0] 41 | nodes = [ (selected_key, 0) ] 42 | 43 | for i in range(1, self.num_nodes): 44 | if i in self.leaf_indices: 45 | leaf_num = self.leaf_indices.index(i) 46 | node = (leaf_num+1, values[leaf_num]) 47 | else: 48 | node = (0, 0) 49 | nodes.append(node) 50 | return nodes 51 | 52 | def label(self, combination): 53 | selected_key, values = combination 54 | return int(values[selected_key - 1]) 55 | 56 | def get_dims(self): 57 | # get input and output dims 58 | in_dim = len(self.leaf_indices) 59 | out_dim = len(self.leaf_indices) 60 | return in_dim, out_dim 61 | -------------------------------------------------------------------------------- /tasks/tree_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch_geometric 3 | 4 | from torch_geometric.data import Data 5 | from torch.nn import functional as F 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | class TreeDataset(object): 10 | def __init__(self, depth): 11 | super(TreeDataset, self).__init__() 12 | self.depth = depth 13 | self.num_nodes, self.edges, self.leaf_indices = self._create_blank_tree() 14 | self.criterion = F.cross_entropy 15 | 16 | def add_child_edges(self, cur_node, max_node): 17 | edges = [] 18 | leaf_indices = [] 19 | stack = [(cur_node, max_node)] 20 | while len(stack) > 0: 21 | cur_node, max_node = stack.pop() 22 | if cur_node == max_node: 23 | leaf_indices.append(cur_node) 24 | continue 25 | left_child = cur_node + 1 26 | right_child = cur_node + 1 + ((max_node - cur_node) // 2) 27 | edges.append([left_child, cur_node]) 28 | edges.append([right_child, cur_node]) 29 | stack.append((right_child, max_node)) 30 | stack.append((left_child, right_child - 1)) 31 | return edges, leaf_indices 32 | 33 | def _create_blank_tree(self): 34 | max_node_id = 2 ** (self.depth + 1) - 2 35 | edges, leaf_indices = self.add_child_edges(cur_node=0, max_node=max_node_id) 36 | return max_node_id + 1, edges, leaf_indices 37 | 38 | def create_blank_tree(self, add_self_loops=True): 39 | edge_index = torch.tensor(self.edges).t() 40 | if add_self_loops: 41 | edge_index, _ = torch_geometric.utils.add_remaining_self_loops(edge_index=edge_index, ) 42 | return edge_index 43 | 44 | def generate_data(self, train_fraction): 45 | data_list = [] 46 | 47 | for comb in self.get_combinations(): 48 | edge_index = self.create_blank_tree(add_self_loops=True) 49 | nodes = torch.tensor(self.get_nodes_features(comb), dtype=torch.long) 50 | root_mask = torch.tensor([True] + [False] * (len(nodes) - 1)) 51 | label = self.label(comb) 52 | data_list.append(Data(x=nodes, edge_index=edge_index, root_mask=root_mask, y=label)) 53 | 54 | dim0, out_dim = self.get_dims() 55 | X_train, X_test = train_test_split( 56 | data_list, train_size=train_fraction, shuffle=True, stratify=[data.y for data in data_list]) 57 | 58 | 59 | return X_train, X_test, dim0, out_dim, self.criterion 60 | 61 | # Every sub-class should implement the following methods: 62 | def get_combinations(self): 63 | raise NotImplementedError 64 | 65 | def get_nodes_features(self, combination): 66 | raise NotImplementedError 67 | 68 | def label(self, combination): 69 | raise NotImplementedError 70 | 71 | def get_dims(self): 72 | raise NotImplementedError 73 | 74 | -------------------------------------------------------------------------------- /tf-gnn-samples/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 150 3 | ignore = 4 | # W605: invalid escape 
sequence -- triggered by pseudo-LaTeX in comments 5 | W605, -------------------------------------------------------------------------------- /tf-gnn-samples/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | molecules_*.json 104 | data/* -------------------------------------------------------------------------------- /tf-gnn-samples/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## CONTRIBUTING 2 | 3 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 4 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 5 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 6 | 7 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 8 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 9 | provided by the bot. You will only need to do this once across all repos using our CLA. 10 | 11 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 12 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 13 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 14 | -------------------------------------------------------------------------------- /tf-gnn-samples/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /tf-gnn-samples/gnns/__init__.py: -------------------------------------------------------------------------------- 1 | from .ggnn import sparse_ggnn_layer 2 | from .gnn_edge_mlp import sparse_gnn_edge_mlp_layer 3 | from .gnn_film import sparse_gnn_film_layer 4 | from .rgat import sparse_rgat_layer 5 | from .rgcn import sparse_rgcn_layer 6 | from .rgdcn import sparse_rgdcn_layer 7 | from .rgin import sparse_rgin_layer 8 | -------------------------------------------------------------------------------- /tf-gnn-samples/gnns/ggnn.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import tensorflow as tf 4 | 5 | from utils import get_gated_unit, get_aggregation_function 6 | 7 | 8 | def sparse_ggnn_layer(node_embeddings: tf.Tensor, 9 | adjacency_lists: List[tf.Tensor], 10 | state_dim: Optional[int], 11 | num_timesteps: int = 1, 12 | gated_unit_type: str = "gru", 13 | activation_function: str = "tanh", 14 | message_aggregation_function: str = "sum" 15 | ) -> tf.Tensor: 16 | """ 17 | Compute new graph states by neural message passing and gated units on the nodes. 18 | For this, we assume existing node states h^t_v and a list of per-edge-type adjacency 19 | matrices A_\ell. 20 | 21 | We compute new states as follows: 22 | h^{t+1}_v := Cell(h^t_v, \sum_\ell 23 | \sum_{(u, v) \in A_\ell} 24 | W_\ell * h^t_u) 25 | The learnable parameters of this are the recurrent Cell and the W_\ell \in R^{D,D}. 26 | 27 | We use the following abbreviations in shape descriptions: 28 | * V: number of nodes 29 | * D: state dimension 30 | * L: number of different edge types 31 | * E: number of edges of a given edge type 32 | 33 | Arguments: 34 | node_embeddings: float32 tensor of shape [V, D], the original representation of 35 | each node in the graph. 36 | adjacency_lists: List of L adjacency lists, represented as int32 tensors of shape 37 | [E, 2]. Concretely, adjacency_lists[l][k,:] == [v, u] means that the k-th edge 38 | of type l connects node v to node u. 39 | state_dim: Optional size of output dimension of the GNN layer. If not set, defaults 40 | to D, the dimensionality of the input. If different from the input dimension, 41 | parameter num_timesteps has to be 1. 42 | num_timesteps: Number of repeated applications of this message passing layer. 
43 | gated_unit_type: Type of the recurrent unit used (one of RNN, GRU and LSTM). 44 | activation_function: Type of activation function used. 45 | message_aggregation_function: Type of aggregation function used for messages. 46 | 47 | Returns: 48 | float32 tensor of shape [V, state_dim] 49 | """ 50 | num_nodes = tf.shape(node_embeddings, out_type=tf.int32)[0] 51 | if state_dim is None: 52 | state_dim = tf.shape(node_embeddings, out_type=tf.int32)[1] 53 | 54 | # === Prepare things we need across all timesteps: 55 | message_aggregation_fn = get_aggregation_function(message_aggregation_function) 56 | gated_cell = get_gated_unit(state_dim, gated_unit_type, activation_function) 57 | edge_type_to_message_transformation_layers = [] # Layers to compute the message from a source state 58 | edge_type_to_message_targets = [] # List of tensors of message targets 59 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(adjacency_lists): 60 | edge_type_to_message_transformation_layers.append( 61 | tf.keras.layers.Dense(units=state_dim, 62 | use_bias=False, 63 | activation=None, 64 | name="Edge_%i_Weight" % edge_type_idx)) 65 | edge_type_to_message_targets.append(adjacency_list_for_edge_type[:, 1]) 66 | 67 | # Let M be the number of messages (sum of all E): 68 | message_targets = tf.concat(edge_type_to_message_targets, axis=0) # Shape [M] 69 | 70 | cur_node_states = node_embeddings 71 | for _ in range(num_timesteps): 72 | messages = [] # list of tensors of messages of shape [E, D] 73 | message_source_states = [] # list of tensors of edge source states of shape [E, D] 74 | 75 | # Collect incoming messages per edge type 76 | for edge_type_idx, adjacency_list_for_edge_type in enumerate(adjacency_lists): 77 | edge_sources = adjacency_list_for_edge_type[:, 0] 78 | edge_source_states = tf.nn.embedding_lookup(params=cur_node_states, 79 | ids=edge_sources) # Shape [E, D] 80 | all_messages_for_edge_type = \ 81 | edge_type_to_message_transformation_layers[edge_type_idx](edge_source_states) # Shape [E,D] 82 | messages.append(all_messages_for_edge_type) 83 | message_source_states.append(edge_source_states) 84 | 85 | messages = tf.concat(messages, axis=0) # Shape [M, D] 86 | aggregated_messages = \ 87 | message_aggregation_fn(data=messages, 88 | segment_ids=message_targets, 89 | num_segments=num_nodes) # Shape [V, D] 90 | 91 | # pass updated vertex features into RNN cell 92 | new_node_states = gated_cell(aggregated_messages, [cur_node_states])[0] # Shape [V, D] 93 | cur_node_states = new_node_states 94 | 95 | return cur_node_states 96 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_graph_model import Sparse_Graph_Model 2 | from .ggnn_model import GGNN_Model 3 | from .gnn_edge_mlp_model import GNN_Edge_MLP_Model 4 | from .gnn_film_model import GNN_FiLM_Model 5 | from .rgat_model import RGAT_Model 6 | from .rgcn_model import RGCN_Model 7 | from .rgdcn_model import RGDCN_Model 8 | from .rgin_model import RGIN_Model 9 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/ggnn_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_ggnn_layer 8 | 9 | 10 | class 
GGNN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | 'graph_rnn_cell': 'GRU', # RNN, GRU, or LSTM 17 | 'graph_activation_function': "tanh", 18 | "message_aggregation_function": "sum", 19 | 'graph_layer_input_dropout_keep_prob': 1.0, 20 | 'graph_dense_between_every_num_gnn_layers': 10000, 21 | 'graph_residual_connection_every_num_layers': 10000, 22 | }) 23 | return params 24 | 25 | @staticmethod 26 | def name(params: Dict[str, Any]) -> str: 27 | return "GGNN" 28 | 29 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 30 | super().__init__(params, task, run_id, result_dir) 31 | 32 | def _apply_gnn_layer(self, 33 | node_representations: tf.Tensor, 34 | adjacency_lists: List[tf.Tensor], 35 | type_to_num_incoming_edges: tf.Tensor, 36 | num_timesteps: int) -> tf.Tensor: 37 | return sparse_ggnn_layer( 38 | node_embeddings=node_representations, 39 | adjacency_lists=adjacency_lists, 40 | state_dim=self.params['hidden_size'], 41 | num_timesteps=num_timesteps, 42 | gated_unit_type=self.params['graph_rnn_cell'], 43 | activation_function=self.params['graph_activation_function'], 44 | message_aggregation_function=self.params['message_aggregation_function'], 45 | ) 46 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/gnn_edge_mlp_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_gnn_edge_mlp_layer 8 | 9 | 10 | class GNN_Edge_MLP_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'max_nodes_in_batch': 25000, 16 | 'hidden_size': 128, 17 | "graph_activation_function": "gelu", 18 | "message_aggregation_function": "sum", 19 | 'graph_inter_layer_norm': True, 20 | 'use_target_state_as_input': True, 21 | 'num_edge_hidden_layers': 1, 22 | }) 23 | return params 24 | 25 | @staticmethod 26 | def name(params: Dict[str, Any]) -> str: 27 | return "GNN-Edge-MLP%i" % (params['num_edge_hidden_layers']) 28 | 29 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 30 | super().__init__(params, task, run_id, result_dir) 31 | 32 | def _apply_gnn_layer(self, 33 | node_representations: tf.Tensor, 34 | adjacency_lists: List[tf.Tensor], 35 | type_to_num_incoming_edges: tf.Tensor, 36 | num_timesteps: int, 37 | ) -> tf.Tensor: 38 | return sparse_gnn_edge_mlp_layer( 39 | node_embeddings=node_representations, 40 | adjacency_lists=adjacency_lists, 41 | type_to_num_incoming_edges=type_to_num_incoming_edges, 42 | state_dim=self.params['hidden_size'], 43 | num_timesteps=num_timesteps, 44 | activation_function=self.params['graph_activation_function'], 45 | message_aggregation_function=self.params['message_aggregation_function'], 46 | use_target_state_as_input=self.params['use_target_state_as_input'], 47 | num_edge_hidden_layers=self.params['num_edge_hidden_layers'] 48 | ) 49 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/gnn_film_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import 
tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_gnn_film_layer 8 | 9 | 10 | class GNN_FiLM_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | "hidden_size": 128, 16 | "graph_activation_function": "ReLU", 17 | "message_aggregation_function": "sum", 18 | "normalize_messages_by_num_incoming": False, 19 | }) 20 | return params 21 | 22 | @staticmethod 23 | def name(params: Dict[str, Any]) -> str: 24 | return "GNN-FiLM" 25 | 26 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 27 | super().__init__(params, task, run_id, result_dir) 28 | 29 | def _apply_gnn_layer(self, 30 | node_representations: tf.Tensor, 31 | adjacency_lists: List[tf.Tensor], 32 | type_to_num_incoming_edges: tf.Tensor, 33 | num_timesteps: int) -> tf.Tensor: 34 | 35 | return sparse_gnn_film_layer( 36 | node_embeddings=node_representations, 37 | adjacency_lists=adjacency_lists, 38 | type_to_num_incoming_edges=type_to_num_incoming_edges, 39 | state_dim=self.params['hidden_size'], 40 | num_timesteps=num_timesteps, 41 | activation_function=self.params['graph_activation_function'], 42 | message_aggregation_function=self.params['message_aggregation_function'], 43 | normalize_by_num_incoming=self.params["normalize_messages_by_num_incoming"], 44 | ) 45 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/no_struct_mlp_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from utils import MLP 6 | from .sparse_graph_model import Sparse_Graph_Model 7 | from tasks import Sparse_Graph_Task 8 | from gnns import sparse_gnn_edge_mlp_layer 9 | 10 | 11 | class No_Struct_MLP_Model(Sparse_Graph_Model): 12 | @classmethod 13 | def default_params(cls): 14 | params = super().default_params() 15 | params.update({ 16 | 'max_nodes_in_batch': 25000, 17 | 'hidden_size': 128, 18 | "graph_activation_function": "gelu", 19 | "message_aggregation_function": "sum", 20 | 'graph_inter_layer_norm': True, 21 | 'use_target_state_as_input': True, 22 | 'num_edge_hidden_layers': 0, 23 | }) 24 | return params 25 | 26 | @staticmethod 27 | def name(params: Dict[str, Any]) -> str: 28 | return "NoStruct-MLP%i" % (params['num_edge_hidden_layers']) 29 | 30 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 31 | super().__init__(params, task, run_id, result_dir) 32 | 33 | def _apply_gnn_layer(self, 34 | node_representations: tf.Tensor, 35 | adjacency_lists: List[tf.Tensor], 36 | type_to_num_incoming_edges: tf.Tensor, 37 | num_timesteps: int 38 | ) -> tf.Tensor: 39 | graph_to_nodes = self._Sparse_Graph_Model__placeholders['graph_to_nodes'] 40 | graph_nodes_list = self._Sparse_Graph_Model__placeholders['graph_nodes_list'] # (None, ) 41 | max_nodes = tf.shape(graph_to_nodes)[1] 42 | tiled_nodes = tf.tile(tf.expand_dims(graph_to_nodes, axis=-1), (1, 1, max_nodes)) 43 | pairs = tf.concat( 44 | [tf.expand_dims(tiled_nodes, axis=-1), tf.expand_dims(tf.transpose(tiled_nodes, [0, 2, 1]), axis=-1)], 45 | axis=-1) 46 | flat_pairs = tf.reshape(pairs, [-1, 2]) 47 | relevant_edges = tf.reshape(tf.gather(flat_pairs, tf.where(tf.reduce_min(flat_pairs, axis=-1) >= 0)), [-1, 2]) 48 | 49 | num_types = 
tf.shape(type_to_num_incoming_edges)[0] 50 | num_nodes_in_graph = tf.reduce_sum(tf.cast(tf.greater(graph_to_nodes, -1), dtype=tf.float32), axis=-1) 51 | num_incoming_nodes_per_node = tf.gather(params=num_nodes_in_graph, indices=graph_nodes_list) 52 | type_to_num_incoming_edges = tf.tile(tf.expand_dims(num_incoming_nodes_per_node, axis=0), [num_types, 1]) 53 | 54 | return sparse_gnn_edge_mlp_layer( 55 | node_embeddings=node_representations, 56 | adjacency_lists=[relevant_edges for _ in adjacency_lists], 57 | type_to_num_incoming_edges=type_to_num_incoming_edges, 58 | state_dim=self.params['hidden_size'], 59 | num_timesteps=num_timesteps, 60 | activation_function=self.params['graph_activation_function'], 61 | message_aggregation_function=self.params['message_aggregation_function'], 62 | use_target_state_as_input=self.params['use_target_state_as_input'], 63 | num_edge_hidden_layers=self.params['num_edge_hidden_layers'], 64 | ) 65 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgat_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgat_layer 8 | 9 | 10 | class RGAT_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | 'num_heads': 4, 17 | 'graph_activation_function': 'tanh', 18 | 'graph_layer_input_dropout_keep_prob': 1.0, 19 | 'graph_dense_between_every_num_gnn_layers': 10000, 20 | 'graph_residual_connection_every_num_layers': 10000, 21 | }) 22 | return params 23 | 24 | @staticmethod 25 | def name(params: Dict[str, Any]) -> str: 26 | return "RGAT" 27 | 28 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 29 | super().__init__(params, task, run_id, result_dir) 30 | 31 | def _apply_gnn_layer(self, 32 | node_representations: tf.Tensor, 33 | adjacency_lists: List[tf.Tensor], 34 | type_to_num_incoming_edges: tf.Tensor, 35 | num_timesteps: int) -> tf.Tensor: 36 | return sparse_rgat_layer( 37 | node_embeddings=node_representations, 38 | adjacency_lists=adjacency_lists, 39 | state_dim=self.params['hidden_size'], 40 | num_timesteps=num_timesteps, 41 | num_heads=self.params['num_heads'], 42 | activation_function=self.params['graph_activation_function'], 43 | ) 44 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgcn_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgcn_layer 8 | 9 | 10 | class RGCN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | "graph_activation_function": "ReLU", 17 | "message_aggregation_function": "sum", 18 | 'graph_layer_input_dropout_keep_prob': 1.0, 19 | 'graph_dense_between_every_num_gnn_layers': 10000, 20 | 'graph_residual_connection_every_num_layers': 10000, 21 | }) 22 | return params 23 | 24 | @staticmethod 25 | def name(params: Dict[str, Any]) -> str: 26 | return "RGCN" 27 | 28 | def __init__(self, params: 
Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 29 | super().__init__(params, task, run_id, result_dir) 30 | 31 | def _apply_gnn_layer(self, 32 | node_representations: tf.Tensor, 33 | adjacency_lists: List[tf.Tensor], 34 | type_to_num_incoming_edges: tf.Tensor, 35 | num_timesteps: int) -> tf.Tensor: 36 | return sparse_rgcn_layer( 37 | node_embeddings=node_representations, 38 | adjacency_lists=adjacency_lists, 39 | type_to_num_incoming_edges=type_to_num_incoming_edges, 40 | state_dim=self.params['hidden_size'], 41 | num_timesteps=num_timesteps, 42 | activation_function=self.params['graph_activation_function'], 43 | message_aggregation_function=self.params['message_aggregation_function'] 44 | ) 45 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgdcn_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgdcn_layer 8 | 9 | 10 | class RGDCN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'max_nodes_in_batch': 25000, 16 | 'hidden_size': 128, 17 | 'num_channels': 8, 18 | "use_full_state_for_channel_weights": False, 19 | "tie_channel_weights": False, 20 | "graph_activation_function": "ReLU", 21 | "message_aggregation_function": "sum", 22 | 'graph_inter_layer_norm': True, 23 | }) 24 | return params 25 | 26 | @staticmethod 27 | def name(params: Dict[str, Any]) -> str: 28 | return "RGDCN" 29 | 30 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 31 | params['channel_dim'] = params['hidden_size'] // params['num_channels'] 32 | super().__init__(params, task, run_id, result_dir) 33 | 34 | def _apply_gnn_layer(self, 35 | node_representations: tf.Tensor, 36 | adjacency_lists: List[tf.Tensor], 37 | type_to_num_incoming_edges: tf.Tensor, 38 | num_timesteps: int) -> tf.Tensor: 39 | return sparse_rgdcn_layer( 40 | node_embeddings=node_representations, 41 | adjacency_lists=adjacency_lists, 42 | type_to_num_incoming_edges=type_to_num_incoming_edges, 43 | num_channels=self.params['num_channels'], 44 | channel_dim=self.params['channel_dim'], 45 | num_timesteps=num_timesteps, 46 | use_full_state_for_channel_weights=self.params['use_full_state_for_channel_weights'], 47 | tie_channel_weights=self.params['tie_channel_weights'], 48 | activation_function=self.params['graph_activation_function'], 49 | message_aggregation_function=self.params['message_aggregation_function'], 50 | ) 51 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/rgin_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | 3 | import tensorflow as tf 4 | 5 | from .sparse_graph_model import Sparse_Graph_Model 6 | from tasks import Sparse_Graph_Task 7 | from gnns import sparse_rgin_layer 8 | 9 | 10 | class RGIN_Model(Sparse_Graph_Model): 11 | @classmethod 12 | def default_params(cls): 13 | params = super().default_params() 14 | params.update({ 15 | 'hidden_size': 128, 16 | "graph_activation_function": "ReLU", 17 | 'message_aggregation_function': "sum", 18 | 'graph_dense_between_every_num_gnn_layers': 10000, 19 | 'graph_inter_layer_norm': True, 20 | 
'use_target_state_as_input': False, 21 | 'graph_num_edge_MLP_hidden_layers': 1, 22 | 'graph_num_aggr_MLP_hidden_layers': None, 23 | }) 24 | return params 25 | 26 | @staticmethod 27 | def name(params: Dict[str, Any]) -> str: 28 | return "RGIN" 29 | 30 | def __init__(self, params: Dict[str, Any], task: Sparse_Graph_Task, run_id: str, result_dir: str) -> None: 31 | super().__init__(params, task, run_id, result_dir) 32 | 33 | def _apply_gnn_layer(self, 34 | node_representations: tf.Tensor, 35 | adjacency_lists: List[tf.Tensor], 36 | type_to_num_incoming_edges: tf.Tensor, 37 | num_timesteps: int 38 | ) -> tf.Tensor: 39 | return sparse_rgin_layer( 40 | node_embeddings=node_representations, 41 | adjacency_lists=adjacency_lists, 42 | state_dim=self.params['hidden_size'], 43 | num_timesteps=num_timesteps, 44 | activation_function=self.params['graph_activation_function'], 45 | message_aggregation_function=self.params['message_aggregation_function'], 46 | use_target_state_as_input=self.params['use_target_state_as_input'], 47 | num_edge_MLP_hidden_layers=self.params['graph_num_edge_MLP_hidden_layers'], 48 | num_aggr_MLP_hidden_layers=self.params['graph_num_aggr_MLP_hidden_layers'], 49 | ) 50 | -------------------------------------------------------------------------------- /tf-gnn-samples/models/self_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class SelfAttention: 6 | def __init__(self, 7 | num_heads, 8 | model_dim, 9 | dropout_keep_prob): 10 | 11 | self.num_heads = num_heads 12 | self.model_dim = model_dim 13 | self.dropout_keep_prob = dropout_keep_prob 14 | 15 | def multi_head(self, batched_inputs, valid_mask=None): 16 | q = self._linear_projection(batched_inputs) 17 | qs = self._split_heads(q) 18 | tiled_inputs = tf.tile(tf.expand_dims(batched_inputs, axis=1), [1, self.num_heads, 1, 1]) 19 | outputs = self._scaled_dot_product(qs, tiled_inputs, tiled_inputs, valid_mask) # (batch, num_heads, max_contexts, value_dim) 20 | output = self._concat_heads(outputs) # (batch, max_contexts, value_dim * num_heads) 21 | output = tf.layers.dense(output, units=self.model_dim, use_bias=False, 22 | activation=tf.nn.relu) # (batch, max_contexts, model_dim) 23 | 24 | output = tf.nn.dropout(output, keep_prob=self.dropout_keep_prob) 25 | return output 26 | 27 | def _linear_projection(self, batched_inputs): 28 | q = tf.layers.dense(batched_inputs, units=self.model_dim * self.num_heads, 29 | use_bias=False) # (batch, max_contexts, key_dim * num_heads) 30 | # k = tf.layers.dense(batched_inputs, units=self.model_dim, 31 | # use_bias=False) # (batch, max_contexts, key_dim * num_heads) 32 | return q 33 | 34 | def _split_heads(self, q): 35 | 36 | def split_last_dimension_then_transpose(tensor, num_heads, dim): 37 | tensor = tf.reshape(tensor, [-1, tf.shape(tensor)[1], num_heads, 38 | dim]) # (batch, max_contexts, num_heads, dim) 39 | return tf.transpose(tensor, [0, 2, 1, 3]) # (batch, num_heads, max_contexts, dim) 40 | 41 | qs = split_last_dimension_then_transpose(q, self.num_heads, 42 | self.model_dim) # (batch, num_heads, max_contexts, key_dim) 43 | # ks = split_last_dimension_then_transpose(k, self.num_heads, 44 | # self.model_dim) # (batch, num_heads, max_contexts, key_dim) 45 | return qs 46 | 47 | def _scaled_dot_product(self, qs, ks, tiled_inputs, valid_mask): 48 | queries_dot_keys = tf.matmul(qs, ks, transpose_b=True) # (batch, num_heads, max_contexts, max_contexts) 49 | scaled_scores = queries_dot_keys #/ 
((self.model_dim // self.num_heads) ** 0.5) # (batch, num_heads, max_contexts, max_contexts) 50 | 51 | if valid_mask is not None: 52 | mask = tf.log(tf.reshape(valid_mask, ( 53 | tf.shape(valid_mask)[0], 1, 1, tf.shape(valid_mask)[1]))) # (batch, 1, 1, max_contexts) 54 | scaled_scores += mask 55 | 56 | attention_weights = tf.nn.softmax(scaled_scores, axis=-1) # (batch, num_heads, max_contexts, max_contexts) 57 | return tf.matmul(attention_weights, tiled_inputs) # (batch, num_heads, max_contexts, value_dim) 58 | 59 | def _concat_heads(self, outputs): 60 | # outputs: (batch, num_heads, max_contexts, value_dim) 61 | max_contexts = tf.shape(outputs)[2] 62 | tensor = tf.transpose(outputs, [0, 2, 1, 3]) # [batch, max_contexts, num_heads, value_dim // num_heads] 63 | return tf.reshape(tensor, [-1, max_contexts, self.model_dim * self.num_heads]) 64 | 65 | 66 | if __name__ == '__main__': 67 | sess = tf.InteractiveSession() 68 | selfatt = SelfAttention(num_heads=2, model_dim=4, dropout_keep_prob=1.0) 69 | result_op = selfatt.multi_head(tf.constant(np.arange(24).reshape((2, 3, 4)), dtype=tf.float32)) 70 | sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer())) 71 | result = sess.run(result_op) 72 | print(result.shape) 73 | print(result) -------------------------------------------------------------------------------- /tf-gnn-samples/requirements.txt: -------------------------------------------------------------------------------- 1 | docopt 2 | numpy 3 | dpu-utils>=0.1.30 4 | tensorflow-gpu>=1.13.1 -------------------------------------------------------------------------------- /tf-gnn-samples/run_ppi_benchs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_ppi_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 10] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | 21 | TEST_RES_RE = re.compile('^Metrics: Avg MicroF1: (0.\d+)') 22 | TIME_RE = re.compile('^Training took (\d+)s') 23 | 24 | 25 | def run(args): 26 | target_dir = args['LOG_TARGET_DIR'] 27 | os.makedirs(target_dir, exist_ok=True) 28 | print("Starting PPI experiments, will write logfiles for runs into %s." % target_dir) 29 | num_seeds = int(args.get('--num-runs')) 30 | print("| %- 13s | %- 17s | %- 10s |" % ("Model", "Avg. MicroF1", "Avg. 
Time")) 31 | print("|" + "-" * 15 + "|" + "-" * 19 + "|" + "-" * 12 + "|") 32 | for model in MODEL_TYPES: 33 | model_f1s = [] 34 | model_times = [] 35 | for seed in range(1, 1 + num_seeds): 36 | logfile = os.path.join(target_dir, "%s_seed%i.txt" % (model.lower(), seed)) 37 | with open(logfile, "w") as log_fh: 38 | subprocess.check_call(["python", 39 | "train.py", 40 | "--quiet", 41 | "--run-test", 42 | model, 43 | "PPI", 44 | "--model-param-overrides", 45 | "{\"random_seed\": %i}" % seed, 46 | ], 47 | stdout=log_fh, 48 | stderr=log_fh) 49 | with open(logfile, "r") as log_fh: 50 | for line in log_fh.readlines(): 51 | time_match = TIME_RE.search(line) 52 | res_match = TEST_RES_RE.search(line) 53 | if time_match is not None: 54 | model_times.append(int(time_match.groups()[0])) 55 | elif res_match is not None: 56 | model_f1s.append(float(res_match.groups()[0])) 57 | 58 | print("| %- 13s | %.3f (+/- %.3f) | % 4.1f |" 59 | % (model, 60 | np.mean(model_f1s), 61 | np.std(model_f1s), 62 | np.mean(model_times))) 63 | 64 | 65 | if __name__ == "__main__": 66 | args = docopt(__doc__) 67 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 68 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_qm9_benchs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_qm9_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | TASKS = ["mu", "alpha", "HOMO", "LUMO", "gap", "R2", "ZPVE", "U0", "U", "H", "G", "Cv", "Omega"] 21 | 22 | TEST_RES_RE = re.compile('^Metrics: MAEs: \d+:([0-9.]+) \| Error Ratios: \d+:([0-9.]+)') 23 | TIME_RE = re.compile('^Training took (\d+)s') 24 | 25 | 26 | def run(args): 27 | target_dir = args['LOG_TARGET_DIR'] 28 | os.makedirs(target_dir, exist_ok=True) 29 | print("Starting QM9 experiments, will write logfiles for runs into %s." 
% target_dir) 30 | num_seeds = int(args.get('--num-runs')) 31 | results = {} 32 | for model in MODEL_TYPES: 33 | results[model] = [{"test_errors": [], "times": []} for _ in TASKS] 34 | for task_id in range(len(TASKS)): 35 | for seed in range(1, 1 + num_seeds): 36 | logfile = os.path.join(target_dir, "%s_task%i_seed%i.txt" % (model, task_id, seed)) 37 | with open(logfile, "w") as log_fh: 38 | subprocess.check_call(["python", 39 | "train.py", 40 | "--run-test", 41 | model, 42 | "QM9", 43 | "--model-param-overrides", 44 | "{\"random_seed\": %i}" % seed, 45 | "--task-param-overrides", 46 | "{\"task_ids\": [%i]}" % task_id, 47 | ], 48 | stdout=log_fh, 49 | stderr=log_fh) 50 | with open(logfile, "r") as log_fh: 51 | for line in log_fh.readlines(): 52 | time_match = TIME_RE.search(line) 53 | res_match = TEST_RES_RE.search(line) 54 | if time_match is not None: 55 | results[model][task_id]["times"].append(int(time_match.groups()[0])) 56 | elif res_match is not None: 57 | results[model][task_id]["test_errors"].append(float(res_match.groups()[1])) 58 | 59 | row_fmt_string = "%7s " + "&% 35s " * len(MODEL_TYPES) + "\\\\" 60 | print(row_fmt_string % tuple([""] + MODEL_TYPES)) 61 | for task_id, task in enumerate(TASKS): 62 | model_results = [] 63 | for model in MODEL_TYPES: 64 | err = np.mean(results[model][task_id]["test_errors"]) 65 | std = np.std(results[model][task_id]["test_errors"]) 66 | time_in_min = np.mean(results[model][task_id]["times"]) / 60 67 | model_results.append("%.2f & ($\pm %.2f$; $%.1f$min)" % (err, std, time_in_min)) 68 | print(row_fmt_string % tuple([task] + model_results)) 69 | 70 | 71 | if __name__ == "__main__": 72 | args = docopt(__doc__) 73 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 74 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_qm9_benchs_fa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_qm9_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | #MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | MODEL_TYPES = ["GNN-Edge-MLP0"] #, "RGAT", "GNN_FiLM"] # ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 21 | TASKS = ["mu", "alpha", "HOMO", "LUMO", "gap", "R2", "ZPVE", "U0", "U", "H", "G", "Cv", "Omega"] 22 | 23 | TEST_RES_RE = re.compile('^Metrics: MAEs: \d+:([0-9.]+) \| Error Ratios: \d+:([0-9.]+)') 24 | TIME_RE = re.compile('^Training took (\d+)s') 25 | 26 | 27 | def run(args): 28 | target_dir = args['LOG_TARGET_DIR'] 29 | os.makedirs(target_dir, exist_ok=True) 30 | print("Starting QM9 experiments, will write logfiles for runs into %s." 
% target_dir) 31 | num_seeds = int(args.get('--num-runs')) 32 | results = {} 33 | for model in MODEL_TYPES: 34 | results[model] = [{"test_errors": [], "times": []} for _ in TASKS] 35 | for task_id in range(len(TASKS)): 36 | for seed in range(1, 1 + num_seeds): 37 | logfile = os.path.join(target_dir, "%s_task%i_seed%i.txt" % (model, task_id, seed)) 38 | with open(logfile, "w") as log_fh: 39 | subprocess.check_call(["python", 40 | "train.py", 41 | "--run-test", 42 | model, 43 | "QM9", 44 | "--model-param-overrides", 45 | "{\"random_seed\": %i,\"last_layer_fa\":true,\"max_nodes_in_batch\":30000}" % seed, 46 | "--task-param-overrides", 47 | "{\"task_ids\": [%i]}" % task_id, 48 | ], 49 | stdout=log_fh, 50 | stderr=log_fh) 51 | with open(logfile, "r") as log_fh: 52 | for line in log_fh.readlines(): 53 | time_match = TIME_RE.search(line) 54 | res_match = TEST_RES_RE.search(line) 55 | if time_match is not None: 56 | results[model][task_id]["times"].append(int(time_match.groups()[0])) 57 | elif res_match is not None: 58 | results[model][task_id]["test_errors"].append(float(res_match.groups()[1])) 59 | 60 | row_fmt_string = "%7s " + "&% 35s " * len(MODEL_TYPES) + "\\\\" 61 | print(row_fmt_string % tuple([""] + MODEL_TYPES)) 62 | for task_id, task in enumerate(TASKS): 63 | model_results = [] 64 | for model in MODEL_TYPES: 65 | err = np.mean(results[model][task_id]["test_errors"]) 66 | std = np.std(results[model][task_id]["test_errors"]) 67 | time_in_min = np.mean(results[model][task_id]["times"]) / 60 68 | model_results.append("%.2f & ($\pm %.2f$; $%.1f$min)" % (err, std, time_in_min)) 69 | print(row_fmt_string % tuple([task] + model_results)) 70 | 71 | 72 | if __name__ == "__main__": 73 | args = docopt(__doc__) 74 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 75 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_varmisuse_benchs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_varmisuse_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | 21 | TEST_RES_RE = re.compile('^Metrics: Accuracy: (0.\d+)') 22 | VALID_RES_RE = re.compile('Best validation results: Accuracy: (0.\d+)') 23 | MODEL_FILE_RE = re.compile('^Loading model from file (.+)\.') 24 | 25 | 26 | def run(args): 27 | target_dir = args['LOG_TARGET_DIR'] 28 | os.makedirs(target_dir, exist_ok=True) 29 | print("Starting VarMisuse experiments, will write logfiles for runs into %s." 
% target_dir) 30 | num_seeds = int(args.get('--num-runs')) 31 | print("| %- 14s | %- 17s | %- 17s | %- 17s |" % ("Model", 32 | "Valid Acc", 33 | "Test Acc", 34 | "TestOnly Acc")) 35 | print("|" + "-" * 16 + "|" + "-" * 19 + "|" + "-" * 19 + "|" + "-" * 19 + "|") 36 | for model in MODEL_TYPES: 37 | valid_accs, test_accs, testonly_accs = [], [], [] 38 | for seed in range(1, 1 + num_seeds): 39 | logfile = os.path.join(target_dir, "%s_seed%i.txt" % (model.lower(), seed)) 40 | test_logfile = os.path.join(target_dir, "%s_seed%i-testonly.txt" % (model.lower(), seed)) 41 | with open(logfile, "w") as log_fh: 42 | subprocess.check_call(["python", 43 | "train.py", 44 | "--quiet", 45 | "--run-test", 46 | model, 47 | "VarMisuse", 48 | "--model-param-overrides", 49 | "{\"random_seed\": %i}" % seed, 50 | ], 51 | stdout=log_fh, 52 | stderr=log_fh) 53 | model_file = None 54 | with open(logfile, "r") as log_fh: 55 | for line in log_fh.readlines(): 56 | valid_res_match = VALID_RES_RE.search(line) 57 | test_res_match = TEST_RES_RE.search(line) 58 | model_file_match = MODEL_FILE_RE.search(line) 59 | if valid_res_match is not None: 60 | valid_accs.append(float(valid_res_match.groups()[0])) 61 | elif test_res_match is not None: 62 | test_accs.append(float(test_res_match.groups()[0])) 63 | elif model_file_match is not None: 64 | model_file = model_file_match.groups()[0] 65 | 66 | # Run TestOnly 67 | assert model_file is not None, "Could not find saved model file" 68 | with open(test_logfile, "w") as log_fh: 69 | subprocess.check_call(["python", 70 | "test.py", 71 | "--quiet", 72 | model_file, 73 | "data/varmisuse/graphs-testonly", 74 | ], 75 | stdout=log_fh, 76 | stderr=log_fh) 77 | with open(test_logfile, "r") as log_fh: 78 | for line in log_fh.readlines(): 79 | test_res_match = TEST_RES_RE.search(line) 80 | if test_res_match is not None: 81 | testonly_accs.append(float(test_res_match.groups()[0])) 82 | 83 | print("| %- 14s | %.3f (+/- %.3f) | %.3f (+/- %.3f) | %.3f (+/- %.3f) |" 84 | % (model, 85 | np.mean(valid_accs), 86 | np.std(valid_accs), 87 | np.mean(test_accs), 88 | np.std(test_accs), 89 | np.mean(testonly_accs), 90 | np.std(testonly_accs), 91 | )) 92 | 93 | 94 | if __name__ == "__main__": 95 | args = docopt(__doc__) 96 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 97 | -------------------------------------------------------------------------------- /tf-gnn-samples/run_varmisuse_benchs_fa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | run_varmisuse_benchs.py [options] LOG_TARGET_DIR 5 | 6 | Options: 7 | -h --help Show this screen. 8 | --num-runs NUM Number of runs to perform for each configuration. [default: 5] 9 | --debug Turn on debugger. 10 | """ 11 | import os 12 | import subprocess 13 | import re 14 | import numpy as np 15 | 16 | from docopt import docopt 17 | from dpu_utils.utils import run_and_debug 18 | 19 | #MODEL_TYPES = ["GGNN", "RGCN", "RGAT", "RGIN", "GNN-Edge-MLP0", "GNN-Edge-MLP1", "GNN_FiLM"] 20 | MODEL_TYPES = ["RGAT"] 21 | 22 | TEST_RES_RE = re.compile('^Metrics: Accuracy: (0.\d+)') 23 | VALID_RES_RE = re.compile('Best validation results: Accuracy: (0.\d+)') 24 | MODEL_FILE_RE = re.compile('^Loading model from file (.+)\.') 25 | 26 | 27 | def run(args): 28 | target_dir = args['LOG_TARGET_DIR'] 29 | os.makedirs(target_dir, exist_ok=True) 30 | print("Starting VarMisuse experiments, will write logfiles for runs into %s." 
% target_dir) 31 | num_seeds = int(args.get('--num-runs')) 32 | print("| %- 14s | %- 17s | %- 17s | %- 17s |" % ("Model", 33 | "Valid Acc", 34 | "Test Acc", 35 | "TestOnly Acc")) 36 | print("|" + "-" * 16 + "|" + "-" * 19 + "|" + "-" * 19 + "|" + "-" * 19 + "|") 37 | for model in MODEL_TYPES: 38 | valid_accs, test_accs, testonly_accs = [], [], [] 39 | for seed in range(1, 1 + num_seeds): 40 | logfile = os.path.join(target_dir, "%s_seed%i.txt" % (model.lower(), seed)) 41 | test_logfile = os.path.join(target_dir, "%s_seed%i-testonly.txt" % (model.lower(), seed)) 42 | with open(logfile, "w") as log_fh: 43 | subprocess.check_call(["python", 44 | "train.py", 45 | "--quiet", 46 | "--run-test", 47 | model, 48 | "VarMisuse", 49 | "--model-param-overrides", 50 | "{\"random_seed\": %i,\"last_layer_fa\":true}" % seed, 51 | ], 52 | stdout=log_fh, 53 | stderr=log_fh) 54 | model_file = None 55 | with open(logfile, "r") as log_fh: 56 | for line in log_fh.readlines(): 57 | valid_res_match = VALID_RES_RE.search(line) 58 | test_res_match = TEST_RES_RE.search(line) 59 | model_file_match = MODEL_FILE_RE.search(line) 60 | if valid_res_match is not None: 61 | valid_accs.append(float(valid_res_match.groups()[0])) 62 | elif test_res_match is not None: 63 | test_accs.append(float(test_res_match.groups()[0])) 64 | elif model_file_match is not None: 65 | model_file = model_file_match.groups()[0] 66 | 67 | # Run TestOnly 68 | assert model_file is not None, "Could not find saved model file" 69 | with open(test_logfile, "w") as log_fh: 70 | subprocess.check_call(["python", 71 | "test.py", 72 | "--quiet", 73 | model_file, 74 | "data/varmisuse/graphs-testonly", 75 | ], 76 | stdout=log_fh, 77 | stderr=log_fh) 78 | with open(test_logfile, "r") as log_fh: 79 | for line in log_fh.readlines(): 80 | test_res_match = TEST_RES_RE.search(line) 81 | if test_res_match is not None: 82 | testonly_accs.append(float(test_res_match.groups()[0])) 83 | 84 | print("| %- 14s | %.3f (+/- %.3f) | %.3f (+/- %.3f) | %.3f (+/- %.3f) |" 85 | % (model, 86 | np.mean(valid_accs), 87 | np.std(valid_accs), 88 | np.mean(test_accs), 89 | np.std(test_accs), 90 | np.mean(testonly_accs), 91 | np.std(testonly_accs), 92 | )) 93 | 94 | 95 | if __name__ == "__main__": 96 | args = docopt(__doc__) 97 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 98 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .sparse_graph_task import Sparse_Graph_Task, DataFold 2 | from .qm9_task import QM9_Task 3 | from .citation_network_task import Citation_Network_Task 4 | from .ppi_task import PPI_Task 5 | from .varmisuse_task import VarMisuse_Task 6 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GGNN.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 3, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 12500, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GNN-Edge-MLP0.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 5, 3 | "hidden_size": 256, 4 | "max_nodes_in_batch": 6000, 5 | 
"graph_layer_input_dropout_keep_prob": 0.8 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GNN-Edge-MLP1.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 4, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 6000, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_GNN-FiLM.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 4, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 6000, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_RGAT.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 3, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 11000, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_RGCN.json: -------------------------------------------------------------------------------- 1 | {"task_params": {}, 2 | "model_params": {"graph_num_layers": 4, 3 | "hidden_size": 320, 4 | "max_nodes_in_batch": 12500, 5 | "graph_layer_input_dropout_keep_prob": 0.9 6 | } 7 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/PPI_RGIN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "patience": 25, 7 | "graph_num_layers": 5, 8 | "hidden_size": 256, 9 | "max_nodes_in_batch": 8000, 10 | "graph_num_edge_MLP_hidden_layers": 1, 11 | "graph_num_aggr_MLP_hidden_layers": null, 12 | "graph_layer_input_dropout_keep_prob": 0.8 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GGNN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "graph_dense_between_every_num_gnn_layers": 32, 5 | "learning_rate": 0.0008471209461829375, 6 | "graph_inter_layer_norm": true, 7 | "graph_activation_function": "relu", 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_model_activation_function": "tanh", 10 | "momentum": 0.85, 11 | "optimizer": "RMSProp", 12 | "clamp_gradient_norm": 1.0, 13 | "patience": 25, 14 | "max_epochs": 10000, 15 | "graph_rnn_cell": "RNN", 16 | "graph_layer_input_dropout_keep_prob": 1.0, 17 | "graph_num_layers": 6, 18 | "message_aggregation_function": "sum", 19 | "graph_residual_connection_every_num_layers": 2, 20 | "hidden_size": 128, 21 | "max_nodes_in_batch": 50000, 22 | "learning_rate_decay": 0.98 23 | } 24 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GNN-Edge-MLP0.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "max_nodes_in_batch": 50000, 5 | "graph_num_layers": 8, 6 | 
"graph_num_timesteps_per_layer": 1, 7 | "graph_layer_input_dropout_keep_prob": 0.9, 8 | "graph_dense_between_every_num_gnn_layers": 32, 9 | "graph_model_activation_function": "tanh", 10 | "graph_residual_connection_every_num_layers": 2, 11 | "graph_inter_layer_norm": true, 12 | "max_epochs": 10000, 13 | "patience": 25, 14 | "optimizer": "RMSProp", 15 | "learning_rate": 0.0005072060718321982, 16 | "learning_rate_decay": 0.98, 17 | "lr_for_num_graphs_per_batch": null, 18 | "momentum": 0.85, 19 | "clamp_gradient_norm": 1.0, 20 | "hidden_size": 128, 21 | "graph_activation_function": "relu", 22 | "message_aggregation_function": "sum", 23 | "graph_message_weights_dropout_ratio": 0.0, 24 | "use_target_state_as_input": true, 25 | "num_edge_hidden_layers": 0 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GNN-Edge-MLP1.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "max_nodes_in_batch": 50000, 5 | "graph_num_layers": 8, 6 | "graph_num_timesteps_per_layer": 1, 7 | "graph_layer_input_dropout_keep_prob": 0.9, 8 | "graph_dense_between_every_num_gnn_layers": 32, 9 | "graph_model_activation_function": "tanh", 10 | "graph_residual_connection_every_num_layers": 2, 11 | "graph_inter_layer_norm": false, 12 | "max_epochs": 10000, 13 | "patience": 25, 14 | "optimizer": "Adam", 15 | "learning_rate": 0.0006482335154980316, 16 | "learning_rate_decay": 0.98, 17 | "lr_for_num_graphs_per_batch": null, 18 | "momentum": 0.85, 19 | "clamp_gradient_norm": 1.0, 20 | "hidden_size": 128, 21 | "graph_activation_function": "gelu", 22 | "message_aggregation_function": "sum", 23 | "graph_message_weights_dropout_ratio": 0.0, 24 | "use_target_state_as_input": true, 25 | "num_edge_hidden_layers": 1 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_GNN-FiLM.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "message_aggregation_function": "sum", 5 | "graph_activation_function": "elu", 6 | "momentum": 0.85, 7 | "learning_rate_decay": 0.98, 8 | "patience": 25, 9 | "normalize_messages_by_num_incoming": false, 10 | "max_epochs": 10000, 11 | "graph_num_timesteps_per_layer": 1, 12 | "optimizer": "RMSProp", 13 | "hidden_size": 128, 14 | "graph_num_layers": 8, 15 | "graph_residual_connection_every_num_layers": 2, 16 | "graph_layer_input_dropout_keep_prob": 0.9, 17 | "learning_rate": 0.0006654723503723253, 18 | "graph_inter_layer_norm": true, 19 | "graph_dense_between_every_num_gnn_layers": 32, 20 | "max_nodes_in_batch": 50000, 21 | "graph_model_activation_function": "tanh", 22 | "clamp_gradient_norm": 1.0 23 | } 24 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_NoStruct-MLP0.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "max_nodes_in_batch": 50000, 5 | "graph_num_layers": 8, 6 | "graph_num_timesteps_per_layer": 1, 7 | "graph_layer_input_dropout_keep_prob": 0.9, 8 | "graph_dense_between_every_num_gnn_layers": 32, 9 | "graph_model_activation_function": "tanh", 10 | "graph_residual_connection_every_num_layers": 2, 11 | "graph_inter_layer_norm": true, 12 | "max_epochs": 10000, 13 | "patience": 25, 14 | "optimizer": 
"RMSProp", 15 | "learning_rate": 0.0005072060718321982, 16 | "learning_rate_decay": 0.98, 17 | "lr_for_num_graphs_per_batch": null, 18 | "momentum": 0.85, 19 | "clamp_gradient_norm": 1.0, 20 | "hidden_size": 128, 21 | "graph_activation_function": "relu", 22 | "message_aggregation_function": "sum", 23 | "graph_message_weights_dropout_ratio": 0.0, 24 | "use_target_state_as_input": true, 25 | "num_edge_hidden_layers": 0 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_RGAT.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "graph_model_activation_function": "tanh", 5 | "patience": 25, 6 | "optimizer": "RMSProp", 7 | "graph_activation_function": "elu", 8 | "learning_rate_decay": 0.98, 9 | "max_nodes_in_batch": 50000, 10 | "graph_layer_input_dropout_keep_prob": 0.9, 11 | "graph_inter_layer_norm": false, 12 | "clamp_gradient_norm": 1.0, 13 | "graph_num_layers": 8, 14 | "momentum": 0.85, 15 | "graph_dense_between_every_num_gnn_layers": 32, 16 | "hidden_size": 128, 17 | "graph_residual_connection_every_num_layers": 2, 18 | "num_heads": 8, 19 | "learning_rate": 0.0005800837190772856, 20 | "graph_num_timesteps_per_layer": 1, 21 | "max_epochs": 10000 22 | } 23 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_RGCN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": {}, 3 | "model_params": { 4 | "graph_residual_connection_every_num_layers": 2, 5 | "max_nodes_in_batch": 50000, 6 | "graph_num_layers": 8, 7 | "graph_model_activation_function": "tanh", 8 | "graph_layer_input_dropout_keep_prob": 1.0, 9 | "graph_activation_function": "leaky_relu", 10 | "graph_num_timesteps_per_layer": 1, 11 | "learning_rate_decay": 0.98, 12 | "max_epochs": 10000, 13 | "momentum": 0.85, 14 | "message_aggregation_function": "sum", 15 | "graph_dense_between_every_num_gnn_layers": 32, 16 | "learning_rate": 0.0005720408870458782, 17 | "graph_inter_layer_norm": true, 18 | "hidden_size": 128, 19 | "clamp_gradient_norm": 1.0, 20 | "patience": 25, 21 | "optimizer": "RMSProp" 22 | } 23 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/QM9_RGIN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 6, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 32, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": false, 14 | "max_epochs": 10000, 15 | "patience": 25, 16 | "optimizer": "RMSProp", 17 | "learning_rate": 0.000700776770702023, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": null, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "hidden_size": 128, 23 | "graph_activation_function": "elu", 24 | "message_aggregation_function": "sum", 25 | "use_target_state_as_input": false, 26 | "graph_num_edge_MLP_hidden_layers": 1, 27 | "graph_num_aggr_MLP_hidden_layers": null 28 | } 29 | } -------------------------------------------------------------------------------- 
/tf-gnn-samples/tasks/default_hypers/VarMisuse_GGNN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 100000, 7 | "graph_num_layers": 6, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.8, 10 | "graph_message_weights_dropout_ratio": 0.0, 11 | "graph_dense_between_every_num_gnn_layers": 10000, 12 | "graph_model_activation_function": "tanh", 13 | "graph_residual_connection_every_num_layers": 10000, 14 | "graph_inter_layer_norm": false, 15 | "max_epochs": 10000, 16 | "patience": 5, 17 | "optimizer": "Adam", 18 | "learning_rate": 0.00015, 19 | "learning_rate_decay": 0.98, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 96, 24 | "graph_rnn_cell": "GRU", 25 | "graph_activation_function": "tanh", 26 | "message_aggregation_function": "sum" 27 | } 28 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_GNN-Edge-MLP0.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 8, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "gelu", 25 | "message_aggregation_function": "sum", 26 | "graph_message_weights_dropout_ratio": 0.0, 27 | "use_target_state_as_input": true, 28 | "num_edge_hidden_layers": 0 29 | } 30 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_GNN-Edge-MLP1.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 45000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "gelu", 25 | "message_aggregation_function": "sum", 26 | "graph_message_weights_dropout_ratio": 0.0, 27 | "use_target_state_as_input": true, 28 | "num_edge_hidden_layers": 1 29 | } 30 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_GNN-FiLM.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 60000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.8, 10 | "graph_message_weights_dropout_ratio": 0.0, 11 | "graph_dense_between_every_num_gnn_layers": 1, 12 | "graph_model_activation_function": "tanh", 13 | "graph_residual_connection_every_num_layers": 2, 14 | "graph_inter_layer_norm": false, 15 | "max_epochs": 10000, 16 | "patience": 5, 17 | "optimizer": "Adam", 18 | "learning_rate": 0.00015, 19 | "learning_rate_decay": 0.98, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "ReLU", 25 | "message_aggregation_function": "sum", 26 | "normalize_messages_by_num_incoming": false 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_NoStruct-MLP1.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": false 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 45000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 128, 24 | "graph_activation_function": "gelu", 25 | "message_aggregation_function": "sum", 26 | "graph_message_weights_dropout_ratio": 0.0, 27 | "use_target_state_as_input": true, 28 | "num_edge_hidden_layers": 1 29 | } 30 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_RGAT.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 8, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 10000, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 10000, 13 | "graph_inter_layer_norm": false, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "random_seed": 0, 23 | "hidden_size": 96, 24 | "num_heads": 8, 25 | "graph_activation_function": "tanh" 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_RGCN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 75000, 7 | "graph_num_layers": 10, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.9, 10 | "graph_dense_between_every_num_gnn_layers": 10000, 11 | 
"graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 10000, 13 | "graph_inter_layer_norm": false, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "momentum": 0.85, 20 | "clamp_gradient_norm": 1.0, 21 | "random_seed": 0, 22 | "hidden_size": 128, 23 | "graph_activation_function": "ReLU", 24 | "message_aggregation_function": "sum", 25 | "min_epochs": 8 26 | } 27 | } -------------------------------------------------------------------------------- /tf-gnn-samples/tasks/default_hypers/VarMisuse_RGIN.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_params": { 3 | "add_self_loop_edges": true 4 | }, 5 | "model_params": { 6 | "max_nodes_in_batch": 50000, 7 | "graph_num_layers": 6, 8 | "graph_num_timesteps_per_layer": 1, 9 | "graph_layer_input_dropout_keep_prob": 0.8, 10 | "graph_dense_between_every_num_gnn_layers": 1, 11 | "graph_model_activation_function": "tanh", 12 | "graph_residual_connection_every_num_layers": 2, 13 | "graph_inter_layer_norm": true, 14 | "max_epochs": 10000, 15 | "patience": 5, 16 | "optimizer": "Adam", 17 | "learning_rate": 0.00015, 18 | "learning_rate_decay": 0.98, 19 | "lr_for_num_graphs_per_batch": 30, 20 | "momentum": 0.85, 21 | "clamp_gradient_norm": 1.0, 22 | "hidden_size": 128, 23 | "graph_activation_function": "ReLU", 24 | "message_aggregation_function": "sum", 25 | "use_target_state_as_input": false, 26 | "graph_num_edge_MLP_hidden_layers": 1, 27 | "graph_num_aggr_MLP_hidden_layers": null 28 | } 29 | } -------------------------------------------------------------------------------- /tf-gnn-samples/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | test.py [options] STORED_MODEL_PATH [DATA_PATH] 5 | 6 | STORED_MODEL is the path of a model snapshot created by train.py. 7 | DATA_PATH is the location of the data to test on. 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --result-dir DIR Directory to store logfiles and trained models. [default: trained_models] 12 | --azure-info PATH Azure authentication information file (JSON). [default: azure_auth.json] 13 | --quiet Show less output. 14 | --debug Turn on debugger. 
15 | """ 16 | import json 17 | from typing import Optional 18 | 19 | from docopt import docopt 20 | from dpu_utils.utils import run_and_debug, RichPath 21 | 22 | from utils.model_utils import restore 23 | 24 | 25 | def test(model_path: str, test_data_path: Optional[RichPath], result_dir: str, quiet: bool = False, run_id: str = None): 26 | model = restore(model_path, result_dir, run_id) 27 | model.params['max_nodes_in_batch'] = 2 * model.params['max_nodes_in_batch'] # We can process larger batches if we don't do training 28 | test_data_path = test_data_path or RichPath.create(model.task.default_data_path()) 29 | model.log_line(" Using the following task params: %s" % json.dumps(model.task.params)) 30 | model.log_line(" Using the following model params: %s" % json.dumps(model.params)) 31 | model.test(test_data_path) 32 | 33 | 34 | def run(args): 35 | azure_info_path = args.get('--azure-info', None) 36 | model_path = args['STORED_MODEL_PATH'] 37 | test_data_path = args.get('DATA_PATH') 38 | if test_data_path is not None: 39 | test_data_path = RichPath.create(test_data_path, azure_info_path) 40 | result_dir = args.get('--result-dir', 'trained_models') 41 | test(model_path, test_data_path, result_dir, quiet=args.get('--quiet')) 42 | 43 | 44 | if __name__ == "__main__": 45 | args = docopt(__doc__) 46 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 47 | -------------------------------------------------------------------------------- /tf-gnn-samples/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | train.py [options] MODEL_NAME TASK_NAME 5 | 6 | MODEL_NAME has to be one of the supported models, which currently are 7 | GGNN, GNN-Edge-MLP, GNN-FiLM, RGAT, RGCN, RGDCN 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --data-path PATH Path to load data from, has task-specific defaults under data/. 12 | --result-dir DIR Directory to store logfiles and trained models. [default: trained_models] 13 | --run-test Indicate if the task's test should be run. 14 | --model-param-overrides PARAMS Parameter settings overriding model defaults (in JSON format). 15 | --task-param-overrides PARAMS Parameter settings overriding task defaults (in JSON format). 16 | --quiet Show less output. 17 | --tensorboard DIR Dump tensorboard event files to DIR. 18 | --azure-info= Azure authentication information file (JSON). [default: azure_auth.json] 19 | --debug Turn on debugger. 
20 | """ 21 | import json 22 | import os 23 | import sys 24 | import time 25 | 26 | from docopt import docopt 27 | from dpu_utils.utils import run_and_debug, RichPath, git_tag_run 28 | 29 | from utils.model_utils import name_to_model_class, name_to_task_class 30 | from test import test 31 | 32 | 33 | def run(args): 34 | azure_info_path = args.get('--azure-info', None) 35 | model_cls, additional_model_params = name_to_model_class(args['MODEL_NAME']) 36 | task_cls, additional_task_params = name_to_task_class(args['TASK_NAME']) 37 | 38 | # Collect parameters first from the class defaults, then any task-specific defaults, and finally the CLI: 39 | task_params = task_cls.default_params() 40 | task_params.update(additional_task_params) 41 | model_params = model_cls.default_params() 42 | model_params.update(additional_model_params) 43 | 44 | # Load potential task-specific defaults: 45 | task_model_default_hypers_file = \ 46 | os.path.join(os.path.dirname(__file__), 47 | "tasks", 48 | "default_hypers", 49 | "%s_%s.json" % (task_cls.name(), model_cls.name(model_params))) 50 | if os.path.exists(task_model_default_hypers_file): 51 | print("Loading task/model-specific default parameters from %s." % task_model_default_hypers_file) 52 | with open(task_model_default_hypers_file, "rt") as f: 53 | default_task_model_hypers = json.load(f) 54 | task_params.update(default_task_model_hypers['task_params']) 55 | model_params.update(default_task_model_hypers['model_params']) 56 | 57 | # Load overrides from command line: 58 | task_params.update(json.loads(args.get('--task-param-overrides') or '{}')) 59 | model_params.update(json.loads(args.get('--model-param-overrides') or '{}')) 60 | 61 | # Finally, upgrade every parameter that is a path to a RichPath: 62 | task_params_orig = dict(task_params) 63 | for (param_name, param_value) in task_params.items(): 64 | if param_name.endswith("_path"): 65 | task_params[param_name] = RichPath.create(param_value, azure_info_path) 66 | 67 | # Now prepare to actually run by setting up directories, creating object instances and running: 68 | result_dir = args.get('--result-dir', 'trained_models') 69 | os.makedirs(result_dir, exist_ok=True) 70 | task = task_cls(task_params) 71 | data_path = args.get('--data-path') or task.default_data_path() 72 | data_path = RichPath.create(data_path, azure_info_path) 73 | task.load_data(data_path) 74 | 75 | random_seeds = model_params['random_seed'] 76 | if not isinstance(random_seeds, list): 77 | random_seeds = [random_seeds] 78 | 79 | for random_seed in random_seeds: 80 | model_params['random_seed'] = random_seed 81 | run_id = "_".join([task_cls.name(), model_cls.name(model_params), time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())]) 82 | 83 | model = model_cls(model_params, task, run_id, result_dir) 84 | model.log_line("Run %s starting."
% run_id) 85 | model.log_line(" Using the following task params: %s" % json.dumps(task_params_orig)) 86 | model.log_line(" Using the following model params: %s" % json.dumps(model_params)) 87 | 88 | if sys.stdin.isatty(): 89 | try: 90 | git_sha = git_tag_run(run_id) 91 | model.log_line(" git tagged as %s" % git_sha) 92 | except: 93 | print(" Tried tagging run in git, but failed.") 94 | pass 95 | 96 | model.initialize_model( ) 97 | model.train(quiet=args.get('--quiet'), tf_summary_path=args.get('--tensorboard')) 98 | 99 | if args.get('--run-test'): 100 | test(model.best_model_file, data_path, result_dir, quiet=args.get('--quiet'), run_id=run_id) 101 | 102 | 103 | if __name__ == "__main__": 104 | args = docopt(__doc__) 105 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 106 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import SMALL_NUMBER, BIG_NUMBER, get_gated_unit, get_aggregation_function, get_activation, MLP, micro_f1 2 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/add_child_ids.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from argparse import ArgumentParser 3 | 4 | raw_keys = ['Child', 'NextToken', 'ComputedFrom', 'LastUse', 'LastWrite', 'LastLexicalUse', 'FormalArgName', 'GuardedBy', 'GuardedByNegation', 'UsesSubtoken'] 5 | 6 | if __name__ == '__main__': 7 | parser = ArgumentParser() 8 | parser.add_argument("--edges", dest="edges", required=True) 9 | args = parser.parse_args() 10 | 11 | with open(args.edges, 'rb') as file: 12 | raw_edges = pickle.load(file) 13 | 14 | parent_to_children = {} 15 | child_to_parent = {} 16 | for s, t in raw_edges['Child']: 17 | if not s in parent_to_children: 18 | parent_to_children[s] = [] 19 | parent_to_children[s].append(t) 20 | child_to_parent[t] = s 21 | 22 | cur = 0 23 | next_map = {} 24 | for s, t in raw_edges['NextToken']: 25 | next_map[s] = t 26 | prev_map = {t:s for s,t in next_map.items()} 27 | 28 | def get_all_next(n): 29 | result = [] 30 | cur = n 31 | while cur in next_map: 32 | next_item = next_map[cur] 33 | result.append(next_item) 34 | cur = next_item 35 | return result 36 | 37 | def get_all_prev(n): 38 | result = [] 39 | cur = n 40 | while cur in prev_map: 41 | prev_item = prev_map[cur] 42 | result.append(prev_item) 43 | cur = prev_item 44 | return result 45 | 46 | 47 | nodes = child_to_parent.keys() 48 | left_nodes = list(nodes) 49 | 50 | parent_to_descendants = {} 51 | def get_parent_to_descendants(p): 52 | desc = set() 53 | for c in parent_to_children[p]: 54 | if c in parent_to_children: # if c is a parent itself 55 | desc.update(get_parent_to_descendants(c)) 56 | else: 57 | desc.add(c) 58 | return desc 59 | 60 | for p in parent_to_children.keys(): 61 | desc = get_parent_to_descendants(p) 62 | parent_to_descendants[p] = desc 63 | 64 | roots = set() 65 | for n in nodes: 66 | cur = n 67 | while cur in child_to_parent: 68 | cur = child_to_parent[cur] 69 | roots.add(cur) 70 | 71 | print(raw_edges) 72 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/citation_network_utils.py: -------------------------------------------------------------------------------- 1 | # This is largely copied from https://raw.githubusercontent.com/tkipf/gcn/master/gcn/utils.py 2 | # It is Copyright (c) 
2016 Thomas Kipf, under the MIT license (see LICENSE for a copy) 3 | 4 | import numpy as np 5 | import pickle as pkl 6 | import scipy.sparse as sp 7 | import sys 8 | 9 | 10 | def parse_index_file(filename): 11 | """Parse index file.""" 12 | index = [] 13 | for line in open(filename): 14 | index.append(int(line.strip())) 15 | return index 16 | 17 | 18 | def sample_mask(idx, l): 19 | """Create mask.""" 20 | mask = np.zeros(l) 21 | mask[idx] = 1 22 | return np.array(mask, dtype=np.bool) 23 | 24 | 25 | def load_data(directory: str, dataset_str: str): 26 | """ 27 | Loads input data from gcn/data directory 28 | 29 | ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; 30 | ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; 31 | ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances 32 | (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; 33 | ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object; 34 | ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object; 35 | ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; 36 | ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict 37 | object; 38 | ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object. 39 | 40 | All objects above must be saved using python pickle module. 41 | 42 | :param dataset_str: Dataset name 43 | :return: All data input files loaded (as well the training/test data). 44 | """ 45 | names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] 46 | objects = [] 47 | for i in range(len(names)): 48 | with open("{}/ind.{}.{}".format(directory, dataset_str, names[i]), 'rb') as f: 49 | if sys.version_info > (3, 0): 50 | objects.append(pkl.load(f, encoding='latin1')) 51 | else: 52 | objects.append(pkl.load(f)) 53 | 54 | x, y, tx, ty, allx, ally, graph = tuple(objects) 55 | test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(directory, dataset_str)) 56 | test_idx_range = np.sort(test_idx_reorder) 57 | 58 | if dataset_str == 'citeseer': 59 | # Fix citeseer dataset (there are some isolated nodes in the graph) 60 | # Find isolated nodes, add them as zero-vecs into the right position 61 | test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) 62 | tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) 63 | tx_extended[test_idx_range-min(test_idx_range), :] = tx 64 | tx = tx_extended 65 | ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) 66 | ty_extended[test_idx_range-min(test_idx_range), :] = ty 67 | ty = ty_extended 68 | 69 | features = sp.vstack((allx, tx)).tolil() 70 | features[test_idx_reorder, :] = features[test_idx_range, :] 71 | 72 | labels = np.vstack((ally, ty)) 73 | labels[test_idx_reorder, :] = labels[test_idx_range, :] 74 | 75 | idx_test = test_idx_range.tolist() 76 | idx_train = range(len(y)) 77 | idx_val = range(len(y), len(y)+500) 78 | 79 | train_mask = sample_mask(idx_train, labels.shape[0]) 80 | val_mask = sample_mask(idx_val, labels.shape[0]) 81 | test_mask = sample_mask(idx_test, labels.shape[0]) 82 | 83 | y_train = np.zeros(labels.shape) 84 | y_val = np.zeros(labels.shape) 85 | y_test = np.zeros(labels.shape) 86 | y_train[train_mask, :] = labels[train_mask, :] 87 | y_val[val_mask, :] 
= labels[val_mask, :] 88 | y_test[test_mask, :] = labels[test_mask, :] 89 | 90 | return graph, features, y_train, y_val, y_test, train_mask, val_mask, test_mask 91 | 92 | 93 | def sparse_to_tuple(sparse_mx): 94 | """Convert sparse matrix to tuple representation.""" 95 | def to_tuple(mx): 96 | if not sp.isspmatrix_coo(mx): 97 | mx = mx.tocoo() 98 | coords = np.vstack((mx.row, mx.col)).transpose() 99 | values = mx.data 100 | shape = mx.shape 101 | # All of these will need to be sorted: 102 | sort_indices = np.lexsort(np.rot90(coords)) 103 | return coords[sort_indices], values[sort_indices], shape 104 | 105 | if isinstance(sparse_mx, list): 106 | for i in range(len(sparse_mx)): 107 | sparse_mx[i] = to_tuple(sparse_mx[i]) 108 | else: 109 | sparse_mx = to_tuple(sparse_mx) 110 | 111 | return sparse_mx 112 | 113 | 114 | def preprocess_features(features): 115 | """Row-normalize feature matrix and convert to tuple representation""" 116 | rowsum = np.array(features.sum(1)) 117 | r_inv = np.power(rowsum, -1).flatten() 118 | r_inv[np.isinf(r_inv)] = 0. 119 | r_mat_inv = sp.diags(r_inv) 120 | features = r_mat_inv.dot(features) 121 | return features.toarray() # densify -- these are tiny and we don't care 122 | 123 | 124 | def normalize_adj(adj): 125 | """Symmetrically normalize adjacency matrix.""" 126 | adj = sp.coo_matrix(adj) 127 | rowsum = np.array(adj.sum(1)) 128 | d_inv_sqrt = np.power(rowsum, -0.5).flatten() 129 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 130 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 131 | return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() 132 | 133 | 134 | def preprocess_adj(adj): 135 | """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation.""" 136 | adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) 137 | return sparse_to_tuple(adj_normalized) 138 | 139 | 140 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/compute_diameters.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import json 3 | import networkx as nx 4 | import gzip 5 | import numpy as np 6 | import statistics 7 | 8 | def compute_diameter(adjacency_list): 9 | # graph is a list of edges 10 | # every edge is a list: [source, type, target] 11 | g = nx.Graph() 12 | for edge_source, _, edge_target in adjacency_list: 13 | g.add_edge(edge_source, edge_target) 14 | return nx.diameter(g) 15 | 16 | if __name__ == '__main__': 17 | parser = ArgumentParser() 18 | parser.add_argument("--data", dest="data", required=True) 19 | args = parser.parse_args() 20 | 21 | with gzip.open(args.data, 'r') as file: 22 | lines = file.readlines() 23 | 24 | objs = [json.loads(line) for line in lines] 25 | graphs = [o['graph'] for o in objs] 26 | 27 | diameters = [compute_diameter(graph) for graph in graphs] 28 | print('Max diameter: ', max(diameters)) 29 | print('Mean diameter: ', np.mean(diameters)) 30 | print('stddev: ', statistics.stdev(diameters)) 31 | 32 | percentiles = range(10, 110, 10) 33 | percentile_results = np.percentile(diameters, percentiles) 34 | for i, res in zip(percentiles, percentile_results): 35 | print('Diameters - {} percentile: {}'.format(i, res)) -------------------------------------------------------------------------------- /tf-gnn-samples/utils/model_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Tuple, Type, Dict, Any 4 | 5 | 
import pickle 6 | 7 | from models import (Sparse_Graph_Model, GGNN_Model, GNN_FiLM_Model, GNN_Edge_MLP_Model, 8 | RGAT_Model, RGCN_Model, RGDCN_Model, RGIN_Model) 9 | from models.no_struct_mlp_model import No_Struct_MLP_Model 10 | from tasks import Sparse_Graph_Task, QM9_Task, Citation_Network_Task, PPI_Task, VarMisuse_Task 11 | 12 | 13 | def name_to_task_class(name: str) -> Tuple[Type[Sparse_Graph_Task], Dict[str, Any]]: 14 | name = name.lower() 15 | if name == "qm9": 16 | return QM9_Task, {} 17 | if name == "cora": 18 | return Citation_Network_Task, {"data_kind": "cora"} 19 | if name == "citeseer": 20 | return Citation_Network_Task, {"data_kind": "citeseer"} 21 | if name == "pubmed": 22 | return Citation_Network_Task, {"data_kind": "pubmed"} 23 | if name == "citationnetwork": 24 | return Citation_Network_Task, {} 25 | if name == "ppi": 26 | return PPI_Task, {} 27 | if name == "varmisuse": 28 | return VarMisuse_Task, {} 29 | 30 | raise ValueError("Unknown task type '%s'" % name) 31 | 32 | 33 | def name_to_model_class(name: str) -> Tuple[Type[Sparse_Graph_Model], Dict[str, Any]]: 34 | name = name.lower() 35 | if name in ["ggnn", "ggnn_model"]: 36 | return GGNN_Model, {} 37 | if name in ["gnn_edge_mlp", "gnn-edge-mlp", "gnn_edge_mlp_model"]: 38 | return GNN_Edge_MLP_Model, {} 39 | if name in ["gnn_edge_mlp0", "gnn-edge-mlp0", "gnn_edge_mlp0_model"]: 40 | return GNN_Edge_MLP_Model, {'num_edge_hidden_layers': 0} 41 | if name in ["gnn_edge_mlp1", "gnn-edge-mlp1", "gnn_edge_mlp1_model"]: 42 | return GNN_Edge_MLP_Model, {'num_edge_hidden_layers': 1} 43 | if name in ["gnn_edge_mlp", "gnn-edge-mlp"]: 44 | return GNN_Edge_MLP_Model, {} 45 | if name in ["gnn_film", "gnn-film", "gnn_film_model"]: 46 | return GNN_FiLM_Model, {} 47 | if name in ["rgat", "rgat_model"]: 48 | return RGAT_Model, {} 49 | if name in ["rgcn", "rgcn_model"]: 50 | return RGCN_Model, {} 51 | if name in ["rgdcn", "rgdcn_model"]: 52 | return RGDCN_Model, {} 53 | if name in ["rgin", "rgin_model"]: 54 | return RGIN_Model, {} 55 | if name in ['nostruct', 'no_struct', 'no-struct', 'nostruct-mlp1']: 56 | return No_Struct_MLP_Model, {'num_edge_hidden_layers': 1} 57 | 58 | raise ValueError("Unknown model type '%s'" % name) 59 | 60 | 61 | def restore(saved_model_path: str, result_dir: str, run_id: str = None) -> Sparse_Graph_Model: 62 | print("Loading model from file %s." % saved_model_path) 63 | with open(saved_model_path, 'rb') as in_file: 64 | data_to_load = pickle.load(in_file) 65 | 66 | model_cls, _ = name_to_model_class(data_to_load['model_class']) 67 | task_cls, additional_task_params = name_to_task_class(data_to_load['task_class']) 68 | 69 | if run_id is None: 70 | run_id = "_".join([task_cls.name(), model_cls.name(data_to_load['model_params']), time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())]) 71 | 72 | task = task_cls(data_to_load['task_params']) 73 | task.restore_from_metadata(data_to_load['task_metadata']) 74 | 75 | model = model_cls(data_to_load['model_params'], task, run_id, result_dir) 76 | model.load_weights(data_to_load['weights']) 77 | 78 | model.log_line("Loaded model from snapshot %s." 
% saved_model_path) 79 | 80 | return model 81 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/prep_baseline.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import pickle 4 | import sys 5 | from argparse import ArgumentParser 6 | import gzip 7 | import re 8 | import os 9 | import random 10 | import multiprocessing as mp 11 | 12 | ignore_list = ['openlivewriter', 'botbuilder'] 13 | CANDIDATE_BEGIN = '' 14 | CANDIDATE_END = '' 15 | SLOT = '' 16 | project_name_map = { 17 | 'akka.net': 'akka' 18 | } 19 | filename_mapping = { 20 | 'C:\\Users\\t-mialla\\Documents\\sampleProjects\\SignalR\\src\\Microsoft.AspNet.SignalR.Core\\Messaging\\Cursor.cs': 'Core\\Messaging\\Cursor.cs' 21 | } 22 | 23 | RE_WORDS = re.compile(r''' 24 | # Find words in a string. Order matters! 25 | [A-Z]+(?=[A-Z][a-z]) | # All upper case before a capitalized word 26 | [A-Z]?[a-z]+ | # Capitalized words / all lower case 27 | [A-Z]+ | # All upper case 28 | \d+ | # Numbers 29 | _ | 30 | \" | 31 | .+ 32 | ''', re.VERBOSE) 33 | 34 | def split_subtokens(str): 35 | return [subtok for subtok in RE_WORDS.findall(str) if not subtok == '_'] 36 | 37 | def get_immediate_subdirectories(a_dir): 38 | return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) 39 | if os.path.isdir(os.path.join(a_dir, name))] 40 | 41 | def get_immediate_files(dir): 42 | return [(os.path.join(dir, name)) for name in os.listdir(dir) 43 | if os.path.isfile(os.path.join(dir, name))] 44 | 45 | def collect_tokens(path): 46 | projects = get_immediate_subdirectories(path) 47 | 48 | tokens_dict = {} 49 | for proj in projects: 50 | proj_name = proj.split('/')[-1] 51 | if proj_name in project_name_map: 52 | proj_name = project_name_map[proj_name] 53 | tokens_file_name = f'{proj}/{proj_name}-tokens.json.gz' 54 | if proj_name in ignore_list: 55 | continue 56 | #if os.path.isfile(tokens_file_name): 57 | with gzip.open(tokens_file_name, 'r') as file: 58 | lines = file.readlines() 59 | objs = json.loads(lines[0]) 60 | for o in objs: 61 | tokens_dict[o['Provenance']] = o['Tokens'] 62 | return tokens_dict 63 | 64 | def create_sequences(path, tokens_dict, out_path): 65 | subsets = get_immediate_subdirectories(path) # train, valid, test, testonly 66 | process_gz_file_func = functools.partial(process_gz_file, tokens_dict) 67 | for dir in subsets: 68 | dir_name = dir.split('/')[-1] 69 | out_dir_path = f'{out_path}/{dir_name}' 70 | if os.path.isdir(out_dir_path): 71 | raise ValueError(f'{out_path}/{dir_name} already exists') 72 | os.mkdir(out_dir_path) 73 | files = get_immediate_files(dir) 74 | with open(f'{out_dir_path}/source.txt', 'w') as out_source_file, \ 75 | open(f'{out_dir_path}/target.txt', 'w') as out_target_file: 76 | with mp.Pool(64) as pool: 77 | #results = [process_gz_file(file, tokens_dict) for file in files] 78 | results = pool.imap_unordered(process_gz_file_func, files) 79 | for example in results: 80 | for source, target in zip(*example): 81 | out_source_file.write(source) 82 | out_target_file.write(target) 83 | 84 | def process_gz_file(tokens_dict, gz_file_name): 85 | sources, targets = [], [] 86 | with gzip.open(gz_file_name, 'r') as gz_file: 87 | lines = gz_file.readlines() 88 | objs = [json.loads(l) for l in lines] 89 | for o in objs: 90 | filename = o['filename'] 91 | if filename in tokens_dict: 92 | tokens = tokens_dict[filename] 93 | elif filename in filename_mapping: 94 | tokens = 
tokens_dict[filename_mapping[filename]] 95 | else: 96 | found_filenames = [name for name in tokens_dict.keys() if filename.endswith(name)] 97 | if len(found_filenames) != 1: 98 | found_filenames = [name for name in found_filenames if name != 's'] 99 | if len(found_filenames) != 1: 100 | raise ValueError( 101 | f'Looking for filename: {filename}, but found in tokens_dict: {found_filenames}') 102 | 103 | tokens = tokens_dict[found_filenames[0]] 104 | print(f'Taking {found_filenames[0]} instead of {filename}') 105 | slot_token_index = o['slotTokenIdx'] 106 | tokens[slot_token_index] = SLOT 107 | subtokens = [' '.join(split_subtokens(tok)) for tok in tokens] 108 | candidates = [' '.join(split_subtokens(candi['SymbolName'])) for candi in o['SymbolCandidates']] 109 | # Important to shuffle, because the first one is always the correct one 110 | random.shuffle(candidates) 111 | label = [' '.join(split_subtokens(candi['SymbolName'])) for candi in o['SymbolCandidates'] if 112 | candi['IsCorrect']] 113 | if len(label) != 1: 114 | raise ValueError(f'Found {len(label)} correct labels in {gz_file_name}, example {o["filename"]}') 115 | label = label[0] 116 | outline = ' '.join(subtokens) + ' ' + ' '.join( 117 | [CANDIDATE_BEGIN + ' ' + candi + ' ' + CANDIDATE_END for candi in candidates]) + '\n' 118 | sources.append(outline) 119 | targets.append(label + '\n') 120 | return sources, targets 121 | 122 | if __name__ == '__main__': 123 | parser = ArgumentParser() 124 | parser.add_argument("--raw", dest="raw_path", required=True) 125 | parser.add_argument("--reorg", dest="reorg_path", required=True) 126 | parser.add_argument("--out", dest="out_path", required=True) 127 | 128 | args = parser.parse_args() 129 | 130 | #tokens_dict = collect_tokens(args.raw_path) 131 | with open('tokens.pkl', 'rb') as file: 132 | tokens_dict = pickle.load(file) 133 | sequences = create_sequences(args.reorg_path, tokens_dict, args.out_path) 134 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, Union, List 2 | 3 | import tensorflow as tf 4 | 5 | 6 | BIG_NUMBER = 1e7 7 | SMALL_NUMBER = 1e-7 8 | 9 | 10 | def get_gated_unit(units: int, gated_unit: str, activation_function: str): 11 | activation_fn = get_activation(activation_function) 12 | gated_unit_name = gated_unit.lower() 13 | if gated_unit_name == 'rnn': 14 | return tf.keras.layers.SimpleRNNCell(units, activation=activation_fn) 15 | if gated_unit_name == 'gru': 16 | return tf.keras.layers.GRUCell(units, activation=activation_fn) 17 | if gated_unit_name == 'lstm': 18 | return tf.keras.layers.LSTMCell(units, activation=activation_fn) 19 | else: 20 | raise Exception("Unknown RNN cell type '%s'." % gated_unit) 21 | 22 | 23 | def get_aggregation_function(aggregation_fun: Optional[str]): 24 | if aggregation_fun in ['sum', 'unsorted_segment_sum']: 25 | return tf.unsorted_segment_sum 26 | if aggregation_fun in ['max', 'unsorted_segment_max']: 27 | return tf.unsorted_segment_max 28 | if aggregation_fun in ['mean', 'unsorted_segment_mean']: 29 | return tf.unsorted_segment_mean 30 | if aggregation_fun in ['sqrt_n', 'unsorted_segment_sqrt_n']: 31 | return tf.unsorted_segment_sqrt_n 32 | else: 33 | raise ValueError("Unknown aggregation function '%s'!"
% aggregation_fun) 34 | 35 | 36 | def get_activation(activation_fun: Optional[str]): 37 | if activation_fun is None: 38 | return None 39 | activation_fun = activation_fun.lower() 40 | if activation_fun == 'linear': 41 | return None 42 | if activation_fun == 'tanh': 43 | return tf.tanh 44 | if activation_fun == 'relu': 45 | return tf.nn.relu 46 | if activation_fun == 'leaky_relu': 47 | return tf.nn.leaky_relu 48 | if activation_fun == 'elu': 49 | return tf.nn.elu 50 | if activation_fun == 'selu': 51 | return tf.nn.selu 52 | if activation_fun == 'gelu': 53 | def gelu(input_tensor): 54 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 55 | return input_tensor * cdf 56 | return gelu 57 | else: 58 | raise ValueError("Unknown activation function '%s'!" % activation_fun) 59 | 60 | 61 | def micro_f1(logits, labels): 62 | # Everything on int, because who trusts float anyway? 63 | predicted = tf.round(tf.nn.sigmoid(logits)) 64 | predicted = tf.cast(predicted, dtype=tf.int32) 65 | labels = tf.cast(labels, dtype=tf.int32) 66 | 67 | true_pos = tf.count_nonzero(predicted * labels) 68 | false_pos = tf.count_nonzero(predicted * (labels - 1)) 69 | false_neg = tf.count_nonzero((predicted - 1) * labels) 70 | 71 | precision = true_pos / (true_pos + false_pos) 72 | recall = true_pos / (true_pos + false_neg) 73 | fmeasure = (2 * precision * recall) / (precision + recall) 74 | return tf.cast(fmeasure, tf.float32) 75 | 76 | 77 | class MLP(object): 78 | def __init__(self, 79 | out_size: int, 80 | hidden_layers: Union[List[int], int] = 1, 81 | use_biases: bool = False, 82 | activation_fun: Optional[Callable[[tf.Tensor], tf.Tensor]] = tf.nn.relu, 83 | dropout_rate: Union[float, tf.Tensor] = 0.0, 84 | name: Optional[str] = "MLP", 85 | ): 86 | """ 87 | Create a new MLP with the given number of hidden layers. 88 | 89 | Arguments: 90 | out_size: Dimensionality of output. 91 | hidden_layers: Either an integer determining the number of hidden layers, which will have out_size units each; 92 | or a list of integers whose length determines the number of hidden layers and whose contents give the 93 | number of units in each layer. 94 | use_biases: Flag indicating use of bias in fully connected layers. 95 | activation_fun: Activation function applied between hidden layers (NB: the output of the MLP 96 | is always the direct result of a linear transformation) 97 | dropout_rate: Dropout applied to inputs of each MLP layer.
98 | """ 99 | if isinstance(hidden_layers, int): 100 | hidden_layer_sizes = [out_size] * hidden_layers 101 | else: 102 | hidden_layer_sizes = hidden_layers 103 | 104 | if len(hidden_layer_sizes) > 1: 105 | assert activation_fun is not None, "Multiple linear layers without an activation" 106 | 107 | self.__dropout_rate = dropout_rate 108 | self.__name = name 109 | with tf.variable_scope(self.__name): 110 | self.__layers = [] # type: List[tf.layers.Dense] 111 | for hidden_layer_size in hidden_layer_sizes: 112 | self.__layers.append(tf.layers.Dense(units=hidden_layer_size, 113 | use_bias=use_biases, 114 | activation=activation_fun)) 115 | # Output layer: 116 | self.__layers.append(tf.layers.Dense(units=out_size, 117 | use_bias=use_biases, 118 | activation=None)) 119 | 120 | def __call__(self, input: tf.Tensor) -> tf.Tensor: 121 | with tf.variable_scope(self.__name): 122 | activations = input 123 | for layer in self.__layers[:-1]: 124 | activations = tf.nn.dropout(activations, rate=self.__dropout_rate) 125 | activations = layer(activations) 126 | return self.__layers[-1](activations) 127 | -------------------------------------------------------------------------------- /tf-gnn-samples/utils/varmisuse_data_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Usage: 4 | varmisuse_data_splitter.py [options] RAW_DATA_DIR OUT_DIR 5 | 6 | Reads in datapoints from a set of files and creates smaller files mixing these, in a format 7 | suitable for streaming them into the training process. 8 | 9 | Options: 10 | -h --help Show this screen. 11 | --chunk-size NUM Number of samples per output file. [default: 100] 12 | --num-workers NUM Number of worker processes. Defaults to number of CPU cores. 13 | --window-size NUM Number of samples to load before mixing and writing things out. [default: 5000] 14 | --azure-info= Azure authentication information file (JSON). [default: azure_auth.json] 15 | --debug Turn on debugger. 
16 | """ 17 | from typing import List, Any 18 | 19 | import numpy as np 20 | from more_itertools import chunked 21 | from docopt import docopt 22 | from dpu_utils.utils import run_and_debug, RichPath 23 | from multiprocessing import Process, Queue, cpu_count 24 | 25 | 26 | def _data_loading_worker(file_queue: Queue, result_queue: Queue) -> None: 27 | while True: 28 | next_path = file_queue.get() 29 | if next_path is None: # Our signal that all files have been processed 30 | file_queue.put(None) # Signal to the other workers 31 | result_queue.put(None) # Signal to the controller that we are done 32 | break 33 | 34 | # Read the file and push examples out as soon as we get them: 35 | for raw_sample in next_path.read_by_file_suffix(): 36 | result_queue.put(raw_sample) 37 | 38 | 39 | def _write_data(out_dir: RichPath, window_idx: int, chunk_size: int, data_window: List[Any]): 40 | np.random.shuffle(data_window) 41 | for chunk_idx, data_chunk in enumerate(chunked(data_window, chunk_size)): 42 | out_file = out_dir.join('chunk_%i-%i.jsonl.gz' % (window_idx, chunk_idx)) 43 | out_file.save_as_compressed_file(data_chunk) 44 | 45 | 46 | def run(args): 47 | azure_info_path = args.get('--azure-info', None) 48 | in_dir = RichPath.create(args['RAW_DATA_DIR'], azure_info_path) 49 | out_dir = RichPath.create(args['OUT_DIR'], azure_info_path) 50 | out_dir.make_as_dir() 51 | 52 | num_workers = int(args.get('--num-workers') or cpu_count()) 53 | chunk_size = int(args['--chunk-size']) 54 | window_size = int(args['--window-size']) 55 | 56 | files_to_load = list(in_dir.iterate_filtered_files_in_dir("*.gz")) 57 | path_queue = Queue(maxsize=len(files_to_load) + 1) 58 | result_queue = Queue(1000) 59 | 60 | # Set up list of work to do: 61 | for path in files_to_load: 62 | path_queue.put(path) 63 | path_queue.put(None) # Signal for the end of the queue 64 | 65 | # Set up workers: 66 | workers = [] 67 | for _ in range(num_workers): 68 | workers.append(Process(target=_data_loading_worker, 69 | args=(path_queue, result_queue,))) 70 | workers[-1].start() 71 | 72 | # Consume the data: 73 | num_workers_terminated = 0 74 | data_window = [] 75 | window_idx = 0 76 | while num_workers_terminated < len(workers): 77 | parsed_sample = result_queue.get() 78 | if parsed_sample is None: 79 | num_workers_terminated += 1 # Worker signaled that it's done 80 | else: 81 | data_window.append(parsed_sample) 82 | if len(data_window) >= window_size: 83 | _write_data(out_dir, window_idx, chunk_size, data_window) 84 | data_window = [] 85 | window_idx += 1 86 | 87 | # Write out the remainder of the data: 88 | _write_data(out_dir, window_idx, chunk_size, data_window) 89 | 90 | # Clean up the workers: 91 | for worker in workers: 92 | worker.join() 93 | 94 | 95 | if __name__ == "__main__": 96 | args = docopt(__doc__) 97 | run_and_debug(lambda: run(args), enable_debugging=args['--debug']) 98 | --------------------------------------------------------------------------------
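A note on how the default_hypers files above are consumed: train.py layers parameters in a fixed order, starting from the task/model class defaults, then applying the matching tasks/default_hypers/<Task>_<Model>.json file (its "task_params" and "model_params" sections), and finally any --task-param-overrides / --model-param-overrides JSON given on the command line, so later sources win. The sketch below illustrates that merge order for the model parameters in isolation; it is a minimal, hypothetical helper written for this note (merge_model_params and the example class defaults are not part of the repository), and the printed values assume it is run from the tf-gnn-samples directory so that the VarMisuse_RGCN.json shown above (hidden_size 128) is found.

import json
import os


def merge_model_params(class_defaults, default_hypers_path, cli_overrides_json=None):
    # Lowest priority: the model class defaults.
    params = dict(class_defaults)
    # Next: the task/model-specific defaults file, if one exists for this combination.
    if os.path.exists(default_hypers_path):
        with open(default_hypers_path, "rt") as f:
            params.update(json.load(f)["model_params"])
    # Highest priority: JSON overrides from the command line (mirrors --model-param-overrides).
    params.update(json.loads(cli_overrides_json or "{}"))
    return params


# Example: start from illustrative class defaults, apply the VarMisuse RGCN defaults,
# and lower the learning rate via a CLI-style override.
hypers = merge_model_params(
    class_defaults={"hidden_size": 256, "learning_rate": 0.001},
    default_hypers_path="tasks/default_hypers/VarMisuse_RGCN.json",
    cli_overrides_json='{"learning_rate": 1e-4}',
)
print(hypers["hidden_size"], hypers["learning_rate"])  # 128 (from the JSON), 0.0001 (from the override)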