├── .gitignore ├── LICENSE ├── README.md ├── SubgraphCountingMatching ├── README.md ├── config.py ├── constants.py ├── dataset.py ├── evaluate.py ├── models │ ├── __init__.py │ ├── basemodel.py │ ├── cnn.py │ ├── compgcn.py │ ├── container.py │ ├── dmplrp.py │ ├── dmpnn.py │ ├── embed.py │ ├── filter.py │ ├── lrp.py │ ├── pred.py │ ├── rgcn.py │ ├── rgin.py │ ├── rnn.py │ └── txl.py ├── train.py └── utils │ ├── __init__.py │ ├── act.py │ ├── anneal.py │ ├── cyclical.py │ ├── dl.py │ ├── graph.py │ ├── init.py │ ├── io.py │ ├── log.py │ ├── sampler.py │ └── scheduler.py └── UnsupervisedNodeClassification ├── Data └── README.md ├── Evaluate ├── README.md ├── evaluate.py ├── evaluate.sh ├── link_prediction.py ├── node_classification.py └── utils.py ├── Model ├── CompGCN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py ├── DMPNN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py ├── R-GCN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py ├── R-GIN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py └── README.md ├── README.md └── Transform ├── README.md ├── transform.py ├── transform.sh └── transform_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Data 2 | Data 3 | data 4 | dumps 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | .vscode 113 | .idea 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Sean Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching 2 | 3 | This repository is an official implementation of the paper Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching. 4 | 5 | ## Introduction 6 | 7 | We propose dual message passing neural networks (DMPNNs) to enhance the substructure representation learning in an asynchronous way for subgraph isomorphism counting and matching as well as unsupervised node classification. 8 | 9 | ## Reproduction 10 | 11 | ### Package Dependencies 12 | * tqdm 13 | * numpy 14 | * pandas 15 | * scipy 16 | * numba >= 0.54.0 17 | * python-igraph == 0.9.11 18 | * torch >= 1.7.0 19 | * dgl >= 0.6.0 20 | 21 | Please refer to `SubgraphCountingMatching` and `UnsupervisedNodeClassification` for detailed reproduction instructions. 22 | 23 | 24 | ### Citation 25 | ```bibtex 26 | @inproceedings{liu2022graph, 27 | author = {Xin Liu and Yangqiu Song}, 28 | title = {Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching}, 29 | booktitle = {AAAI}, 30 | year = {2022} 31 | } 32 | ``` 33 | 34 | ### Miscellaneous 35 | Please send any questions about the code and/or the algorithm to .
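The dependencies in "Package Dependencies" above can be installed with pip, for example. The commands below are only a sketch (package names and versions mirror the list above; for GPU training, substitute the CUDA-matched builds of `torch` and `dgl`):

```bash
# Sketch of an environment setup -- versions follow the "Package Dependencies" list above.
# Swap torch/dgl for the CUDA-specific wheels matching your driver if you train on GPU.
pip install tqdm numpy pandas scipy "numba>=0.54.0" "python-igraph==0.9.11"
pip install "torch>=1.7.0" "dgl>=0.6.0"
```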
36 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/README.md: -------------------------------------------------------------------------------- 1 | # Subgraph Isomorphism Counting and Matching 2 | 3 | This part is modified from [NeuralSubgraphCounting](https://github.com/HKUST-KnowComp/NeuralSubgraphCounting) 4 | 5 | ## Reproduction 6 | 7 | ### Stage 1: Download 8 | 9 | We conduct experiments on 4 subgraph isomorphism benchmark datasets: ```Erdos-Renyi```, ```Regular```, ```Complex```, and ```MUTAG```. 10 | 11 | Please download data from [OneDrive](https://hkustconnect-my.sharepoint.com/:f:/g/personal/xliucr_connect_ust_hk/ErzdTZguJnFBok2QKUr3yAYBHaReWOYOOAEca0uGzgBlyQ?e=UTs21h). 12 | 13 | ### Stage 2: Training 14 | 15 | We add ```DMPNN``` and ```CompGCN``` for heterogeneous Message-Passing implementations. 16 | We also add ```DMPLRP``` and ```LRP``` for local relational pooling implementations. 17 | 18 | * In order to add reversed edges, please set `--add_rev True`. 19 | * In order to jointly learn counting and matching, please set `--node_pred True --match_weights node` (for graph models), `--edge_pred True --match_weights edge` (for sequence models), or `--node_pred True --edge_pred True --match_weights node,edge` (for CompGCN, DMPNN, and DMPLRP, but no further improvement). 20 | 21 | ##### For Erdos-Renyi 22 | ```bash 23 | python train.py \ 24 | --pattern_dir data/Erdos-Renyi/patterns \ 25 | --graph_dir data/Erdos-Renyi/graphs \ 26 | --metadata_dir data/Erdos-Renyi/metadata \ 27 | --save_data_dir data/Erdos-Renyi/datasets \ 28 | --save_model_dir dumps/Erdos-Renyi \ 29 | --add_rev True \ 30 | --hid_dim 64 --node_pred True --edge_pred False \ 31 | --match_weights node \ 32 | --enc_net Multihot --enc_base 2 \ 33 | --emb_net Equivariant --share_emb_net True \ 34 | --rep_net DMPNN \ 35 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 36 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 37 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 38 | --max_npv 4 --max_npe 10 --max_npvl 1 --max_npel 1 \ 39 | --max_ngv 10 --max_nge 48 --max_ngvl 1 --max_ngel 1 \ 40 | --train_grad_steps 1 --train_batch_size 64 \ 41 | --train_log_steps 10 --eval_batch_size 64 \ 42 | --lr 1e-3 --train_epochs 100 \ 43 | --seed 0 --gpu_id 0 44 | ``` 45 | 46 | ##### For Regular 47 | ```bash 48 | python train.py \ 49 | --pattern_dir data/Regular/patterns \ 50 | --graph_dir data/Regular/graphs \ 51 | --metadata_dir data/Regular/metadata \ 52 | --save_data_dir data/Regular/datasets \ 53 | --save_model_dir dumps/Regular \ 54 | --add_rev True \ 55 | --hid_dim 64 --node_pred True --edge_pred False \ 56 | --match_weights node \ 57 | --enc_net Multihot --enc_base 2 \ 58 | --emb_net Equivariant --share_emb_net True \ 59 | --rep_net DMPNN \ 60 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 61 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 62 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 63 | --max_npv 4 --max_npe 10 --max_npvl 1 --max_npel 1 \ 64 | --max_ngv 30 --max_nge 90 --max_ngvl 1 --max_ngel 1 \ 65 | --train_grad_steps 1 --train_batch_size 64 \ 66 | --train_log_steps 10 --eval_batch_size 64 \ 67 | --lr 1e-3 --train_epochs 100 \ 68 | --seed 0 --gpu_id 0 69 | ``` 70 | 71 | ##### For Complex 72 | ```bash 73 | python train.py \ 74 | --pattern_dir data/Complex/patterns \ 75 | --graph_dir data/Complex/graphs \ 76 | --metadata_dir data/Complex/metadata_withoutloop \ 77 | --save_data_dir
data/Complex/datasets \ 78 | --save_model_dir dumps/Complex \ 79 | --add_rev True \ 80 | --hid_dim 64 --node_pred True --edge_pred False \ 81 | --match_weights node \ 82 | --enc_net Multihot --enc_base 2 \ 83 | --emb_net Equivariant --share_emb_net True \ 84 | --rep_net DMPNN \ 85 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 86 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 87 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 88 | --max_npv 8 --max_npe 8 --max_npvl 8 --max_npel 8 \ 89 | --max_ngv 64 --max_nge 256 --max_ngvl 16 --max_ngel 16 \ 90 | --train_grad_steps 1 --train_batch_size 512 \ 91 | --train_log_steps 100 --eval_batch_size 512 \ 92 | --lr 1e-3 --train_epochs 100 \ 93 | --seed 0 --gpu_id 0 94 | ``` 95 | 96 | ##### For MUTAG 97 | ```bash 98 | python train.py \ 99 | --pattern_dir data/MUTAG/patterns \ 100 | --graph_dir data/MUTAG/graphs \ 101 | --metadata_dir data/MUTAG/metadata \ 102 | --save_data_dir data/MUTAG/datasets \ 103 | --save_model_dir dumps/MUTAG \ 104 | --add_rev True \ 105 | --hid_dim 64 --node_pred True --edge_pred False \ 106 | --match_weights node \ 107 | --enc_net Multihot --enc_base 2 \ 108 | --emb_net Equivariant --share_emb_net True \ 109 | --rep_net DMPNN \ 110 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 111 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 112 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 113 | --max_npv 4 --max_npe 3 --max_npvl 2 --max_npel 2 \ 114 | --max_ngv 28 --max_nge 66 --max_ngvl 7 --max_ngel 4 \ 115 | --train_grad_steps 1 --train_batch_size 32 \ 116 | --train_log_steps 10 --eval_batch_size 32 \ 117 | --lr 1e-3 --train_epochs 200 \ 118 | --seed 0 --gpu_id 0 119 | ``` 120 | 121 | ### Stage 3: Evaluation 122 | 123 | ```bash 124 | python evaluate.py \ 125 | --pattern_dir data/MUTAG/patterns \ 126 | --graph_dir data/MUTAG/graphs \ 127 | --metadata_dir data/MUTAG/metadata \ 128 | --save_data_dir data/MUTAG/datasets \ 129 | --load_model_dir dumps/MUTAG/DMPNN_SumPredictNet_2021_12_09_14_11_52 \ 130 | --eval_batch_size 64 131 | ``` 132 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/constants.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import os 3 | import re 4 | 5 | INF = 1e30 6 | _INF = -1e30 7 | EPS = 1e-8 8 | PI = 3.141592653589793 9 | 10 | LEAKY_RELU_A = 1 / 5.5 11 | 12 | LOOPFLAG = "is_loop" 13 | REVFLAG = "is_reversed" 14 | NORM = "norm" 15 | INDEGREE = "in_deg" 16 | INNORM = "in_norm" 17 | OUTDEGREE = "out_deg" 18 | OUTNORM = "out_norm" 19 | NODEID = "id" 20 | EDGEID = "id" 21 | NODELABEL = "label" 22 | EDGELABEL = "label" 23 | NODEEIGENV = "node_eigenv" 24 | EDGEEIGENV = "edge_eigenv" 25 | NODEFEAT = "node_feat" 26 | EDGEFEAT = "edge_feat" 27 | NODETYPE = "node_type" 28 | EDGETYPE = "edge_type" 29 | NODEMSG = "node_msg" 30 | EDGEMSG = "edge_msg" 31 | NODEAGG = "node_agg" 32 | EDGEAGG = "edge_agg" 33 | NODEOUTPUT = "node_out" 34 | EDGEOUTPUT = "edge_out" 35 | 36 | INIT_STEPS = 600 37 | SCHEDULE_STEPS = 10000 38 | NUM_CYCLES = 2 39 | MIN_PERCENT = 1e-3 40 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/evaluate.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gc 3 | import math 4 | import numpy as np 5 | import os 6 | import pickle 7 | import random 8 | import time 9 | import torch as th 10 | 
import torch.nn as nn 11 | import torch.nn.functional as F 12 | import warnings 13 | from collections import OrderedDict 14 | from copy import deepcopy 15 | from functools import partial 16 | from itertools import chain 17 | from sklearn.metrics import roc_auc_score 18 | from tensorboardX import SummaryWriter 19 | from torch.utils.data import DataLoader 20 | from constants import * 21 | from dataset import * 22 | 23 | from torch.optim import AdamW 24 | 25 | from config import get_eval_config 26 | from utils.graph import compute_norm, compute_largest_eigenvalues, convert_to_dual_graph, get_dual_subisomorphisms 27 | from utils.log import init_logger, close_logger, generate_log_line, generate_best_line, get_best_epochs 28 | from utils.io import load_data, load_config, save_config, save_results 29 | from utils.scheduler import map_scheduler_str_to_scheduler 30 | from utils.sampler import BucketSampler, CircurriculumSampler 31 | from utils.anneal import anneal_fn 32 | from utils.cyclical import cyclical_fn 33 | from models import * 34 | 35 | from train import process_model_config, load_model 36 | from train import load_edgeseq_datasets, load_graphadj_datasets 37 | from train import remove_loops, add_reversed_edges, convert_to_dual_data 38 | from train import calculate_degrees, calculate_norms, calculate_eigenvalues 39 | from train import evaluate_epoch 40 | 41 | warnings.filterwarnings("ignore") 42 | 43 | 44 | if __name__ == "__main__": 45 | config = get_eval_config() 46 | 47 | random.seed(config["seed"]) 48 | th.manual_seed(config["seed"]) 49 | np.random.seed(config["seed"]) 50 | 51 | ts = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") 52 | if not config["load_model_dir"] or not os.path.exists(config["load_model_dir"]): 53 | raise FileNotFoundError 54 | model_name = "_".join(os.path.split(config["load_model_dir"])[1].split("_")[:2]) 55 | 56 | if config["save_data_dir"]: 57 | os.makedirs(config["save_data_dir"], exist_ok=True) 58 | 59 | # set device 60 | if isinstance(config["gpu_id"], int) and config["gpu_id"] >= 0: 61 | device = th.device("cuda:%d" % (config["gpu_id"])) 62 | else: 63 | device = th.device("cpu") 64 | 65 | # set logger and writer 66 | logger = init_logger(log_file=os.path.join(config["load_model_dir"], "eval_log_%s.txt" % (ts)), log_tag=model_name) 67 | logger.info("evaluation config: ", str(config)) 68 | 69 | # create/load model 70 | model, best_epochs = load_model(config["load_model_dir"]) 71 | for metric, epochs in best_epochs.items(): 72 | for data_type in epochs: 73 | logger.info( 74 | generate_best_line( 75 | data_type, 76 | epochs[data_type][0], 77 | epochs[data_type][0], 78 | **{ 79 | metric: "{:.3f}".format(epochs[data_type][1]) 80 | } 81 | ) 82 | ) 83 | model_config = load_config(os.path.join(config["load_model_dir"], "config.json"), as_dict=True) 84 | for k, v in model_config.items(): 85 | if k not in config: 86 | config[k] = v 87 | model.expand(pred_return_weights=config["match_weights"], **process_model_config(config)) 88 | model = model.to(device) 89 | logger.info(model) 90 | logger.info("number of parameters: %d" % (sum(p.numel() for p in model.parameters() if p.requires_grad))) 91 | 92 | # load data 93 | if config["rep_net"] in ["CNN", "RNN", "TXL"]: 94 | datasets = load_edgeseq_datasets( 95 | pattern_dir=config["pattern_dir"], 96 | graph_dir=config["graph_dir"], 97 | metadata_dir=config["metadata_dir"], 98 | save_data_dir=config["save_data_dir"], 99 | num_workers=config["num_workers"], 100 | logger=logger 101 | ) 102 | else: 103 | datasets = 
load_graphadj_datasets( 104 | pattern_dir=config["pattern_dir"], 105 | graph_dir=config["graph_dir"], 106 | metadata_dir=config["metadata_dir"], 107 | save_data_dir=config["save_data_dir"], 108 | num_workers=config["num_workers"], 109 | logger=logger 110 | ) 111 | 112 | # remove loops 113 | if "withoutloop" in config["metadata_dir"] or "withoutloop" in config["save_data_dir"]: 114 | for data_type in datasets: 115 | remove_loops(datasets[data_type]) 116 | 117 | max_ngv = config["max_ngv"] 118 | max_nge = config["max_nge"] 119 | max_ngvl = config["max_ngvl"] 120 | max_ngel = config["max_ngel"] 121 | if config["share_emb_net"]: 122 | max_npv = max_ngv 123 | max_npe = max_nge 124 | max_npvl = max_ngvl 125 | max_npel = max_ngel 126 | else: 127 | max_npv = config["max_npv"] 128 | max_npe = config["max_npe"] 129 | max_npvl = config["max_npvl"] 130 | max_npel = config["max_npel"] 131 | 132 | # compute the p_len and g_len for original data 133 | for data_type in datasets: 134 | if isinstance(datasets[data_type], EdgeSeqDataset): 135 | for x in datasets[data_type]: 136 | x["g_len"] = len(x["graph"]) 137 | x["p_len"] = len(x["pattern"]) 138 | elif isinstance(datasets[data_type], GraphAdjDataset): 139 | for x in datasets[data_type]: 140 | x["g_len"] = len(x["graph"]) 141 | x["p_len"] = len(x["pattern"]) 142 | if NODEID not in x["graph"].ndata: 143 | x["graph"].ndata[NODEID] = th.arange(x["graph"].number_of_nodes()) 144 | if EDGEID not in x["graph"].edata: 145 | x["graph"].edata[EDGEID] = th.arange(x["graph"].number_of_edges()) 146 | if NODEID not in x["pattern"].ndata: 147 | x["pattern"].ndata[NODEID] = th.arange(x["pattern"].number_of_nodes()) 148 | if EDGEID not in x["pattern"].edata: 149 | x["pattern"].edata[EDGEID] = th.arange(x["pattern"].number_of_edges()) 150 | 151 | # add E reversed edges 152 | if config["add_rev"]: 153 | if logger: 154 | logger.info("adding reversed edges...") 155 | for data_type in datasets: 156 | add_reversed_edges(datasets[data_type], max_npe, max_npel, max_nge, max_ngel) 157 | max_npe *= 2 158 | max_npel *= 2 159 | max_nge *= 2 160 | max_ngel *= 2 161 | 162 | # convert graphs to conj_graphs 163 | if config["convert_dual"]: 164 | if logger: 165 | logger.info("converting dual graphs and isomorphisms...") 166 | for data_type in datasets: 167 | convert_to_dual_data(datasets[data_type]) 168 | avg_gd = math.ceil(max_nge / max_ngv) 169 | avg_pd = math.ceil(max_npe / max_npv) 170 | 171 | max_ngv, max_nge = max_nge, (avg_gd * avg_gd) * max_ngv - max_ngv 172 | max_npv, max_npe = max_npe, (avg_pd * avg_pd) * max_npv - max_npv 173 | max_ngvl, max_ngel = max_ngel, max_ngvl 174 | max_npvl, max_npel = max_npel, max_npvl 175 | 176 | # calculate the degrees, norms, and lambdas 177 | if logger: 178 | logger.info("calculating degress...") 179 | for data_type in datasets: 180 | calculate_degrees(datasets[data_type]) 181 | calculate_norms(datasets[data_type], self_loop=True) 182 | calculate_eigenvalues(datasets[data_type]) 183 | 184 | if config["rep_net"].endswith("LRP"): 185 | lrp_datasets = OrderedDict() 186 | share_memory = "small" not in config["graph_dir"] 187 | cache = dict() if share_memory else None 188 | for data_type in datasets: 189 | LRPDataset.seq_len = config["lrp_seq_len"] 190 | lrp_datasets[data_type] = LRPDataset( 191 | datasets[data_type], 192 | cache=cache, 193 | num_workers=config["num_workers"], 194 | share_memory=share_memory 195 | ) 196 | for x in lrp_datasets[data_type]: 197 | x["g_len"] = len(x["graph"]) 198 | x["p_len"] = len(x["pattern"]) 199 | del cache 200 | del 
datasets 201 | gc.collect() 202 | datasets = lrp_datasets 203 | 204 | # set records 205 | eval_metrics = {"train": None, "dev": None, "test": None} 206 | 207 | logger.info("-" * 80) 208 | for data_type, dataset in datasets.items(): 209 | sampler = BucketSampler( 210 | dataset, 211 | group_by=["g_len", "p_len"], 212 | batch_size=config["eval_batch_size"], 213 | shuffle=False, 214 | seed=config["seed"], 215 | drop_last=False 216 | ) 217 | data_loader = DataLoader( 218 | dataset, 219 | batch_sampler=sampler, 220 | collate_fn=partial(dataset.batchify, return_weights=config["match_weights"]), 221 | ) 222 | eval_metric, eval_results = evaluate_epoch( 223 | model, data_type, data_loader, device, config, 0, logger, None 224 | ) 225 | save_results( 226 | eval_results, os.path.join(config["load_model_dir"], "eval_%s_results_%s.json" % (data_type, ts)) 227 | ) 228 | 229 | eval_metrics[data_type] = eval_metric 230 | 231 | for data_type in eval_metrics: 232 | if eval_metrics[data_type] is not None: 233 | logger.info( 234 | generate_best_line( 235 | data_type, 236 | 0, 237 | 0, 238 | **{ 239 | "eval-" + config["eval_metric"]: "{:.3f}".format(eval_metrics[data_type]) 240 | } 241 | ) 242 | ) 243 | logger.info("=" * 80) 244 | 245 | close_logger(logger) 246 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .basemodel import BaseModel, EdgeSeqModel, GraphAdjModel, GraphAdjModelV2 2 | from .cnn import CNN 3 | from .rnn import RNN 4 | from .txl import TransformerXL 5 | from .rgcn import RGCN 6 | from .rgin import RGIN 7 | from .compgcn import CompGCN 8 | from .dmpnn import DMPNN 9 | from .lrp import LRP 10 | from .dmplrp import DMPLRP 11 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/cnn.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .basemodel import EdgeSeqModel 6 | from .container import * 7 | # from ..utils.act import map_activation_str_to_layer 8 | # from ..utils.init import init_weight, init_module 9 | from utils.act import map_activation_str_to_layer 10 | from utils.init import init_weight, init_module 11 | 12 | 13 | class CNNLayer(nn.Module): 14 | def __init__( 15 | self, 16 | in_channels, 17 | out_channels, 18 | kernel_size, 19 | padding=-1, 20 | stride=1, 21 | groups=1, 22 | dilation=1, 23 | batch_norm=True, 24 | act_func="relu", 25 | dropout=0.0 26 | ): 27 | super(CNNLayer, self).__init__() 28 | self.in_channels = in_channels 29 | self.out_channels = out_channels 30 | if padding == -1: 31 | padding = kernel_size//2 32 | 33 | self.conv = nn.Conv1d( 34 | in_channels, out_channels, 35 | kernel_size=kernel_size, padding=padding, 36 | stride=stride, groups=groups, dilation=dilation 37 | ) 38 | self.act = map_activation_str_to_layer(act_func, inplace=True) 39 | self.pool = nn.MaxPool1d( 40 | kernel_size=kernel_size//stride, stride=1, padding=padding 41 | ) 42 | if batch_norm: 43 | self.bn = nn.BatchNorm1d(out_channels) 44 | else: 45 | self.bn = None 46 | self.drop = nn.Dropout(dropout) 47 | 48 | # init 49 | init_module(self.conv, init="normal", activation=act_func) 50 | 51 | def forward(self, x): 52 | o = self.conv(x) 53 | o = self.act(o) 54 | o = self.pool(o) 55 | if self.bn is not None: 56 | o = self.bn(o) 57 | o = self.drop(o) 58 | 59 
| return o 60 | 61 | def get_output_dim(self): 62 | return self.out_channels 63 | 64 | def extra_repr(self): 65 | "" 66 | 67 | 68 | class CNN(EdgeSeqModel): 69 | def __init__(self, **kw): 70 | super(CNN, self).__init__(**kw) 71 | 72 | def create_rep_net(self, type, **kw): 73 | if type == "graph": 74 | num_layers = kw.get("rep_num_graph_layers", 1) 75 | elif type == "pattern": 76 | if self.share_rep_net: 77 | return self.g_rep_net 78 | num_layers = kw.get("rep_num_pattern_layers", 1) 79 | act_func = kw.get("rep_act_func", "relu") 80 | dropout = kw.get("rep_dropout", 0.0) 81 | 82 | batch_norm = kw.get("rep_cnn_batch_norm", True) 83 | kernel_sizes = kw.get("rep_cnn_kernel_sizes", 2) 84 | paddings = kw.get("rep_cnn_paddings", -1) 85 | strides = kw.get("rep_cnn_strides", 1) 86 | 87 | if isinstance(kernel_sizes, int): 88 | kernel_sizes = [kernel_sizes] * num_layers 89 | if isinstance(paddings, int): 90 | paddings = [paddings] * num_layers 91 | if isinstance(strides, int): 92 | strides = [strides] * num_layers 93 | 94 | cnn = ModuleList() 95 | for i in range(num_layers): 96 | cnn.add_module( 97 | "%s_cnn_(%d)" % (type, i), 98 | CNNLayer( 99 | self.hid_dim, 100 | self.hid_dim, 101 | kernel_size=kernel_sizes[i], 102 | padding=paddings[i], 103 | stride=strides[i], 104 | batch_norm=batch_norm, 105 | act_func=act_func, 106 | dropout=dropout 107 | ) 108 | ) 109 | 110 | return ModuleDict({"cnn": cnn}) 111 | 112 | def get_pattern_rep(self, p_emb, mask=None): 113 | if mask is None: 114 | outputs = [p_emb.transpose(1, 2)] 115 | for layer in self.p_rep_net["cnn"]: 116 | o = layer(outputs[-1]) 117 | if self.rep_residual and o.size() == outputs[-1].size(): 118 | outputs.append(outputs[-1] + o) 119 | else: 120 | outputs.append(o) 121 | for i in range(len(outputs)): 122 | outputs[i] = outputs[i].transpose(1, 2) 123 | else: 124 | gate = mask.float().transpose(1, 2) 125 | outputs = [p_emb.transpose(1, 2) * gate] 126 | for layer in self.p_rep_net["cnn"]: 127 | gate = F.max_pool1d( 128 | gate, 129 | kernel_size=layer.conv.kernel_size, 130 | stride=layer.conv.stride, 131 | padding=layer.conv.padding, 132 | dilation=layer.conv.dilation 133 | ) 134 | gate = F.max_pool1d( 135 | gate, 136 | kernel_size=layer.pool.kernel_size, 137 | stride=layer.pool.stride, 138 | padding=layer.pool.padding, 139 | dilation=layer.pool.dilation 140 | ) 141 | o = layer(outputs[-1]) 142 | o = o * gate 143 | if self.rep_residual and o.size() == outputs[-1].size(): 144 | outputs.append(outputs[-1] + o) 145 | else: 146 | outputs.append(o) 147 | for i in range(len(outputs)): 148 | outputs[i] = outputs[i].transpose(1, 2) 149 | 150 | return outputs[-1] 151 | 152 | def get_graph_rep(self, g_emb, mask=None, gate=None): 153 | if mask is None and gate is None: 154 | outputs = [g_emb.transpose(1, 2)] 155 | for layer in self.g_rep_net["cnn"]: 156 | o = layer(outputs[-1]) 157 | if self.rep_residual and o.size() == outputs[-1].size(): 158 | outputs.append(outputs[-1] + o) 159 | else: 160 | outputs.append(o) 161 | for i in range(len(outputs)): 162 | outputs[i] = outputs[i].transpose(1, 2) 163 | else: 164 | gate = ((mask.float() if mask is not None else 1) * (gate if gate is not None else 1)).transpose(1, 2) 165 | outputs = [g_emb.transpose(1, 2) * gate] 166 | for layer in self.g_rep_net["cnn"]: 167 | gate = F.max_pool1d( 168 | gate, 169 | kernel_size=layer.conv.kernel_size, 170 | stride=layer.conv.stride, 171 | padding=layer.conv.padding, 172 | dilation=layer.conv.dilation 173 | ) 174 | gate = F.max_pool1d( 175 | gate, 176 | 
kernel_size=layer.pool.kernel_size, 177 | stride=layer.pool.stride, 178 | padding=layer.pool.padding, 179 | dilation=layer.pool.dilation 180 | ) 181 | o = layer(outputs[-1]) 182 | o = o * gate 183 | if self.rep_residual and o.size() == outputs[-1].size(): 184 | outputs.append(outputs[-1] + o) 185 | else: 186 | outputs.append(o) 187 | for i in range(len(outputs)): 188 | outputs[i] = outputs[i].transpose(1, 2) 189 | 190 | return outputs[-1] 191 | 192 | def refine_edge_weights(self, weights, use_max=False): 193 | if weights is None: 194 | return None 195 | dim = weights.dim() 196 | dtype = weights.dtype 197 | if dim == 2: 198 | weights = weights.unsqueeze(-1) 199 | weights = weights.transpose(1, 2).float() 200 | if use_max: 201 | for layer in self.g_rep_net["cnn"]: 202 | if isinstance(layer, CNNLayer): 203 | weights = F.max_pool1d( 204 | weights, 205 | kernel_size=layer.conv.kernel_size, 206 | stride=layer.conv.stride, 207 | padding=layer.conv.padding, 208 | dilation=layer.conv.dilation, 209 | ) 210 | weights = F.max_pool1d( 211 | weights, 212 | kernel_size=layer.pool.kernel_size, 213 | stride=layer.pool.stride, 214 | padding=layer.pool.padding, 215 | dilation=layer.pool.dilation, 216 | ) 217 | else: 218 | for layer in self.g_rep_net["cnn"]: 219 | if isinstance(layer, CNNLayer): 220 | weights = sum(layer.conv.kernel_size) * F.avg_pool1d( 221 | weights, 222 | kernel_size=layer.conv.kernel_size, 223 | stride=layer.conv.stride, 224 | padding=layer.conv.padding, 225 | # dilation=layer.conv.dilation, 226 | ) 227 | weights = F.max_pool1d( 228 | weights, 229 | kernel_size=layer.pool.kernel_size, 230 | stride=layer.pool.stride, 231 | padding=layer.pool.padding, 232 | dilation=layer.pool.dilation, 233 | ) 234 | weights = weights.transpose(1, 2) 235 | if dim == 2: 236 | weights = weights.squeeze(-1) 237 | return weights.to(dtype) 238 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/dmpnn.py: -------------------------------------------------------------------------------- 1 | import dgl.function as fn 2 | import torch as th 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .basemodel import GraphAdjModelV2 7 | from .container import * 8 | # from ..constants import * 9 | # from ..utils.act import map_activation_str_to_layer 10 | # from ..utils.init import init_weight, init_module 11 | from constants import * 12 | from utils.act import map_activation_str_to_layer 13 | from utils.init import init_weight, init_module 14 | 15 | 16 | class DMPLayer(nn.Module): 17 | def __init__( 18 | self, 19 | input_dim, 20 | hidden_dim, 21 | init_neigenv=4.0, # empirical value of triangles 22 | init_eeigenv=4.0, # empirical value of triangles 23 | bias=True, 24 | num_mlp_layers=2, 25 | batch_norm=True, 26 | act_func="relu", 27 | dropout=0.0 28 | ): 29 | super(DMPLayer, self).__init__() 30 | self.input_dim = input_dim 31 | self.hidden_dim = hidden_dim 32 | 33 | self.in_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 34 | self.out_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 35 | self.src_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 36 | self.dst_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 37 | self.nloop_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 38 | self.eloop_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 39 | if bias: 40 | self.nbias = nn.Parameter(th.Tensor(hidden_dim)) 41 | self.ebias = nn.Parameter(th.Tensor(hidden_dim)) 42 | else: 43 | 
self.register_parameter("nbias", None) 44 | self.register_parameter("ebias", None) 45 | self.nmlp = [] 46 | for i in range(num_mlp_layers): 47 | self.nmlp.append(nn.Linear(hidden_dim, hidden_dim)) 48 | if i != num_mlp_layers - 1: 49 | if batch_norm: 50 | self.nmlp.append(nn.BatchNorm1d(hidden_dim)) 51 | self.nmlp.append(map_activation_str_to_layer(act_func)) 52 | self.nmlp = Sequential(*self.nmlp) 53 | self.emlp = [] 54 | for i in range(num_mlp_layers): 55 | self.emlp.append(nn.Linear(hidden_dim, hidden_dim)) 56 | if i != num_mlp_layers - 1: 57 | if batch_norm: 58 | self.emlp.append(nn.BatchNorm1d(hidden_dim)) 59 | self.emlp.append(map_activation_str_to_layer(act_func)) 60 | self.emlp = Sequential(*self.emlp) 61 | self.act = map_activation_str_to_layer(act_func) 62 | self.drop = nn.Dropout(dropout) 63 | 64 | # init 65 | init_weight(self.in_weight, activation=act_func, init="uniform") 66 | init_weight(self.out_weight, activation=act_func, init="uniform") 67 | init_weight(self.src_weight, activation=act_func, init="uniform") 68 | init_weight(self.dst_weight, activation=act_func, init="uniform") 69 | init_weight(self.nloop_weight, activation=act_func, init="uniform") 70 | init_weight(self.eloop_weight, activation=act_func, init="uniform") 71 | for module in self.nmlp.modules(): 72 | init_module(module, activation=act_func, init="uniform") 73 | for module in self.emlp.modules(): 74 | init_module(module, activation=act_func, init="uniform") 75 | if bias: 76 | nn.init.zeros_(self.nbias) 77 | nn.init.zeros_(self.ebias) 78 | 79 | # reparamerization tricks 80 | with th.no_grad(): 81 | self.in_weight.data.div_(init_neigenv) 82 | self.out_weight.data.div_(init_neigenv) 83 | self.nloop_weight.data.div_(init_neigenv) 84 | self.src_weight.data.div_(init_eeigenv) 85 | self.dst_weight.data.div_(init_eeigenv) 86 | self.eloop_weight.data.div_(init_eeigenv) 87 | 88 | # register functions 89 | self.node_init_func = self._node_init_func 90 | self.edge_init_func = self._edge_init_func 91 | self.node_message_func = self._node_message_func 92 | self.node_reduce_func = fn.sum(msg=NODEMSG, out=NODEAGG) 93 | self.node_update_func = self._node_update_func 94 | self.edge_update_func = self._edge_update_func 95 | 96 | def _node_init_func(self, graph, node_feat=None): 97 | if node_feat is not None: 98 | graph.ndata[NODEFEAT] = node_feat 99 | 100 | if OUTDEGREE not in graph.ndata: 101 | graph.ndata[OUTDEGREE] = graph.out_degrees() 102 | 103 | return graph.ndata[NODEFEAT] 104 | 105 | def _edge_init_func(self, graph, edge_feat=None): 106 | if edge_feat is not None: 107 | graph.edata[EDGEFEAT] = edge_feat 108 | 109 | return graph.edata[EDGEFEAT] 110 | 111 | def _node_message_func(self, edges): 112 | edge_msg = th.matmul(edges.dst[NODEFEAT], self.dst_weight) - th.matmul(edges.src[NODEFEAT], self.src_weight) 113 | node_msg = -th.matmul(edges.data[EDGEFEAT], self.in_weight) 114 | 115 | # no need to half them further 116 | if REVFLAG in edges.data: 117 | rmask = edges.data[REVFLAG].view(-1, 1) 118 | mask = ~(rmask) 119 | 120 | rev_edge_msg = th.matmul(edges.src[NODEFEAT], self.dst_weight) - th.matmul(edges.dst[NODEFEAT], self.src_weight) 121 | rev_node_msg = th.matmul(edges.data[EDGEFEAT], self.out_weight) 122 | 123 | edge_msg = edge_msg.masked_fill(rmask, 0.0) + rev_edge_msg.masked_fill(mask, 0.0) 124 | node_msg = node_msg.masked_fill(rmask, 0.0) + rev_node_msg.masked_fill(mask, 0.0) 125 | 126 | edges.data[EDGEAGG] = edge_msg 127 | return {NODEMSG: node_msg} 128 | 129 | def _node_update_func(self, nodes): 130 | agg = 
nodes.data[NODEAGG] 131 | out = th.matmul(nodes.data[NODEFEAT], self.nloop_weight) + agg 132 | if self.nbias is not None: 133 | out = out + self.nbias 134 | if len(self.nmlp) > 0: 135 | out = self.nmlp(out) 136 | else: 137 | out = self.act(out) 138 | out = self.drop(out) 139 | 140 | return {NODEOUTPUT: out} 141 | 142 | def _edge_update_func(self, edges): 143 | agg = edges.data[EDGEAGG] 144 | d = edges.dst[OUTDEGREE].unsqueeze(-1).float() 145 | d = (1 + d).log2() # avoid nan ... 146 | add = 2 * (1 + d) * th.matmul(edges.data[EDGEFEAT], (self.src_weight - self.dst_weight)) 147 | out = th.matmul(edges.data[EDGEFEAT], self.eloop_weight) + add + agg 148 | if self.ebias is not None: 149 | out = out + self.ebias 150 | if len(self.emlp) > 0: 151 | out = self.emlp(out) 152 | else: 153 | out = self.act(out) 154 | out = self.drop(out) 155 | 156 | return {EDGEOUTPUT: out} 157 | 158 | def forward(self, graph, node_feat, edge_feat): 159 | # g = graph.local_var() 160 | g = graph 161 | self.node_init_func(g, node_feat) 162 | self.edge_init_func(g, edge_feat) 163 | g.update_all(self.node_message_func, self.node_reduce_func, self.node_update_func) 164 | g.apply_edges(self.edge_update_func) 165 | 166 | return g.ndata.pop(NODEOUTPUT), g.edata.pop(EDGEOUTPUT) 167 | 168 | def extra_repr(self): 169 | summary = [ 170 | "in=%s, out=%s" % (self.input_dim, self.hidden_dim), 171 | ] 172 | 173 | return "\n".join(summary) 174 | 175 | def get_output_dim(self): 176 | return self.hidden_dim 177 | 178 | 179 | class DMPNN(GraphAdjModelV2): 180 | def __init__(self, **kw): 181 | super(DMPNN, self).__init__(**kw) 182 | 183 | def create_rep_net(self, type, **kw): 184 | if type == "graph": 185 | num_layers = kw.get("rep_num_graph_layers", 1) 186 | elif type == "pattern": 187 | if self.share_rep_net: 188 | return self.g_rep_net 189 | num_layers = kw.get("rep_num_pattern_layers", 1) 190 | init_neigenv = kw.get("init_neigenv", 4.0) 191 | init_eeigenv = kw.get("init_eeigenv", 4.0) 192 | num_mlp_layers = kw.get("rep_dmpnn_num_mlp_layers", 2) 193 | batch_norm = kw.get("rep_dmpnn_batch_norm", False) 194 | act_func = kw.get("rep_act_func", "relu") 195 | dropout = kw.get("rep_dropout", 0.0) 196 | 197 | dmpnn = ModuleList() 198 | for i in range(num_layers): 199 | dmpnn.add_module( 200 | "%s_dmpnn_(%d)" % (type, i), 201 | DMPLayer( 202 | self.hid_dim, 203 | self.hid_dim, 204 | init_neigenv=init_neigenv, 205 | init_eeigenv=init_eeigenv, 206 | num_mlp_layers=num_mlp_layers, 207 | batch_norm=batch_norm, 208 | act_func=act_func, 209 | dropout=dropout 210 | ) 211 | ) 212 | 213 | return ModuleDict({"dmpnn": dmpnn}) 214 | 215 | def get_pattern_rep(self, pattern, p_v_emb, p_e_emb, v_mask=None, e_mask=None): 216 | if v_mask is not None: 217 | p_v_zero_mask = ~(v_mask) 218 | v_outputs = [p_v_emb.masked_fill(p_v_zero_mask, 0.0)] 219 | else: 220 | p_v_zero_mask = None 221 | v_outputs = [p_v_emb] 222 | 223 | if e_mask is not None: 224 | p_e_zero_mask = ~(e_mask) 225 | e_outputs = [p_e_emb.masked_fill(p_e_zero_mask, 0.0)] 226 | else: 227 | p_e_zero_mask = None 228 | e_outputs = [p_e_emb] 229 | 230 | for layer in self.p_rep_net["dmpnn"]: 231 | v, e = layer(pattern, v_outputs[-1], e_outputs[-1]) 232 | if p_v_zero_mask is not None: 233 | v = v.masked_fill(p_v_zero_mask, 0.0) 234 | if p_e_zero_mask is not None: 235 | e = e.masked_fill(p_e_zero_mask, 0.0) 236 | if self.rep_residual and v_outputs[-1].size() == v.size() and e_outputs[-1].size() == e.size(): 237 | v_outputs.append(v_outputs[-1] + v) 238 | e_outputs.append(e_outputs[-1] + e) 239 | else: 240 | 
v_outputs.append(v) 241 | e_outputs.append(e) 242 | 243 | return v_outputs[-1], e_outputs[-1] 244 | 245 | def get_graph_rep(self, graph, g_v_emb, g_e_emb, v_mask=None, e_mask=None, v_gate=None, e_gate=None): 246 | if v_mask is not None or v_gate is not None: 247 | if v_gate is None: 248 | v_gate = v_mask.float() 249 | elif v_mask is not None: 250 | v_gate = v_mask.float() * v_gate 251 | v_outputs = [g_v_emb * v_gate] 252 | else: 253 | v_outputs = [g_v_emb] 254 | 255 | if e_mask is not None or e_gate is not None: 256 | if e_gate is None: 257 | e_gate = e_mask.float() 258 | elif e_mask is not None: 259 | e_gate = e_mask.float() * e_gate 260 | e_outputs = [g_e_emb * e_gate] 261 | else: 262 | e_outputs = [g_e_emb] 263 | 264 | for layer in self.g_rep_net["dmpnn"]: 265 | v, e = layer(graph, v_outputs[-1], e_outputs[-1]) 266 | if v_gate is not None: 267 | v = v * v_gate 268 | if e_gate is not None: 269 | e = e * e_gate 270 | if self.rep_residual and v_outputs[-1].size() == v.size() and e_outputs[-1].size() == e.size(): 271 | v_outputs.append(v_outputs[-1] + v) 272 | e_outputs.append(e_outputs[-1] + e) 273 | else: 274 | v_outputs.append(v) 275 | e_outputs.append(e) 276 | 277 | return v_outputs[-1], e_outputs[-1] 278 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/embed.py: -------------------------------------------------------------------------------- 1 | 2 | import numba 3 | import numpy as np 4 | import torch as th 5 | import torch.nn as nn 6 | 7 | 8 | @numba.jit(numba.int64[:](numba.int64[:], numba.int64), nopython=True) 9 | def _get_enc_len(x, base=10): 10 | lens = np.zeros((len(x), ), dtype=np.int64) 11 | for i, n in enumerate(x): 12 | cnt = 0 13 | while n > 0: 14 | n = n // base 15 | cnt += 1 16 | # avoid 0 length 17 | if cnt == 0: 18 | cnt = 1 19 | lens[i] = cnt 20 | return lens 21 | 22 | 23 | def get_enc_len(x, base=10): 24 | if isinstance(x, int): 25 | return _get_enc_len(np.array([x], dtype=np.int64), base)[0] 26 | elif isinstance(x, float): 27 | return _get_enc_len(np.array([int(x)], dtype=np.int64), base)[0] 28 | if isinstance(x, th.Tensor): 29 | x = x.numpy() 30 | elif not isinstance(x, np.ndarray): 31 | x = np.array(x) 32 | x = x.astype(np.int64) 33 | x_shape = x.shape 34 | 35 | return _get_enc_len(x.reshape(-1), base).reshape(*x_shape) 36 | 37 | 38 | @numba.jit( 39 | numba.int64[:, :](numba.int64[:], numba.int64, numba.int64), 40 | nopython=True, 41 | nogil=True 42 | ) 43 | def _int2anybase(x, len_x, base): 44 | numbers = np.zeros((len(x), len_x), dtype=np.int64) 45 | for i, n in enumerate(x): 46 | n = n % base**len_x 47 | idx = len_x - 1 48 | while n: 49 | numbers[i, idx] = n % base 50 | n = n // base 51 | idx -= 1 52 | 53 | return numbers 54 | 55 | 56 | def int2anybase(x, len_x, base=10): 57 | if isinstance(x, int): 58 | return _int2anybase(np.array([x], dtype=np.int64), len_x, base)[0] 59 | elif isinstance(x, float): 60 | return _int2anybase(np.array([int(x)], dtype=np.int64), len_x, base)[0] 61 | if isinstance(x, th.Tensor): 62 | x = x.numpy() 63 | elif not isinstance(x, np.ndarray): 64 | x = np.array(x) 65 | x = x.astype(np.int64) 66 | 67 | return _int2anybase(x, len_x, base) 68 | 69 | 70 | @numba.jit( 71 | numba.int64[:, :](numba.int64[:], numba.int64, numba.int64), 72 | nopython=True, 73 | nogil=True 74 | ) 75 | def _int2multihot(x, len_x, base): 76 | rep = np.zeros((len(x), len_x * base), dtype=np.int64) 77 | for i, n in enumerate(x): 78 | n = n % base**len_x 79 | idx = (len_x - 1) * base 80 | while n: 81 | 
rep[i, idx + n % base] = 1 82 | n = n // base 83 | idx -= base 84 | while idx >= 0: 85 | rep[i, idx] = 1 86 | idx -= base 87 | return rep 88 | 89 | 90 | def int2multihot(x, len_x, base=10): 91 | if isinstance(x, int): 92 | return _int2multihot(np.array([x], dtype=np.int64), len_x, base)[0] 93 | elif isinstance(x, float): 94 | return _int2multihot(np.array([int(x)], dtype=np.int64), len_x, base)[0] 95 | if isinstance(x, th.Tensor): 96 | x = x.numpy() 97 | elif not isinstance(x, np.ndarray): 98 | x = np.array(x) 99 | x = x.astype(np.int64) 100 | 101 | return _int2multihot(x, len_x, base) 102 | 103 | 104 | 105 | class Embedding(nn.Embedding): 106 | def __init__(self, num_embeddings, embedding_dim, **kw): 107 | super(Embedding, self).__init__(num_embeddings, embedding_dim, **kw) 108 | 109 | def forward(self, x): 110 | if x.dtype == th.long: 111 | emb = super(Embedding, self).forward(x) 112 | elif x.dtype == th.float and x.size(-1) == self.num_embeddings: 113 | x_size = x.size() 114 | emb = th.matmul(x.view(-1, x_size[-1]), self.weight) 115 | emb = emb.view(x_size[:-1] + (self.embedding_dim, )) 116 | else: 117 | raise NotImplementedError 118 | return emb 119 | 120 | def get_output_dim(self): 121 | return self.embedding_dim 122 | 123 | 124 | class NormalEmbedding(Embedding): 125 | def __init__(self, num_embeddings, embedding_dim, **kw): 126 | super(NormalEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 127 | 128 | # init 129 | nn.init.normal_(self.weight, 0.0, 1.0) 130 | if self.padding_idx is not None: 131 | with th.no_grad(): 132 | self.weight[self.padding_idx].fill_(0) 133 | 134 | 135 | class UniformEmbedding(Embedding): 136 | def __init__(self, num_embeddings, embedding_dim, **kw): 137 | super(UniformEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 138 | 139 | # init 140 | nn.init.uniform_(self.weight, -1.0, 1.0) 141 | if self.padding_idx is not None: 142 | with th.no_grad(): 143 | self.weight[self.padding_idx].fill_(0) 144 | 145 | 146 | class OrthogonalEmbedding(Embedding): 147 | def __init__(self, num_embeddings, embedding_dim, **kw): 148 | super(OrthogonalEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 149 | 150 | # init 151 | nn.init.orthogonal_(self.weight) 152 | if self.padding_idx is not None: 153 | with th.no_grad(): 154 | self.weight[self.padding_idx].fill_(0) 155 | 156 | 157 | """ 158 | Ravanbakhsh, S.; Schneider, J.; and Poczos, B. 159 | Equivariance Through Parameter-Sharing. 160 | In Proceedings of International Conference on Machine Learning, volume 70, of JMLR: W&CP, August 2017. 
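As implemented in the class below, every row of the embedding matrix is a circular shift of one shared learnable vector (weight[i] = th.roll(row_vec, i, 0)), so the whole table is generated from a single set of parameters, following the parameter-sharing scheme of the reference above.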
161 | """ 162 | class EquivariantEmbedding(Embedding): 163 | def __init__(self, num_embeddings, embedding_dim, **kw): 164 | super(EquivariantEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 165 | 166 | self.row_vec = nn.Parameter(th.Tensor(self.embedding_dim, )) 167 | 168 | # init 169 | self.allow_forward = True 170 | nn.init.normal_(self.row_vec, 0.0, 1.0) 171 | with th.no_grad(): 172 | for i in range(num_embeddings): 173 | self.weight[i].data.copy_(th.roll(self.row_vec, i, 0)) 174 | 175 | def forward(self, x): 176 | if not self.allow_forward: 177 | with th.no_grad(): 178 | for i in range(self.num_embeddings): 179 | self.weight[i] = th.roll(self.row_vec, i, 0) 180 | self.allow_forward = True 181 | 182 | if x.dtype == th.long: 183 | emb = super(EquivariantEmbedding, self).forward(x) 184 | elif x.dtype == th.float and x.size(-1) == self.num_embeddings: 185 | x_size = x.size() 186 | emb = th.mm(x.view(-1, x_size[-1]), self.weight) 187 | emb = emb.view(x_size[:-1] + (self.embedding_dim, )) 188 | else: 189 | raise NotImplementedError 190 | return emb 191 | 192 | def backward(self, x): 193 | self.allow_forward = False 194 | return super(EquivariantEmbedding, self).backward(x) 195 | 196 | 197 | class MultihotEmbedding(Embedding): 198 | def __init__(self, max_n=1024, base=2): 199 | self.max_n = max_n 200 | self.base = base 201 | 202 | enc_len = get_enc_len(max_n-1, base) 203 | super(MultihotEmbedding, self).__init__(max_n, 2*enc_len) 204 | with th.no_grad(): 205 | self.weight.data.copy_(th.from_numpy(int2multihot(np.arange(0, max_n), enc_len, base)).float()) 206 | 207 | def extra_repr(self): 208 | return "base=%d, max_n=%d, enc_dim=%d" % (self.base, self.max_n, self.weight.shape[1]) 209 | 210 | 211 | class PositionEmbedding(Embedding): 212 | def __init__(self, embedding_dim, max_len=512, scale=1): 213 | 214 | freq_seq = th.arange(0, embedding_dim, 2.0, dtype=th.float) 215 | inv_freq = th.pow(10000, (freq_seq / embedding_dim)).reciprocal() 216 | sinusoid_inp = th.ger(th.arange(0, max_len, 1.0), inv_freq) 217 | super(PositionEmbedding, self).__init__(max_len, embedding_dim) 218 | with th.no_grad(): 219 | self.weight.data.copy_(th.cat([th.sin(sinusoid_inp), th.cos(sinusoid_inp)], dim=-1) * scale) 220 | 221 | def extra_repr(self): 222 | return "embedding_dim=%d, max_len=%d" % (self.weight.shape[1], self.weight.shape[0]) 223 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/filter.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ScalarFilter(nn.Module): 7 | def __init__(self): 8 | super(ScalarFilter, self).__init__() 9 | 10 | def forward(self, p_x, g_x): 11 | """ 12 | input should be scalar: bsz x l1, bsz x l2 13 | return bsz x l2 14 | """ 15 | matrix = g_x.unsqueeze(2) - p_x.unsqueeze(1) # bsz x l1 x l2 16 | return th.max(matrix == 0, dim=2)[0] 17 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/rgin.py: -------------------------------------------------------------------------------- 1 | import dgl.function as fn 2 | import torch as th 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .basemodel import GraphAdjModel 7 | from .container import * 8 | # from ..constants import * 9 | # from ..utils.act import map_activation_str_to_layer 10 | # from ..utils.init import init_weight, init_module 11 | 
from constants import * 12 | from utils.act import map_activation_str_to_layer 13 | from utils.init import init_weight, init_module 14 | 15 | 16 | class RGINLayer(nn.Module): 17 | def __init__( 18 | self, 19 | input_dim, 20 | hidden_dim, 21 | num_rels=1, 22 | regularizer="basis", 23 | num_bases=-1, 24 | num_mlp_layers=2, 25 | self_loop=True, 26 | bias=True, 27 | batch_norm=False, 28 | act_func="relu", 29 | dropout=0.0, 30 | ): 31 | super(RGINLayer, self).__init__() 32 | assert regularizer in ["none", "basis", "bdd"] 33 | 34 | self.input_dim = input_dim 35 | self.hidden_dim = hidden_dim 36 | self.num_rels = num_rels 37 | self.regularizer = regularizer 38 | if regularizer == "none" or num_bases is None or num_bases > num_rels or num_bases <= 0: 39 | self.num_bases = num_rels 40 | else: 41 | self.num_bases = num_bases 42 | if self_loop: 43 | self.loop_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 44 | else: 45 | self.register_parameter("loop_weight", None) 46 | if bias: 47 | self.bias = nn.Parameter(th.Tensor(hidden_dim)) 48 | else: 49 | self.register_parameter("bias", None) 50 | self.mlp = [] 51 | for i in range(num_mlp_layers): 52 | self.mlp.append(nn.Linear(hidden_dim, hidden_dim)) 53 | if i != num_mlp_layers - 1: 54 | if batch_norm: 55 | self.mlp.append(nn.BatchNorm1d(hidden_dim)) 56 | self.mlp.append(map_activation_str_to_layer(act_func)) 57 | self.mlp = Sequential(*self.mlp) 58 | self.act = map_activation_str_to_layer(act_func) 59 | self.drop = nn.Dropout(dropout) 60 | 61 | if regularizer == "none" or regularizer == "basis": 62 | # add basis weights 63 | self.weight = nn.Parameter(th.Tensor(self.num_bases, self.input_dim, self.hidden_dim)) 64 | if self.num_bases < self.num_rels: 65 | # linear combination coefficients 66 | self.w_comp = nn.Parameter(th.Tensor(self.num_rels, self.num_bases)) 67 | else: 68 | self.register_parameter("w_comp", None) 69 | elif regularizer == "bdd": 70 | if input_dim % self.num_bases != 0 or hidden_dim % self.num_bases != 0: 71 | raise ValueError('Feature size must be a multiplier of num_bases (%d).' 
% self.num_bases) 72 | # add block diagonal weights 73 | submat_in = input_dim // self.num_bases 74 | submat_out = hidden_dim // self.num_bases 75 | 76 | # assuming input_dim and hidden_dim are both divisible by num_bases 77 | self.weight = nn.Parameter(th.Tensor(self.num_rels, self.num_bases * submat_in * submat_out)) 78 | self.register_parameter("w_comp", None) 79 | else: 80 | raise ValueError("Regularizer must be either 'basis' or 'bdd'") 81 | 82 | # init 83 | init_weight(self.weight, activation=act_func, init="uniform") 84 | if self.w_comp is not None: 85 | init_weight(self.w_comp, activation=act_func, init="uniform") 86 | if self_loop: 87 | init_weight(self.loop_weight, activation=act_func, init="uniform") 88 | nn.init.zeros_(self.bias) 89 | 90 | self.node_init_func = self._node_init_func 91 | self.edge_init_func = self._edge_init_func 92 | if regularizer == "none" or regularizer == "basis": 93 | self.node_message_func = self._basis_message_func 94 | elif regularizer == "bdd": 95 | self.node_message_func = self._bdd_message_func 96 | else: 97 | raise ValueError("Regularizer must be either 'basis' or 'bdd'") 98 | self.node_reduce_func = fn.sum(msg=NODEMSG, out=NODEAGG) 99 | self.node_update_func = self._node_update_func 100 | self.edge_update_func = None 101 | 102 | def _basis_message_func(self, edges): 103 | if self.num_bases < self.num_rels: 104 | # generate all weights from bases 105 | weight = self.weight.view(self.num_bases, self.input_dim * self.hidden_dim) 106 | weight = th.matmul(self.w_comp, weight).view(self.num_rels, self.input_dim, self.hidden_dim) 107 | else: 108 | weight = self.weight 109 | weight = weight.index_select(0, edges.data[EDGETYPE]) 110 | msg = th.bmm(edges.src[NODEFEAT].unsqueeze(1), weight).squeeze(1) 111 | 112 | return {NODEMSG: msg} 113 | 114 | def _bdd_message_func(self, edges): 115 | submat_in = self.input_dim // self.num_bases 116 | submat_out = self.hidden_dim // self.num_bases 117 | weight = self.weight.index_select(0, edges.data[EDGETYPE]).view(-1, submat_in, submat_out) 118 | msg = th.bmm(edges.src[NODEFEAT].view(-1, 1, submat_in), weight).view(-1, self.hidden_dim) 119 | 120 | return {NODEMSG: msg} 121 | 122 | @property 123 | def self_loop(self): 124 | return hasattr(self, "loop_weight") and self.loop_weight is not None 125 | 126 | def _node_init_func(self, graph, node_feat=None): 127 | if node_feat is not None: 128 | graph.ndata[NODEFEAT] = node_feat 129 | return node_feat 130 | 131 | def _edge_init_func(self, graph, edge_type=None): 132 | if edge_type is not None: 133 | graph.edata[EDGETYPE] = edge_type 134 | 135 | return edge_type 136 | 137 | def _node_update_func(self, nodes): 138 | agg = nodes.data[NODEAGG] 139 | 140 | if self.self_loop: 141 | loop_msg = th.matmul(nodes.data[NODEFEAT], self.loop_weight) 142 | out = agg + loop_msg 143 | else: 144 | out = agg 145 | if self.bias is not None: 146 | out = out + self.bias 147 | if len(self.mlp) > 0: 148 | out = self.mlp(out) 149 | else: 150 | out = self.act(out) 151 | out = self.act(out) 152 | out = self.drop(out) 153 | 154 | return {NODEOUTPUT: out} 155 | 156 | def forward(self, g, node_feat, edge_type): 157 | self.node_init_func(g, node_feat) 158 | self.edge_init_func(g, edge_type) 159 | g.update_all(self.node_message_func, self.node_reduce_func, self.node_update_func) 160 | return g.ndata.pop(NODEOUTPUT), edge_type 161 | 162 | def get_output_dim(self): 163 | return self.hidden_dim 164 | 165 | def extra_repr(self): 166 | summary = [ 167 | "in=%d, out=%d," % (self.input_dim, self.hidden_dim), 168 | 
"num_rels=%d, regularizer=%s, num_bases=%d," % (self.num_rels, self.regularizer, self.num_bases), 169 | "edge_norm=%s, self_loop=%s, bias=%s," % (self.edge_norm, self.self_loop, self.bias is not None), 170 | ] 171 | 172 | return "\n".join(summary) 173 | 174 | 175 | class RGIN(GraphAdjModel): 176 | def __init__(self, **kw): 177 | super(RGIN, self).__init__(**kw) 178 | 179 | def create_rep_net(self, type, **kw): 180 | if type == "graph": 181 | num_layers = kw.get("rep_num_graph_layers", 1) 182 | num_rels = self.max_ngel 183 | elif type == "pattern": 184 | if self.share_rep_net: 185 | return self.g_rep_net 186 | num_layers = kw.get("rep_num_pattern_layers", 1) 187 | num_rels = self.max_npel 188 | regularizer = kw.get("rep_rgin_regularizer", "basis") 189 | num_bases = kw.get("rep_rgin_num_bases", -1) 190 | num_mlp_layers = kw.get("rep_rgin_num_mlp_layers", 2) 191 | batch_norm = kw.get("rep_rgin_batch_norm", False) 192 | act_func = kw.get("rep_act_func", "relu") 193 | dropout = kw.get("rep_dropout", 0.0) 194 | 195 | rgin = ModuleList() 196 | for i in range(num_layers): 197 | rgin.add_module( 198 | "%s_rgin_(%d)" % (type, i), 199 | RGINLayer( 200 | self.hid_dim, 201 | self.hid_dim, 202 | num_rels=num_rels, 203 | regularizer=regularizer, 204 | num_bases=num_bases, 205 | num_mlp_layers=num_mlp_layers, 206 | batch_norm=batch_norm, 207 | act_func=act_func, 208 | dropout=dropout 209 | ) 210 | ) 211 | 212 | return ModuleDict({"rgin": rgin}) 213 | 214 | def get_pattern_rep(self, pattern, p_emb, mask=None): 215 | if mask is not None: 216 | p_zero_mask = ~(mask) 217 | outputs = [p_emb.masked_fill(p_zero_mask, 0.0)] 218 | etype = pattern.edata["label"] 219 | for layer in self.p_rep_net["rgin"]: 220 | o, etype = layer(pattern, outputs[-1], etype) 221 | outputs.append(o.masked_fill(p_zero_mask, 0.0)) 222 | else: 223 | outputs = [p_emb] 224 | etype = pattern.edata["label"] 225 | for layer in self.p_rep_net["rgin"]: 226 | o, etype = layer(pattern, outputs[-1], etype) 227 | if self.rep_residual and outputs[-1].size() == o.size(): 228 | outputs.append(outputs[-1] + o) 229 | else: 230 | outputs.append(o) 231 | 232 | return outputs[-1] 233 | 234 | def get_graph_rep(self, graph, g_emb, mask=None, gate=None): 235 | if mask is None and gate is None: 236 | outputs = [g_emb] 237 | etype = graph.edata["label"] 238 | for layer in self.g_rep_net["rgin"]: 239 | o, etype = layer(graph, outputs[-1], etype) 240 | if self.rep_residual and outputs[-1].size() == o.size(): 241 | outputs.append(outputs[-1] + o) 242 | else: 243 | outputs.append(o) 244 | else: 245 | if gate is None: 246 | gate = mask.float() 247 | elif mask is not None: 248 | gate = mask.float() * gate 249 | 250 | outputs = [g_emb * gate] 251 | etype = graph.edata["label"] 252 | for layer in self.g_rep_net["rgin"]: 253 | o, etype = layer(graph, outputs[-1], etype) 254 | o = o * gate 255 | if self.rep_residual and outputs[-1].size() == o.size(): 256 | outputs.append(outputs[-1] + o) 257 | else: 258 | outputs.append(o) 259 | 260 | return outputs[-1] 261 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/rnn.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .basemodel import EdgeSeqModel 6 | from .container import * 7 | # from ..utils.act import map_activation_str_to_layer 8 | # from ..utils.init import init_weight, init_module 9 | from utils.act import 
map_activation_str_to_layer 10 | from utils.init import init_weight, init_module 11 | 12 | 13 | class RNNLayer(nn.Module): 14 | def __init__(self, rep_rnn_type, input_dim, hid_dim, layer_norm=False, bidirectional=False, dropout=0.0): 15 | super(RNNLayer, self).__init__() 16 | self.input_dim = input_dim 17 | self.hid_dim = hid_dim 18 | if rep_rnn_type == "LSTM": 19 | self.layer = nn.LSTM( 20 | input_dim, hid_dim//2 if bidirectional else hid_dim, 21 | bidirectional=bidirectional, batch_first=True 22 | ) 23 | elif rep_rnn_type == "GRU": 24 | self.layer = nn.GRU( 25 | input_dim, hid_dim//2 if bidirectional else hid_dim, 26 | bidirectional=bidirectional, batch_first=True 27 | ) 28 | elif rep_rnn_type == "RNN": 29 | self.layer = nn.RNN( 30 | input_dim, hid_dim//2 if bidirectional else hid_dim, 31 | bidirectional=bidirectional, batch_first=True 32 | ) 33 | if layer_norm: 34 | self.ln = nn.LayerNorm(hid_dim) 35 | else: 36 | self.ln = None 37 | self.drop = nn.Dropout(dropout) 38 | 39 | # init 40 | init_module(self.layer) 41 | 42 | def forward(self, x): 43 | o = self.layer(x)[0] 44 | if self.ln is not None: 45 | o = self.ln(o) 46 | o = self.drop(o) 47 | 48 | return o 49 | 50 | def get_output_dim(self): 51 | return self.hid_dim - self.hid_dim % 2 52 | 53 | def extra_repr(self): 54 | "" 55 | 56 | 57 | class RNN(EdgeSeqModel): 58 | def __init__(self, **kw): 59 | super(RNN, self).__init__(**kw) 60 | 61 | def create_rep_net(self, type, **kw): 62 | if type == "graph": 63 | num_layers = kw.get("rep_num_graph_layers", 1) 64 | elif type == "pattern": 65 | if self.share_rep_net: 66 | return self.g_rep_net 67 | num_layers = kw.get("rep_num_pattern_layers", 1) 68 | rep_rnn_type = kw.get("rep_rnn_type", "LSTM") 69 | bidirectional = kw.get("rep_rnn_bidirectional", False) 70 | dropout = kw.get("rep_dropout", 0.0) 71 | 72 | rnn = ModuleList() 73 | for i in range(num_layers): 74 | rnn.add_module( 75 | "%s_rnn_(%d)" % (type, i), 76 | RNNLayer( 77 | rep_rnn_type, 78 | self.hid_dim, self.hid_dim, 79 | bidirectional=bidirectional, 80 | dropout=dropout 81 | ) 82 | ) 83 | 84 | return ModuleDict({"rnn": rnn}) 85 | 86 | def get_pattern_rep(self, p_emb, mask=None): 87 | if mask is not None: 88 | p_zero_mask = ~(mask) 89 | outputs = [p_emb.masked_fill(p_zero_mask, 0.0)] 90 | for layer in self.p_rep_net["rnn"]: 91 | o = layer(outputs[-1]) 92 | outputs.append(o.masked_fill(p_zero_mask, 0.0)) 93 | else: 94 | outputs = [p_emb] 95 | for layer in self.p_rep_net["rnn"]: 96 | o = layer(outputs[-1]) 97 | if self.rep_residual and outputs[-1].size() == o.size(): 98 | outputs.append(outputs[-1] + o) 99 | else: 100 | outputs.append(o) 101 | 102 | return outputs[-1] 103 | 104 | def get_graph_rep(self, g_emb, mask=None, gate=None): 105 | if mask is None and gate is None: 106 | outputs = [g_emb] 107 | for layer in self.g_rep_net["rnn"]: 108 | o = layer(outputs[-1]) 109 | if self.rep_residual and outputs[-1].size() == o.size(): 110 | outputs.append(outputs[-1] + o) 111 | else: 112 | outputs.append(o) 113 | else: 114 | gate = ((mask.float() if mask is not None else 1) * (gate if gate is not None else 1)) 115 | outputs = [g_emb * gate] 116 | for layer in self.g_rep_net["rnn"]: 117 | o = layer(outputs[-1]) 118 | o = o * gate 119 | if self.rep_residual and outputs[-1].size() == o.size(): 120 | outputs.append(outputs[-1] + o) 121 | else: 122 | outputs.append(o) 123 | 124 | return outputs[-1] 125 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .act import * 2 | from .anneal import * 3 | from .cyclical import * 4 | from .dl import * 5 | from .init import * 6 | from .io import * 7 | from .log import * 8 | from .sampler import * 9 | from .scheduler import * 10 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/anneal.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | PI = 3.141592653589793 5 | INIT_STEPS = 600 6 | SCHEDULE_STEPS = 10000 7 | NUM_CYCLES = 2 8 | MIN_PERCENT = 1e-3 9 | 10 | 11 | def anneal_fn( 12 | fn, 13 | current_step, 14 | num_init_steps=INIT_STEPS, 15 | num_anneal_steps=SCHEDULE_STEPS, 16 | num_cycles=NUM_CYCLES, 17 | value1=0.0, 18 | value2=1.0 19 | ): 20 | if current_step < num_init_steps: 21 | return anneal_fn( 22 | fn, 23 | current_step, 24 | num_init_steps=0, 25 | num_anneal_steps=num_init_steps * 2, 26 | num_cycles=1, 27 | value1=value2, 28 | value2=value1 29 | ) 30 | if current_step > num_anneal_steps: 31 | return value2 32 | 33 | if not fn or fn == "none" or fn == "constant": 34 | return value2 35 | 36 | progress = float(num_cycles * (current_step - num_init_steps)) / max(1, num_anneal_steps - num_init_steps) % 1 37 | 38 | if fn == "linear": 39 | if progress < 0.5: 40 | return float(value1 + (value2 - value1) * progress * 2) 41 | else: 42 | return value2 43 | elif fn == "cosine": 44 | if progress < 0.5: 45 | return float(value1 + (value2 - value1) * (1 - math.cos(PI * progress * 2)) / 2) 46 | else: 47 | return value2 48 | else: 49 | raise NotImplementedError 50 | 51 | 52 | class AnnealManner: 53 | def __init__( 54 | self, 55 | fn, 56 | current_step=0, 57 | num_init_steps=INIT_STEPS, 58 | num_anneal_steps=SCHEDULE_STEPS, 59 | num_cycles=NUM_CYCLES, 60 | value1=0.0, 61 | value2=1.0 62 | ): 63 | self.fn = fn 64 | self.num_init_steps = num_init_steps 65 | self.num_anneal_steps = num_anneal_steps 66 | self.num_cycles = num_cycles 67 | self.value1 = value1 68 | self.value2 = value2 69 | 70 | self._step_count = current_step 71 | 72 | def step(self, step=None): 73 | value = anneal_fn( 74 | self.fn, 75 | self._step_count, 76 | num_init_steps=self.num_init_steps, 77 | num_anneal_steps=self.num_anneal_steps, 78 | num_cycles=self.num_cycles, 79 | value1=self.value1, 80 | value2=self.value2 81 | ) 82 | 83 | if step is not None: 84 | self._step_count = step 85 | else: 86 | self._step_count += 1 87 | 88 | return value 89 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/cyclical.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | PI = 3.141592653589793 5 | INIT_STEPS = 600 6 | SCHEDULE_STEPS = 10000 7 | NUM_CYCLES = 2 8 | MIN_PERCENT = 1e-3 9 | 10 | 11 | def cyclical_fn( 12 | fn, 13 | current_step, 14 | num_init_steps=INIT_STEPS, 15 | num_cyclical_steps=SCHEDULE_STEPS, 16 | num_cycles=NUM_CYCLES, 17 | value1=0.0, 18 | value2=1.0 19 | ): 20 | if current_step < num_init_steps: 21 | return cyclical_fn( 22 | fn, 23 | current_step, 24 | num_init_steps=0, 25 | num_cyclical_steps=num_init_steps * 2, 26 | num_cycles=1, 27 | value1=value2, 28 | value2=value1 29 | ) 30 | if current_step > num_cyclical_steps: 31 | return value2 32 | 33 | if not fn or fn == "none" or fn == "constant": 34 | return value2 35 | 36 | progress = float(num_cycles * (current_step - num_init_steps)) / max(1, 
num_cyclical_steps - num_init_steps) % 1 37 | if fn == "linear": 38 | if progress < 0.5: 39 | return float(value1 + (value2 - value1) * (progress * 2)) 40 | else: 41 | return float(value2 + (value1 - value2) * (progress * 2 - 1)) 42 | elif fn == "cosine": 43 | return float(value1 + (value2 - value1) * (1 - math.cos(PI * progress * 2)) / 2) 44 | else: 45 | raise NotImplementedError 46 | 47 | 48 | class CyclicalManner: 49 | def __init__( 50 | self, 51 | fn, 52 | current_step=0, 53 | num_init_steps=INIT_STEPS, 54 | num_cyclical_steps=SCHEDULE_STEPS, 55 | num_cycles=NUM_CYCLES, 56 | value1=0.0, 57 | value2=1.0 58 | ): 59 | self.fn = fn 60 | self.num_init_steps = num_init_steps 61 | self.num_cyclical_steps = num_cyclical_steps 62 | self.num_cycles = num_cycles 63 | self.value1 = value1 64 | self.value2 = value2 65 | 66 | self._step_count = current_step 67 | 68 | def step(self, step=None): 69 | value = cyclical_fn( 70 | self.fn, 71 | self._step_count, 72 | num_init_steps=self.num_init_steps, 73 | num_cyclical_steps=self.num_cyclical_steps, 74 | num_cycles=self.num_cycles, 75 | value1=self.value1, 76 | value2=self.value2 77 | ) 78 | 79 | if step is not None: 80 | self._step_count = step 81 | else: 82 | self._step_count += 1 83 | 84 | return value 85 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/dl.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import igraph as ig 3 | import json 4 | import math 5 | import numpy as np 6 | import torch as th 7 | import torch.nn as nn 8 | import os 9 | from argparse import Namespace 10 | from collections import OrderedDict, namedtuple 11 | from multiprocessing import Pool 12 | from tqdm import tqdm 13 | 14 | from .act import * 15 | 16 | 17 | def segment_data(data, max_len, pre_pad=False): 18 | pad_len = max_len - data.size(1) % max_len 19 | if pad_len != max_len: 20 | pad_size = list(data.size()) 21 | pad_size[1] = pad_len 22 | zero_pad = th.zeros(pad_size, device=data.device, dtype=data.dtype, requires_grad=False) 23 | if pre_pad: 24 | data = th.cat([zero_pad, data], dim=1) 25 | else: 26 | data = th.cat([data, zero_pad], dim=1) 27 | return th.split(data, max_len, dim=1) 28 | 29 | 30 | def segment_length(data_len, max_len): 31 | bsz = data_len.size(0) 32 | list_len = math.ceil(data_len.max().float() / max_len) 33 | segment_lens = th.arange( 34 | 0, max_len * list_len, max_len, dtype=data_len.dtype, device=data_len.device, requires_grad=False 35 | ).view(1, list_len) 36 | diff = data_len.view(-1, 1) - segment_lens 37 | fill_max = diff > max_len 38 | fill_zero = diff < 0 39 | segment_lens = diff.masked_fill(fill_max, max_len) 40 | segment_lens.masked_fill_(fill_zero, 0) 41 | return th.split(segment_lens.view(bsz, -1), 1, dim=1) 42 | 43 | 44 | def split_ids(x_ids): 45 | if x_ids[0] == x_ids[-1]: 46 | return th.LongTensor([x_ids.size(0)]).to(x_ids.device) 47 | diff = th.roll(x_ids, -1, 0) - x_ids 48 | return th.masked_select(th.arange(1, x_ids.size(0) + 1, device=x_ids.device), diff.bool()) 49 | 50 | 51 | def split_and_batchify_graph_feats(batched_graph_feats, graph_sizes, pre_pad=False): 52 | bsz = graph_sizes.size(0) 53 | dtype, device = batched_graph_feats.dtype, batched_graph_feats.device 54 | 55 | min_size, max_size = graph_sizes.min(), graph_sizes.max() 56 | if min_size == max_size: 57 | feats = batched_graph_feats.view(bsz, max_size, -1) 58 | mask = th.ones((bsz, max_size), dtype=th.bool, device=device) 59 | return feats, mask 60 | else: 
61 | feats = [] 62 | mask = th.zeros((bsz, max_size), dtype=th.bool, device=device, requires_grad=False) 63 | 64 | graph_sizes_list = graph_sizes.view(-1).tolist() 65 | idx = 0 66 | if pre_pad: 67 | for i, l in enumerate(graph_sizes_list): 68 | if l < max_size: 69 | feats.append(th.zeros((max_size - l, ) + batched_graph_feats.size()[1:], dtype=dtype, device=device)) 70 | feats.append(batched_graph_feats[idx:idx + l]) 71 | mask[i, -l:].fill_(1) 72 | idx += l 73 | else: 74 | for i, l in enumerate(graph_sizes_list): 75 | feats.append(batched_graph_feats[idx:idx + l]) 76 | if l < max_size: 77 | feats.append(th.zeros((max_size - l, ) + batched_graph_feats.size()[1:], dtype=dtype, device=device)) 78 | mask[i, :l].fill_(1) 79 | idx += l 80 | feats = th.cat(feats, 0).view(bsz, max_size, -1) 81 | return feats, mask 82 | 83 | 84 | def batch_convert_list_to_tensor(batch_list, max_seq_len=-1, pad_id=0, pre_pad=False): 85 | batch_tensor = [th.tensor(v) for v in batch_list] 86 | return batch_convert_tensor_to_tensor(batch_tensor, max_seq_len=max_seq_len, pad_id=pad_id, pre_pad=pre_pad) 87 | 88 | 89 | def batch_convert_tensor_to_tensor(batch_tensor, max_seq_len=-1, pad_id=0, pre_pad=False): 90 | batch_lens = [len(v) for v in batch_tensor] 91 | if max_seq_len == -1: 92 | max_seq_len = max(batch_lens) 93 | 94 | result = th.empty( 95 | [len(batch_tensor), max_seq_len] + list(batch_tensor[0].size())[1:], 96 | dtype=batch_tensor[0].dtype, 97 | device=batch_tensor[0].device 98 | ).fill_(pad_id) 99 | for i, t in enumerate(batch_tensor): 100 | len_t = batch_lens[i] 101 | if len_t < max_seq_len: 102 | if pre_pad: 103 | result[i, -len_t:].data.copy_(t) 104 | else: 105 | result[i, :len_t].data.copy_(t) 106 | elif len_t == max_seq_len: 107 | result[i].data.copy_(t) 108 | else: 109 | result[i].data.copy_(t[:max_seq_len]) 110 | return result 111 | 112 | 113 | def batch_convert_len_to_mask(batch_lens, max_seq_len=-1, pre_pad=False): 114 | if max_seq_len == -1: 115 | max_seq_len = max(batch_lens) 116 | mask = th.ones( 117 | (len(batch_lens), max_seq_len), 118 | dtype=th.bool, 119 | device=batch_lens[0].device if isinstance(batch_lens[0], th.Tensor) else th.device("cpu") 120 | ) 121 | if pre_pad: 122 | for i, l in enumerate(batch_lens): 123 | mask[i, :-l].fill_(0) 124 | else: 125 | for i, l in enumerate(batch_lens): 126 | mask[i, l:].fill_(0) 127 | return mask 128 | 129 | 130 | def batch_convert_mask_to_start_and_end(mask): 131 | cumsum = mask.cumsum(dim=-1) * 2 132 | start_indices = cumsum.masked_fill(mask == 0, mask.size(-1)).min(dim=-1)[1] 133 | end_indices = cumsum.max(dim=-1)[1] 134 | 135 | return start_indices, end_indices 136 | 137 | 138 | def convert_dgl_graph_to_edgeseq(graph, x_emb, x_len, e_emb): 139 | uid, vid, eid = graph.all_edges(form="all", order="srcdst") 140 | e = e_emb[eid] 141 | if x_emb is not None: 142 | u, v = x_emb[uid], x_emb[vid] 143 | e = th.cat([u, v, e], dim=1) 144 | e_len = th.tensor(graph.batch_num_edges, dtype=x_len.dtype, device=x_len.device).view(x_len.size()) 145 | return e, e_len 146 | 147 | 148 | def mask_seq_by_len(x, len_x): 149 | x_size = list(x.size()) 150 | if x_size[1] == len_x.max(): 151 | mask = batch_convert_len_to_mask(len_x) 152 | mask_size = x_size[0:2] + [1] * (len(x_size) - 2) 153 | x = x * mask.view(*mask_size) 154 | return x 155 | 156 | 157 | def expand_dimensions(old_module, new_module, pre_pad=True): 158 | with th.no_grad(): 159 | # if type(old_module) != type(new_module): 160 | # raise ValueError("Error: the two input should have the same type.") 161 | if 
isinstance(old_module, th.Tensor) or isinstance(old_module, nn.Parameter): 162 | nn.init.zeros_(new_module) 163 | old_size = old_module.size() 164 | if pre_pad: 165 | if len(old_size) == 1: 166 | new_module.data[-old_size[0]:].copy_(old_module) 167 | elif len(old_size) == 2: 168 | new_module.data[-old_size[0]:, -old_size[1]:].copy_(old_module) 169 | elif len(old_size) == 3: 170 | new_module.data[-old_size[0]:, -old_size[1]:, -old_size[2]:].copy_(old_module) 171 | elif len(old_size) == 4: 172 | new_module.data[-old_size[0]:, -old_size[1]:, -old_size[2]:, -old_size[3]:].copy_(old_module) 173 | else: 174 | raise NotImplementedError 175 | else: 176 | if len(old_size) == 1: 177 | new_module.data[:old_size[0]].copy_(old_module) 178 | elif len(old_size) == 2: 179 | new_module.data[:old_size[0], :old_size[1]].copy_(old_module) 180 | elif len(old_size) == 3: 181 | new_module.data[:old_size[0], :old_size[1], :old_size[2]].copy_(old_module) 182 | elif len(old_size) == 4: 183 | new_module.data[:old_size[0], :old_size[1], :old_size[2], :old_size[3]].copy_(old_module) 184 | else: 185 | raise NotImplementedError 186 | return 187 | 188 | old_param_d = dict(old_module.named_parameters()) 189 | for name, params in new_module.named_parameters(): 190 | if name in old_param_d: 191 | expand_dimensions(old_param_d[name], params, pre_pad) 192 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/init.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import igraph as ig 3 | import json 4 | import math 5 | import numpy as np 6 | import torch as th 7 | import torch.nn as nn 8 | import os 9 | from argparse import Namespace 10 | from collections import OrderedDict, namedtuple 11 | from multiprocessing import Pool 12 | from tqdm import tqdm 13 | 14 | from .act import * 15 | 16 | 17 | def calculate_gain(activation): 18 | if isinstance(activation, str): 19 | if activation in ["none", "maximum", "minimum"]: 20 | nonlinearity = "linear" 21 | elif activation in ["relu", "relu6", "elu", "selu", "celu", "gelu"]: 22 | nonlinearity = "relu" 23 | elif activation in ["leaky_relu", "prelu"]: 24 | nonlinearity = "leaky_relu" 25 | elif activation in ["softmax", "sparsemax", "gumbel_softmax"]: 26 | nonlinearity = "sigmoid" 27 | elif activation in ["sigmoid", "tanh"]: 28 | nonlinearity = activation 29 | else: 30 | raise NotImplementedError 31 | elif isinstance(activation, nn.Module): 32 | if isinstance(activation, (Identity, Maximum, Minimum)): 33 | nonlinearity = "linear" 34 | elif isinstance(activation, (ReLU, ReLU6, ELU, SELU, CELU, GELU)): 35 | nonlinearity = "relu" 36 | elif isinstance(activation, (LeakyReLU, PReLU)): 37 | nonlinearity = "leaky_relu" 38 | elif isinstance(activation, (Softmax, Sparsemax, GumbelSoftmax)): 39 | nonlinearity = "sigmoid" 40 | elif isinstance(activation, Sigmoid): 41 | nonlinearity = "sigmoid" 42 | elif isinstance(activation, Tanh): 43 | nonlinearity = "tanh" 44 | else: 45 | raise NotImplementedError 46 | else: 47 | raise ValueError 48 | 49 | return nn.init.calculate_gain(nonlinearity, LEAKY_RELU_A) 50 | 51 | 52 | def calculate_fan_in_and_fan_out(x): 53 | if x.dim() < 2: 54 | x = x.unsqueeze(-1) 55 | num_input_fmaps = x.size(1) 56 | num_output_fmaps = x.size(0) 57 | receptive_field_size = 1 58 | if x.dim() > 2: 59 | receptive_field_size = x[0][0].numel() 60 | fan_in = num_input_fmaps * receptive_field_size 61 | fan_out = num_output_fmaps * receptive_field_size 62 | 63 | return fan_in, 
fan_out 64 | 65 | 66 | def zero_init(x, gain=1.0): 67 | return nn.init.zeros_(x) 68 | 69 | 70 | def xavier_uniform_init(x, gain=1.0): 71 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 72 | std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) 73 | a = 1.7320508075688772 * std 74 | 75 | return nn.init.uniform_(x, -a, a) 76 | 77 | 78 | def kaiming_normal_init(x, gain=1.0): 79 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 80 | std = gain / math.sqrt(fan_in) 81 | return nn.init.normal_(x, 0, std) 82 | 83 | 84 | def orthogonal_init(x, gain=1.0): 85 | return nn.init.orthogonal_(x, gain=1.0) 86 | 87 | 88 | def equivariant_init(x, gain=1.0): 89 | with th.no_grad(): 90 | x_size = tuple(x.size()) 91 | if len(x_size) == 1: 92 | kaiming_normal_init(x, gain=gain) 93 | elif len(x_size) == 2: 94 | kaiming_normal_init(x[0], gain=gain) 95 | vec = x[0] 96 | for i in range(1, x.size(0)): 97 | x[i].data.copy_(th.roll(vec, i, 0)) 98 | else: 99 | x = x.view(x_size[:-2] + (-1, )) 100 | equivariant_init(x, gain=gain) 101 | x = x.view(x_size) 102 | return x 103 | 104 | 105 | def identity_init(x, gain=1.0): 106 | with th.no_grad(): 107 | x_size = tuple(x.size()) 108 | if len(x_size) == 1: 109 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 110 | std = gain * (2.0 / float(fan_in + fan_out)) 111 | nn.init.ones_(x) 112 | x += th.randn_like(x) * std**2 113 | elif len(x_size) == 2: 114 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 115 | std = gain * (2.0 / float(fan_in + fan_out)) 116 | nn.init.eye_(x) 117 | x += th.randn_like(x) * std**2 118 | else: 119 | x = x.view(x_size[:-2] + (-1, )) 120 | identity_init(x, gain=gain) 121 | x = x.view(x_size) 122 | return x 123 | 124 | 125 | def init_weight(x, activation="none", init="uniform"): 126 | gain = calculate_gain(activation) 127 | if init == "zero": 128 | init_func = zero_init 129 | elif init == "identity": 130 | init_func = identity_init 131 | elif init == "uniform": 132 | init_func = xavier_uniform_init 133 | elif init == "normal": 134 | init_func = kaiming_normal_init 135 | elif init == "orthogonal": 136 | init_func = orthogonal_init 137 | elif init == "equivariant": 138 | init_func = equivariant_init 139 | else: 140 | raise ValueError("init=%s is not supported now." % (init)) 141 | 142 | if isinstance(x, th.Tensor): 143 | init_func(x, gain=gain) 144 | 145 | 146 | def init_module(x, activation="none", init="uniform"): 147 | gain = calculate_gain(activation) 148 | if init == "zero": 149 | init_func = zero_init 150 | elif init == "identity": 151 | init_func = identity_init 152 | elif init == "uniform": 153 | init_func = xavier_uniform_init 154 | elif init == "normal": 155 | init_func = kaiming_normal_init 156 | elif init == "orthogonal": 157 | init_func = orthogonal_init 158 | elif init == "equivariant": 159 | init_func = equivariant_init 160 | else: 161 | raise ValueError("init=%s is not supported now." 
% (init)) 162 | 163 | if isinstance(x, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): 164 | init_func(x.weight, gain=gain) 165 | if hasattr(x, "bias") and x.bias is not None: 166 | nn.init.zeros_(x.bias) 167 | elif isinstance(x, nn.Embedding): 168 | with th.no_grad(): 169 | if init == "uniform": 170 | nn.init.uniform_(x.weight, -1.0, 1.0) 171 | elif init == "normal": 172 | nn.init.normal_(x.weight, 0.0, 1.0) 173 | elif init == "orthogonal": 174 | nn.init.orthogonal_(x.weight, gain=math.sqrt(calculate_fan_in_and_fan_out(x.weight)[0]) * 1.0) 175 | elif init == "identity": 176 | nn.init.eye_(x.weight) 177 | elif init == "equivariant": 178 | nn.init.normal_(x.weight[0], 0.0, 1.0) 179 | vec = x.weight[0] 180 | for i in range(1, x.weight.size(0)): 181 | x.weight[i].data.copy_(th.roll(vec, i, 0)) 182 | if x.padding_idx is not None: 183 | x.weight[x.padding_idx].fill_(0) 184 | elif isinstance(x, nn.RNNBase): 185 | for layer_weights in x._all_weights: 186 | for w in layer_weights: 187 | if "weight" in w: 188 | init_func(getattr(x, w)) 189 | elif "bias" in w: 190 | nn.init.zeros_(getattr(x, w)) 191 | elif isinstance(x, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.LayerNorm, nn.GroupNorm)): 192 | nn.init.ones_(x.weight) 193 | nn.init.zeros_(x.bias) 194 | 195 | 196 | def change_dropout_rate(model, dropout): 197 | for name, child in model.named_children(): 198 | if isinstance(child, nn.Dropout): 199 | child.p = dropout 200 | change_dropout_rate(child, dropout) 201 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/io.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import igraph as ig 3 | import json 4 | import math 5 | import numpy as np 6 | import torch as th 7 | import torch.nn as nn 8 | import os 9 | from argparse import Namespace 10 | from collections import OrderedDict, namedtuple 11 | from multiprocessing import Pool 12 | from tqdm import tqdm 13 | 14 | from .act import * 15 | 16 | csv.field_size_limit(500 * 1024 * 1024) 17 | 18 | 19 | def get_subdirs(dirpath, leaf_only=True): 20 | subdirs = list() 21 | is_leaf = True 22 | for filename in os.listdir(dirpath): 23 | filename = os.path.join(dirpath, filename) 24 | if os.path.isdir(filename): 25 | is_leaf = False 26 | subdirs.extend(get_subdirs(filename, leaf_only=leaf_only)) 27 | if not leaf_only or is_leaf: 28 | subdirs.append(dirpath) 29 | return subdirs 30 | 31 | 32 | def get_files(dirpath): 33 | files = list() 34 | for filename in os.listdir(dirpath): 35 | filename = os.path.join(dirpath, filename) 36 | if os.path.isdir(filename): 37 | files.extend(get_files(filename)) 38 | else: 39 | files.append(filename) 40 | return files 41 | 42 | 43 | def _read_graphs_from_dir(dirpath): 44 | graphs = dict() 45 | for filename in os.listdir(dirpath): 46 | if not os.path.isdir(os.path.join(dirpath, filename)): 47 | names = os.path.splitext(os.path.basename(filename)) 48 | if names[1] != ".gml": 49 | continue 50 | try: 51 | graph = ig.read(os.path.join(dirpath, filename)) 52 | graph.vs["id"] = list(map(int, graph.vs["id"])) 53 | graph.vs["label"] = list(map(int, graph.vs["label"])) 54 | graph.es["label"] = list(map(int, graph.es["label"])) 55 | graph.es["key"] = list(map(int, graph.es["key"])) 56 | graphs[names[0]] = graph 57 | except BaseException as e: 58 | print(e) 59 | break 60 | return graphs 61 | 62 | 63 | def read_graphs_from_dir(dirpath, num_workers=4): 64 | graphs = dict() 65 | subdirs = get_subdirs(dirpath) 66 | with 
Pool(num_workers if num_workers > 0 else os.cpu_count()) as pool: 67 | results = list() 68 | for subdir in subdirs: 69 | results.append((subdir, pool.apply_async(_read_graphs_from_dir, args=(subdir, )))) 70 | pool.close() 71 | 72 | for subdir, x in tqdm(results): 73 | x = x.get() 74 | graphs[os.path.basename(subdir)] = x 75 | dirpath = os.path.basename(dirpath) 76 | if dirpath in graphs and (dirpath == "graphs" or "G_" not in dirpath): 77 | graphs.update(graphs.pop(dirpath)) 78 | return graphs 79 | 80 | 81 | def read_patterns_from_dir(dirpath, num_workers=4): 82 | patterns = dict() 83 | subdirs = get_subdirs(dirpath) 84 | with Pool(num_workers if num_workers > 0 else os.cpu_count()) as pool: 85 | results = list() 86 | for subdir in subdirs: 87 | results.append((subdir, pool.apply_async(_read_graphs_from_dir, args=(subdir, )))) 88 | pool.close() 89 | 90 | for subdir, x in tqdm(results): 91 | x = x.get() 92 | patterns.update(x) 93 | dirpath = os.path.basename(dirpath) 94 | if dirpath in patterns and (dirpath == "patterns" or "P_" not in dirpath): 95 | patterns.update(patterns.pop(dirpath)) 96 | return patterns 97 | 98 | 99 | def _read_metadata_from_csv(csv_file): 100 | meta = dict() 101 | try: 102 | with open(csv_file, "r", newline="") as f: 103 | csv_reader = csv.reader(f, delimiter=",") 104 | header = next(csv_reader) 105 | gid_idx = header.index("g_id") 106 | cnt_idx = header.index("counts") 107 | iso_idx = header.index("subisomorphisms") 108 | for row in csv_reader: 109 | meta[row[gid_idx]] = { 110 | "counts": int(row[cnt_idx]), 111 | "subisomorphisms": np.asarray(eval(row[iso_idx]), dtype=np.int64) 112 | } 113 | except BaseException as e: 114 | print(csv_file, e) 115 | return meta 116 | 117 | 118 | def read_metadata_from_dir(dirpath, num_workers=4): 119 | meta = dict() 120 | files = get_files(dirpath) 121 | with Pool(num_workers if num_workers > 0 else os.cpu_count()) as pool: 122 | results = list() 123 | for filename in files: 124 | if filename.endswith(".csv"): 125 | results.append( 126 | ( 127 | os.path.splitext(os.path.basename(filename))[0], 128 | pool.apply_async(_read_metadata_from_csv, args=(filename, )) 129 | ) 130 | ) 131 | pool.close() 132 | 133 | for p_id, x in tqdm(results): 134 | x = x.get() 135 | if p_id not in meta: 136 | meta[p_id] = x 137 | else: 138 | meta[p_id].update(x) 139 | dirpath = os.path.basename(dirpath) 140 | if dirpath in meta and dirpath == "metadata": 141 | meta.update(meta.pop(dirpath)) 142 | return meta 143 | 144 | 145 | def load_data(pattern_dir, graph_dir, metadata_dir, num_workers=4): 146 | patterns = read_patterns_from_dir(pattern_dir, num_workers=num_workers) 147 | graphs = read_graphs_from_dir(graph_dir, num_workers=num_workers) 148 | meta = read_metadata_from_dir(metadata_dir, num_workers=num_workers) 149 | 150 | if os.path.exists(os.path.join(metadata_dir, "train.txt")): 151 | train_indices = set([int(x) for x in open(os.path.join(metadata_dir, "train.txt"))]) 152 | else: 153 | train_indices = None 154 | if os.path.exists(os.path.join(metadata_dir, "dev.txt")): 155 | dev_indices = set([int(x) for x in open(os.path.join(metadata_dir, "dev.txt"))]) 156 | else: 157 | dev_indices = None 158 | if os.path.exists(os.path.join(metadata_dir, "test.txt")): 159 | test_indices = set([int(x) for x in open(os.path.join(metadata_dir, "test.txt"))]) 160 | else: 161 | test_indices = None 162 | 163 | train_data, dev_data, test_data = list(), list(), list() 164 | shared_graph = True 165 | for p, pattern in patterns.items(): 166 | # each pattern corresponds to 
specific graphs 167 | if p in graphs: 168 | shared_graph = False 169 | for g, graph in graphs[p].items(): 170 | x = dict() 171 | x["id"] = ("%s-%s" % (p, g)) 172 | x["pattern"] = pattern 173 | x["graph"] = graph 174 | x["subisomorphisms"] = meta[p][g]["subisomorphisms"] 175 | x["counts"] = meta[p][g]["counts"] 176 | 177 | g_idx = int(g.rsplit("_", 1)[-1]) 178 | if train_indices is not None: 179 | if g_idx in train_indices: 180 | train_data.append(x) 181 | elif g_idx % 10 > 1: 182 | train_data.append(x) 183 | if dev_indices is not None: 184 | if g_idx in dev_indices: 185 | dev_data.append(x) 186 | elif g_idx % 10 == 0: 187 | dev_data.append(x) 188 | if test_indices is not None: 189 | if g_idx in test_indices: 190 | test_data.append(x) 191 | elif g_idx % 10 == 1: 192 | test_data.append(x) 193 | # patterns share graphs 194 | else: 195 | for g, graph in graphs.items(): 196 | x = dict() 197 | x["id"] = ("%s-%s" % (p, g)) 198 | x["pattern"] = pattern 199 | x["graph"] = graph 200 | x["subisomorphisms"] = meta[p][g]["subisomorphisms"] 201 | x["counts"] = meta[p][g]["counts"] 202 | 203 | g_idx = int(g.rsplit("_", 1)[-1]) 204 | if train_indices is not None: 205 | if g_idx in train_indices: 206 | train_data.append(x) 207 | elif g_idx % 3 > 1: 208 | train_data.append(x) 209 | if dev_indices is not None: 210 | if g_idx in dev_indices: 211 | dev_data.append(x) 212 | elif g_idx % 3 == 0: 213 | dev_data.append(x) 214 | if test_indices is not None: 215 | if g_idx in test_indices: 216 | test_data.append(x) 217 | elif g_idx % 3 == 1: 218 | test_data.append(x) 219 | 220 | return OrderedDict({"train": train_data, "dev": dev_data, "test": test_data}), shared_graph 221 | 222 | 223 | def str2value(x): 224 | try: 225 | return eval(x) 226 | except: 227 | return x 228 | 229 | 230 | def str2bool(x): 231 | x = x.lower() 232 | return x == "true" or x == "yes" 233 | 234 | 235 | def str2list(x): 236 | results = [] 237 | for x in x.split(","): 238 | x = x.strip() 239 | if x == "" or x == "null": 240 | continue 241 | try: 242 | x = str2value(x) 243 | except: 244 | pass 245 | results.append(x) 246 | return results 247 | 248 | 249 | def load_config(path, as_dict=True): 250 | with open(path, "r") as f: 251 | config = json.load(f) 252 | if not as_dict: 253 | config = namedtuple("config", config.keys())(*config.values()) 254 | return config 255 | 256 | 257 | def save_config(config, path): 258 | if isinstance(config, dict): 259 | pass 260 | elif isinstance(config, Namespace): 261 | config = vars(config) 262 | else: 263 | try: 264 | config = config._as_dict() 265 | except BaseException as e: 266 | raise e 267 | 268 | with open(path, "w") as f: 269 | json.dump(config, f) 270 | 271 | 272 | class TensorEncoder(json.JSONEncoder): 273 | def default(self, obj): 274 | if isinstance(obj, np.integer): 275 | return int(obj) 276 | elif isinstance(obj, np.floating): 277 | return float(obj) 278 | elif isinstance(obj, (np.ndarray, th.Tensor)): 279 | return obj.tolist() 280 | else: 281 | return super(TensorEncoder, self).default(obj) 282 | 283 | 284 | def load_results(path): 285 | with open(path, "w") as f: 286 | results = json.load(f) 287 | return results 288 | 289 | 290 | def save_results(results, path): 291 | with open(path, "w") as f: 292 | json.dump(results, f, cls=TensorEncoder) 293 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | 6 
| def init_logger(log_file=None, log_tag="GOOD LUCK"): 7 | logger = logging.getLogger() 8 | logger.setLevel(logging.INFO) 9 | 10 | log_format = logging.Formatter("[%(asctime)s {log_tag}] %(message)s".format(log_tag=log_tag)) 11 | console_handler = logging.StreamHandler() 12 | console_handler.setFormatter(log_format) 13 | logger.addHandler(console_handler) 14 | 15 | if log_file: 16 | os.makedirs(os.path.dirname(log_file), exist_ok=True) 17 | file_handler = logging.FileHandler(log_file) 18 | file_handler.setFormatter(log_format) 19 | logger.addHandler(file_handler) 20 | 21 | setattr(logger, "prefix_len", len(logger.handlers[0].formatter._fmt)) 22 | 23 | return logger 24 | 25 | 26 | def close_logger(logger): 27 | handlers = logger.handlers[:] 28 | for handler in handlers: 29 | handler.close() 30 | logger.removeHandler(handler) 31 | 32 | 33 | def generate_log_line(data_type, epoch=-1, total_epochs=-1, step=-1, total_steps=-1, **kw): 34 | line = ["data_type: {:<10s}".format(data_type)] 35 | if epoch != -1 and total_epochs != -1: 36 | line.append("epoch: {:0>5d}/{:0>5d}".format(epoch, total_epochs)) 37 | if step != -1 and total_steps != -1: 38 | line.append("step: {:0>5d}/{:0>5d}".format(step, total_steps)) 39 | for k, v in kw.items(): 40 | if isinstance(v, float): 41 | line.append("{}: {:8>5.3f}".format(k, v)) 42 | elif isinstance(v, int): 43 | line.append("{}: {:0>3d}".format(k, v)) 44 | else: 45 | line.append("{}: {}".format(k, v)) 46 | line = "\t".join(line) 47 | return line 48 | 49 | 50 | def generate_best_line(data_type, epoch, total_epochs, **kw): 51 | line = \ 52 | ["data_type: " + str(data_type)] + \ 53 | ["best %s: %s" % (str(k), str(v)) for k, v in kw.items()] + \ 54 | ["(epoch: %d/%d)" % (epoch, total_epochs)] 55 | line = "\t".join(line) 56 | return line 57 | 58 | 59 | def get_best_epochs(log_file): 60 | # 0: data type 61 | # 1: metric name 62 | # 2: metric value 63 | # 3: epoch 64 | regex = re.compile( 65 | r"data_type:\s+(\w+)\s+best\s+([a-zA-Z0-9\.\-\+\_]+):\s+([a-zA-Z0-9\.\-\+\_]+)\s+\(epoch:\s+(\d+)/\d+\)" 66 | ) 67 | best_epochs = dict() 68 | # get the best epoch 69 | with open(log_file, "r") as f: 70 | for line in f: 71 | matched_results = regex.findall(line) 72 | for matched_result in matched_results: 73 | if matched_result[1] not in best_epochs: 74 | best_epochs[matched_result[1]] = dict() 75 | best_epochs[matched_result[1]][matched_result[0]] = (int(matched_result[3]), float(matched_result[2])) 76 | return best_epochs 77 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/sampler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch as th 5 | 6 | from collections import OrderedDict 7 | from torch.utils.data import Sampler 8 | 9 | 10 | class BucketSampler(Sampler): 11 | def __init__( 12 | self, 13 | dataset, 14 | group_by, 15 | batch_size, 16 | shuffle=False, 17 | seed=0, 18 | drop_last=False 19 | ): 20 | super(BucketSampler, self).__init__(dataset) 21 | self.dataset = dataset 22 | if isinstance(group_by, str): 23 | group_by = [group_by] 24 | self.group_by = group_by 25 | self.cache = OrderedDict() 26 | for attr in group_by: 27 | self.cache[attr] = th.tensor([x[attr] for x in dataset], dtype=th.float32) 28 | self.batch_size = batch_size 29 | self.seed = seed 30 | self.shuffle = shuffle 31 | self.drop_last = drop_last 32 | self.epoch = 0 33 | 34 | if drop_last: 35 | self.num_samples = math.ceil((len(self.dataset) 
- self.batch_size) / self.batch_size) * self.batch_size 36 | else: 37 | self.num_samples = math.ceil(len(self.dataset) / self.batch_size) * self.batch_size 38 | self.total_size = self.num_samples 39 | 40 | def __iter__(self): 41 | rng = th.Generator() 42 | rng.manual_seed(self.seed + self.epoch) 43 | array = th.stack(list(self.cache.values()), dim=-1) 44 | 45 | if not self.drop_last: 46 | ind = th.arange(len(self.dataset)) 47 | padding_size = self.total_size - len(self.dataset) 48 | while padding_size > len(array): 49 | ind = th.cat([ind, ind], dim=0) 50 | padding_size -= len(array) 51 | if padding_size > 0: 52 | ind = th.cat([ind, th.randperm(len(self.dataset))[:padding_size]], dim=0) 53 | array = array[ind] 54 | else: 55 | ind = th.arange(self.total_size) 56 | array = array[:self.total_size] 57 | assert len(array) == self.total_size 58 | 59 | rand = th.rand((self.total_size, 1), generator=rng) 60 | array = th.cat([array, rand], dim=-1) 61 | array = array.numpy().view(list(zip(list(self.cache.keys()) + ["rand"], 62 | [np.float32] * (len(self.cache) + 1)))).flatten() 63 | indices = np.argsort(array, axis=0, order=self.group_by) 64 | batches = [indices[i:i + self.batch_size] for i in range(0, len(indices), self.batch_size)] 65 | 66 | if self.shuffle: 67 | indices = th.randperm(len(batches), generator=rng) 68 | batches = batches[indices] 69 | 70 | batch_idx = 0 71 | while batch_idx < len(batches): 72 | yield ind[batches[batch_idx]] 73 | batch_idx += 1 74 | 75 | def __len__(self): 76 | return self.num_samples // self.batch_size 77 | 78 | def set_epoch(self, epoch=-1): 79 | if epoch == -1: 80 | self.epoch += 1 81 | else: 82 | self.epoch = epoch 83 | 84 | 85 | class CircurriculumSampler(BucketSampler): 86 | def __init__( 87 | self, 88 | dataset, 89 | learning_by, 90 | used_ratio, 91 | batch_size, 92 | group_by=None, 93 | shuffle=False, 94 | seed=0, 95 | drop_last=False 96 | ): 97 | if isinstance(learning_by, str): 98 | learning_by = [learning_by] 99 | if isinstance(group_by, str): 100 | group_by = [group_by] 101 | elif group_by is None: 102 | group_by = learning_by 103 | super(CircurriculumSampler, self).__init__(dataset, group_by, batch_size, shuffle, seed, drop_last) 104 | self.learning_by = learning_by 105 | for attr in learning_by: 106 | if attr not in self.cache: 107 | self.cache[attr] = th.tensor([x[attr] for x in dataset], dtype=th.float32) 108 | 109 | self.used_ratio = used_ratio 110 | 111 | def __iter__(self): 112 | rng = th.Generator() 113 | rng.manual_seed(self.seed + self.epoch) 114 | array = th.stack(list(self.cache.values()), dim=-1) 115 | 116 | if not self.drop_last: 117 | ind = th.arange(len(self.dataset)) 118 | padding_size = self.total_size - len(self.dataset) 119 | while padding_size > len(array): 120 | ind = th.cat([ind, ind], dim=0) 121 | padding_size -= len(array) 122 | if padding_size > 0: 123 | ind = th.cat([ind, th.randperm(len(self.dataset))[:padding_size]], dim=0) 124 | array = array[ind] 125 | else: 126 | ind = th.arange(self.total_size) 127 | array = array[:self.total_size] 128 | assert len(array) == self.total_size 129 | 130 | rand = th.rand((self.total_size, 1), generator=rng) 131 | array = th.cat([array, rand], dim=-1) 132 | array = array.numpy().view(list(zip(list(self.cache.keys()) + ["rand"], 133 | [np.float32] * (len(self.cache) + 1)))).flatten() 134 | 135 | if self.learning_by == self.group_by or self.learning_by == self.group_by[:len(self.learning_by)]: 136 | group_indices = np.argsort(array, axis=0, order=self.group_by) 137 | indices = 
group_indices[:math.ceil(self.used_ratio * len(group_indices))] 138 | else: 139 | learn_indices = np.argsort(array, axis=0, order=self.learning_by) 140 | learn_indices = learn_indices[:int(self.used_ratio * len(learn_indices))] 141 | indices = np.argsort(array[learn_indices], axis=0, order=self.group_by) 142 | 143 | batches = [indices[i:i + self.batch_size] for i in range(0, len(indices), self.batch_size)] 144 | 145 | if self.shuffle: 146 | batches = [batches[i] for i in th.randperm(len(batches), generator=rng).tolist()] 147 | 148 | batch_idx = 0 149 | while batch_idx < len(batches): 150 | yield ind[batches[batch_idx]] 151 | batch_idx += 1 152 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch.optim.lr_scheduler import LambdaLR 3 | 4 | 5 | PI = 3.141592653589793 6 | INIT_STEPS = 600 7 | SCHEDULE_STEPS = 10000 8 | NUM_CYCLES = 2 9 | MIN_PERCENT = 1e-3 10 | 11 | 12 | class ConstantScheduler(LambdaLR): 13 | def __init__(self): 14 | pass 15 | 16 | def set_optimizer(self, optimizer): 17 | super(ConstantScheduler, self).__init__(optimizer, self.lr_lambda) 18 | 19 | def lr_lambda(self, current_step): 20 | return 1.0 21 | 22 | 23 | class ConstantWarmupScheduler(LambdaLR): 24 | def __init__( 25 | self, 26 | num_warmup_steps=INIT_STEPS 27 | ): 28 | self.num_warmup_steps = num_warmup_steps 29 | 30 | def set_optimizer(self, optimizer): 31 | super(ConstantWarmupScheduler, self).__init__(optimizer, self.lr_lambda) 32 | 33 | def lr_lambda(self, current_step): 34 | if current_step < self.num_warmup_steps: 35 | return float(current_step) / max(1.0, float(self.num_warmup_steps)) 36 | return 1.0 37 | 38 | 39 | class LinearScheduler(LambdaLR): 40 | def __init__( 41 | self, 42 | num_schedule_steps=SCHEDULE_STEPS, 43 | min_percent=MIN_PERCENT 44 | ): 45 | self.num_schedule_steps = num_schedule_steps 46 | self.min_percent = min_percent 47 | 48 | def set_optimizer(self, optimizer): 49 | super(LinearScheduler, self).__init__(optimizer, self.lr_lambda) 50 | 51 | def lr_lambda(self, current_step): 52 | return max( 53 | self.min_percent, 54 | float(self.num_schedule_steps - current_step) / \ 55 | float(max(1, self.num_schedule_steps)) 56 | ) 57 | 58 | 59 | class LinearWarmupScheduler(LambdaLR): 60 | def __init__( 61 | self, 62 | num_warmup_steps=INIT_STEPS, 63 | num_schedule_steps=SCHEDULE_STEPS, 64 | min_percent=MIN_PERCENT 65 | ): 66 | self.num_warmup_steps = num_warmup_steps 67 | self.num_schedule_steps = num_schedule_steps 68 | self.min_percent = min_percent 69 | 70 | def set_optimizer(self, optimizer): 71 | super(LinearWarmupScheduler, self).__init__(optimizer, self.lr_lambda) 72 | 73 | def lr_lambda(self, current_step): 74 | if current_step < self.num_warmup_steps: 75 | return float(current_step) / float(max(1, self.num_warmup_steps)) 76 | return max( 77 | self.min_percent, 78 | float(self.num_schedule_steps - current_step) / \ 79 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 80 | ) 81 | 82 | 83 | class LinearWarmupRestartScheduler(LambdaLR): 84 | def __init__( 85 | self, 86 | num_warmup_steps=INIT_STEPS, 87 | num_schedule_steps=SCHEDULE_STEPS, 88 | num_cycles=NUM_CYCLES, 89 | min_percent=MIN_PERCENT 90 | ): 91 | self.num_warmup_steps = num_warmup_steps 92 | self.num_schedule_steps = num_schedule_steps 93 | self.num_cycles = num_cycles 94 | self.min_percent = min_percent 95 | 96 | def set_optimizer(self, 
optimizer): 97 | super(LinearWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 98 | 99 | def lr_lambda(self, current_step): 100 | if current_step < self.num_warmup_steps: 101 | return float(current_step) / float(max(1, self.num_warmup_steps)) 102 | progress = float(current_step - self.num_warmup_steps) / \ 103 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 104 | if progress >= 1.0: 105 | return self.min_percent 106 | return max(self.min_percent, 1 - (float(self.num_cycles) * progress) % 1.0) 107 | 108 | 109 | class CosineScheduler(LambdaLR): 110 | def __init__( 111 | self, 112 | num_schedule_steps=SCHEDULE_STEPS, 113 | num_cycles=NUM_CYCLES, 114 | min_percent=MIN_PERCENT 115 | ): 116 | self.num_schedule_steps = num_schedule_steps 117 | self.num_cycles = num_cycles 118 | self.min_percent = min_percent 119 | 120 | def set_optimizer(self, optimizer): 121 | super(CosineScheduler, self).__init__(optimizer, self.lr_lambda) 122 | 123 | def lr_lambda(self, current_step): 124 | progress = float(current_step) / float(max(1, self.num_schedule_steps)) 125 | return max(self.min_percent, 0.5 * (1.0 + math.cos(PI * float(self.num_cycles) * 2.0 * progress))) 126 | 127 | 128 | class CosineWarmupScheduler(LambdaLR): 129 | def __init__( 130 | self, 131 | num_warmup_steps=INIT_STEPS, 132 | num_schedule_steps=SCHEDULE_STEPS, 133 | num_cycles=NUM_CYCLES, 134 | min_percent=MIN_PERCENT 135 | ): 136 | self.num_warmup_steps = num_warmup_steps 137 | self.num_schedule_steps = num_schedule_steps 138 | self.num_cycles = num_cycles 139 | self.min_percent = min_percent 140 | 141 | def set_optimizer(self, optimizer): 142 | super(CosineWarmupScheduler, self).__init__(optimizer, self.lr_lambda) 143 | 144 | def lr_lambda(self, current_step): 145 | if current_step < self.num_warmup_steps: 146 | return float(current_step) / float(max(1, self.num_warmup_steps)) 147 | progress = float(current_step - self.num_warmup_steps) / \ 148 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 149 | return max(self.min_percent, 0.5 * (1.0 + math.cos(PI * float(self.num_cycles) * 2.0 * progress))) 150 | 151 | 152 | class CosineWarmupRestartScheduler(LambdaLR): 153 | def __init__( 154 | self, 155 | num_warmup_steps=INIT_STEPS, 156 | num_schedule_steps=SCHEDULE_STEPS, 157 | num_cycles=NUM_CYCLES, 158 | min_percent=MIN_PERCENT 159 | ): 160 | self.num_warmup_steps = num_warmup_steps 161 | self.num_schedule_steps = num_schedule_steps 162 | self.num_cycles = num_cycles 163 | self.min_percent = min_percent 164 | 165 | def set_optimizer(self, optimizer): 166 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 167 | 168 | def lr_lambda(self, current_step): 169 | if current_step < self.num_warmup_steps: 170 | return float(current_step) / float(max(1, self.num_warmup_steps)) 171 | progress = float(current_step - self.num_warmup_steps) / \ 172 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 173 | if progress >= 1.0: 174 | return self.min_percent 175 | return max(self.min_percent, 0.5 * (1.0 + math.cos(PI * ((float(self.num_cycles) * progress) % 1.0)))) 176 | 177 | 178 | supported_schedulers = { 179 | "constant": ConstantScheduler(), 180 | "constant_with_warmup": ConstantWarmupScheduler(), 181 | "linear": LinearScheduler(), 182 | "linear_with_warmup": LinearWarmupScheduler(), 183 | "linear_with_warmup_and_restart": LinearWarmupRestartScheduler(), 184 | "cosine": CosineScheduler(), 185 | "cosine_with_warmup": CosineWarmupScheduler(), 186 | 
"cosine_with_warmup_and_restart": CosineWarmupRestartScheduler(), 187 | } 188 | 189 | 190 | def map_scheduler_str_to_scheduler(scheduler, **kw): 191 | if scheduler not in supported_schedulers: 192 | raise NotImplementedError 193 | 194 | sdlr = supported_schedulers[scheduler] 195 | for k, v in kw.items(): 196 | if hasattr(sdlr, k): 197 | try: 198 | setattr(sdlr, k, v) 199 | except: 200 | pass 201 | return sdlr 202 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Data/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Carl Yang (yangji9181@gmail.com) provides 4 HIN benchmark datasets: ```DBLP```, ```Yelp```, ```Freebase```, and ```PubMed```. 4 | 5 | Users can retrieve them here and unzip the downloaded file to the current folder. To reproduce the experiments in AAAI"2022, please download ```PubMed``` and ```Yelp```. 6 | 7 | The statistics of each dataset are as follows. 8 | 9 | **Dataset** | #node types | #nodes | #link types | #links | #attributes | #attributed nodes | #label types | #labeled nodes 10 | --- | --- | --- | --- | --- | --- | --- | --- | --- 11 | **PubMed** | 4 | 63,109 | 10 | 244,986 | 200 | ALL | 8 | 454 12 | **Yelp** | 4 | 82,465 | 4 | 30,542,675 | N/A | N/A | 16 | 7,417 13 | 14 | Each dataset contains: 15 | - 3 data files (```node.dat```, ```link.dat```, ```label.dat```); 16 | - 2 evaluation files (```link.dat.test```, ```label.dat.test```); 17 | - 2 description files (```meta.dat```, ```info.dat```); 18 | - 1 recording file (```record.dat```). 19 | 20 | ### node.dat 21 | 22 | - In each line, there are 4 elements (```node_id```, ```node_name```, ```node_type```, ```node_attributes```) separated by ```\t```. 23 | - ```Node_name``` is in Line ```node_id```. 24 | - In ```node_name```, empty space (``` ```) is replaced by underscore (```_```). 25 | - In ```node_attributes```, attributes are separated by comma (```,```). 26 | - ```DBLP``` and ```PubMed``` contain attributes, while ```Freebase``` and ```Yelp``` do not contain attributes. 27 | 28 | ### link.dat 29 | 30 | - In each line, there are 4 elements (```node_id```, ```node_id```, ```link_type```, ```link_weight```) separated by ```\t```. 31 | - All links are directed. Each node is connected by at least one link. 32 | 33 | ### label.dat 34 | 35 | - In each line, there are 4 elements (```node_id```, ```node_name```, ```node_type```, ```node_label```) separated by ```\t```. 36 | - All labeled nodes are of the same ```node_type```. 37 | - For ```DBLP```, ```Freebase```, and ```PubMed```, each labeled node only has one label. For ```Yelp```, each labeled node has one or multiple labels separated by ```,```. 38 | - For unsupervised training, ```label.dat``` and ```label.dat.test``` are merged for five-fold cross validation. For semi-supervised training, ```label.dat``` is used for training and ```label.dat.test``` is used for testing. 39 | 40 | ### link.dat.test 41 | 42 | - In each line, there are 3 elements (```node_id```, ```node_id```, ```link_status```) separated by ```\t```. 43 | - For ```link_status```, ```1``` indicates a positive link and ```0``` indicates a negative link. 44 | - Positive and negative links are of the same ```link_type```. 45 | - Number of positive links = Number of negative links = One fourth of the number of real links of the same type in ```label.dat```. 
46 | 47 | ### label.dat.test 48 | 49 | - In each line, there are 4 elements (```node_id```, ```node_name```, ```node_type```, ```node_label```) separated by ```\t```. 50 | - All labeled nodes are of the same ```node_type```. 51 | - Number of labeled nodes in ```label.dat.test``` = One fourth of the number of labeled nodes in ```label.dat```. 52 | - For ```DBLP```, ```Freebase```, and ```PubMed```, each labeled node only has one label. For ```Yelp```, each labeled node has one or multiple labels separated by ```,```. 53 | - For unsupervised training, ```label.dat``` and ```label.dat.test``` are merged for five-fold cross validation. For semi-supervised training, ```label.dat``` is used for training and ```label.dat.test``` is used for testing. 54 | 55 | ### meta.dat 56 | 57 | - This file describes the number of instances of each node type, link type, and label type in the corresponding dataset. 58 | 59 | ### info.dat 60 | 61 | - This file describes the meaning of each node type, link type, and label type in the corresponding dataset. 62 | 63 | ### record.dat 64 | 65 | - In each paragraph, the first line tells the model and evaluation settings, the second line tells the set of training parameters, and the third line tells the evaluation results. 66 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluate 2 | 3 | This stage evaluates the output embeddings based on specific tasks. 4 | 5 | Users need to specify the following parameters in ```evaluate.sh```: 6 | - **dataset**: choose from ```PubMed``` and ```Yelp```; 7 | - **model**: choose from ```DMPNN```, ```CompGCN```, ```R-GIN```; 8 | - **attributed**: choose ```True``` for attributed training or ```False``` for unattributed training; 9 | - **supervised**: choose ```True``` for semi-supervised training or ```False``` for unsupervised training; 10 | - **task**: choose ```nc``` for node classification, ```lp``` for link prediction, or ```both``` for both tasks. 11 | 12 | *Note: Only Message-Passing Methods (```R-GCN```, ```R-GIN```, ```CompGCN```, ```DMPNN```, ```HAN```, ```MAGNN```, ```HGT```) support attributed or semi-supervised training.*
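For reference, ```evaluate.sh``` expands these parameters into direct calls of ```evaluate.py```. A single unattributed, unsupervised evaluation of ```DMPNN``` embeddings on ```PubMed``` would look roughly like the sketch below; when ```--emb_file``` and ```--record_file``` are omitted, ```evaluate.py``` falls back to ```../Model/DMPNN/data/PubMed/emb.dat``` and ```../Data/PubMed/record.dat```.

```bash
python evaluate.py \
    --dataset PubMed \
    --model DMPNN \
    --task both \
    --attributed False \
    --supervised False
```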
13 | *Note: Only ```DBLP``` and ```PubMed``` contain node attributes.* 14 | 15 | **Node Classification**:
16 | We train a separate linear Support Vector Machine (LinearSVC) on the learned embeddings of 80% of the labeled nodes and predict on the remaining 20%. We repeat this process for a standard five-fold cross validation and report the average **Macro-F1** (across all labels) and **Micro-F1** (across all nodes). 17 | 18 | **Link Prediction**:
19 | We use the Hadamard function to construct feature vectors for node pairs, train a two-class LinearSVC on the 80% training links and evaluate towards the 20% held out links. We repeat the process for standard five-fold cross validation and compute the average scores regarding **AUC** (area under the ROC curve) and **MRR** (mean reciprocal rank). 20 | 21 | Run ```bash evaluate.sh``` to complete *Stage 4: Evaluate*. 22 | 23 | The evaluation results are stored in ```record.dat``` of the corresponding dataset. -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from link_prediction import * 4 | from node_classification import * 5 | 6 | data_folder, model_folder = '../Data', '../Model' 7 | emb_file, record_file = 'emb.dat', 'record.dat' 8 | link_test_file, label_test_file, label_file = 'link.dat.test', 'label.dat.test', 'label.dat' 9 | 10 | 11 | def parse_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | '--dataset', required=True, type=str, choices=['DBLP', 'Freebase', 'PubMed', 'Yelp'], 16 | help='Targeting dataset.' 17 | ) 18 | parser.add_argument( 19 | '--model', 20 | required=True, 21 | type=str, 22 | help='Targeting model.', 23 | choices=[ 24 | 'metapath2vec-ESim', 'PTE', 'HIN2Vec', 'AspEm', 'HEER', 'R-GCN', 'R-GIN', 'CompGCN', 'DMPNN', 'HAN', 25 | 'MAGNN', 'HGT', 'TransE', 'DistMult', 'ComplEx', 'ConvE' 26 | ] 27 | ) 28 | parser.add_argument('--task', required=True, type=str, help='Targeting task.', choices=['nc', 'lp', 'both']) 29 | parser.add_argument( 30 | '--attributed', 31 | required=True, 32 | type=str, 33 | help='Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support attributed training.', 34 | choices=['True', 'False'] 35 | ) 36 | parser.add_argument( 37 | '--supervised', 38 | required=True, 39 | type=str, 40 | help='Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support semi-supervised training.', 41 | choices=['True', 'False'] 42 | ) 43 | parser.add_argument('--emb_file', type=str, default='') 44 | parser.add_argument('--record_file', type=str, default='') 45 | parser.add_argument('--link_test_file', type=str, default='') 46 | parser.add_argument('--label_test_file', type=str, default='') 47 | parser.add_argument('--label_file', type=str, default='') 48 | 49 | return parser.parse_args() 50 | 51 | 52 | def load(emb_file_path): 53 | 54 | emb_dict = {} 55 | with open(emb_file_path, 'r') as emb_file: 56 | for i, line in enumerate(emb_file): 57 | if i == 0: 58 | train_para = line[:-1] 59 | else: 60 | index, emb = line[:-1].split('\t') 61 | emb_dict[index] = np.array(emb.split()).astype(np.float32) 62 | 63 | return train_para, emb_dict 64 | 65 | 66 | def record(args, all_tasks, train_para, all_scores): 67 | 68 | if args.record_file != '': 69 | filename = args.record_file 70 | else: 71 | filename = f'{data_folder}/{args.dataset}/{record_file}' 72 | with open(filename, 'a') as file: 73 | for task, score in zip(all_tasks, all_scores): 74 | file.write(f'model={args.model}, task={task}, attributed={args.attributed}, supervised={args.supervised}\n') 75 | file.write(f'{train_para}\n') 76 | if task == 'nc': 77 | file.write(f'Macro-F1={score[0]:.4f}, Micro-F1={score[1]:.4f}\n') 78 | elif task == 'lp': 79 | file.write(f'AUC={score[0]:.4f}, MRR={score[1]:.4f}\n') 80 | file.write('\n') 81 | 82 | return 83 | 84 | 85 | def check(args): 
86 | 87 | if args.attributed == 'True': 88 | if args.model not in ['R-GCN', 'R-GIN', 'CompGCN', 'DMPNN', 'HAN', 'MAGNN', 'HGT']: 89 | print(f'{args.model} does not support attributed training!') 90 | print('Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support attributed training!') 91 | return False 92 | if args.dataset not in ['DBLP', 'PubMed']: 93 | print(f'{args.dataset} does not support attributed training!') 94 | print('Only DBLP and PubMed support attributed training!') 95 | return False 96 | 97 | if args.supervised == 'True': 98 | if args.model not in ['R-GCN', 'R-GIN', 'CompGCN', 'DMPNN', 'HAN', 'MAGNN', 'HGT']: 99 | print(f'{args.model} does not support semi-supervised training!') 100 | print('Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support semi-supervised training!') 101 | return False 102 | 103 | return True 104 | 105 | 106 | def main(): 107 | 108 | args = parse_args() 109 | 110 | if not check(args): 111 | return 112 | 113 | print('Load Embeddings!') 114 | print(args) 115 | if args.emb_file != '': 116 | emb_file_path = args.emb_file 117 | else: 118 | emb_file_path = f'{model_folder}/{args.model}/data/{args.dataset}/{emb_file}' 119 | train_para, emb_dict = load(emb_file_path) 120 | 121 | if args.label_file != '': 122 | label_file_path = args.label_file 123 | else: 124 | label_file_path = f'{data_folder}/{args.dataset}/{label_file}' 125 | 126 | if args.label_test_file != '': 127 | label_test_file_path = args.label_test_file 128 | else: 129 | label_test_file_path = f'{data_folder}/{args.dataset}/{label_test_file}' 130 | 131 | if args.link_test_file != '': 132 | link_test_file_path = args.link_test_file 133 | else: 134 | link_test_file_path = f'{data_folder}/{args.dataset}/{link_test_file}' 135 | 136 | print('Start Evaluation!') 137 | all_tasks, all_scores = [], [] 138 | if args.task == 'nc' or args.task == 'both': 139 | print(f'Evaluate Node Classification Performance for Model {args.model} on Dataset {args.dataset}!') 140 | scores = nc_evaluate(args.dataset, args.supervised, label_file_path, label_test_file_path, emb_dict) 141 | all_tasks.append('nc') 142 | all_scores.append(scores) 143 | print(f'Macro-F1={scores[0]:.4f}, Micro-F1={scores[1]:.4f}') 144 | if args.task == 'lp' or args.task == 'both': 145 | print(f'Evaluate Link Prediction Performance for Model {args.model} on Dataset {args.dataset}!') 146 | scores = lp_evaluate(link_test_file_path, emb_dict) 147 | all_tasks.append('lp') 148 | all_scores.append(scores) 149 | print(f'AUC={scores[0]:.4f}, MRR={scores[1]:.4f}') 150 | 151 | print('Record Results!') 152 | record(args, all_tasks, train_para, all_scores) 153 | 154 | return 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: Only "R-GCN", "R-GIN", "CompGCN", "ConjGCN", "HAN", "MAGNN", and "HGT" support attributed="True" or supervised="True" 4 | # Note: Only "DBLP" and "PubMed" support attributed="True" 5 | 6 | attributed="False" 7 | supervised="False" 8 | negative_sample=5 9 | dropout=0.2 10 | n_hidden=50 11 | n_epochs=50 # the epoch here is different with the epoch in original HNE 12 | grad_norm=1.0 13 | sampler=randomwalk 14 | 15 | for dataset in "PubMed" "Yelp" 16 | do 17 | for model in "DMPNN" "CompGCN" "R-GIN" "R-GCN" 18 | do 19 | folder="../Model/${model}/data/${dataset}/" 20 | 
node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | # record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | record_file="${folder}record_noattr_unsup_hidden${n_hidden}.dat" 34 | OMP_NUM_THREADS=4 python evaluate.py \ 35 | --dataset ${dataset} \ 36 | --model ${model} \ 37 | --task nc \ 38 | --attributed ${attributed} \ 39 | --supervised ${supervised} \ 40 | --emb_file ${emb_file} \ 41 | --record_file ${record_file} 42 | done 43 | done 44 | done 45 | done 46 | done 47 | exit 1 48 | done 49 | 50 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/link_prediction.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from collections import defaultdict 4 | from sklearn.svm import LinearSVC 5 | from sklearn.model_selection import KFold 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning 8 | from utils import SingleLabelBinarySeachCV, MultiLabelBinarySeachCV 9 | 10 | seed = 1 11 | max_iter = 300 12 | np.random.seed(seed) 13 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 14 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 15 | 16 | 17 | def cross_validation(edge_embs, edge_labels): 18 | 19 | auc, mrr = [], [] 20 | seed_nodes, num_nodes = np.array(list(edge_embs.keys())), len(edge_embs) 21 | 22 | skf = KFold(n_splits=5, shuffle=True, random_state=seed) 23 | for fold, (train_idx, test_idx) in enumerate(skf.split(np.zeros((num_nodes, 1)), np.zeros(num_nodes))): 24 | 25 | print(f'Start Evaluation Fold {fold}!') 26 | train_nodes = seed_nodes[train_idx] 27 | train_edge_embs = np.concatenate([edge_embs[n] for n in train_nodes]) 28 | train_edge_labels = np.concatenate([edge_labels[n] for n in train_nodes]) 29 | test_nodes = seed_nodes[test_idx] 30 | test_edge_embs = np.concatenate([edge_embs[n] for n in test_nodes]) 31 | test_edge_labels = np.concatenate([edge_labels[n] for n in test_nodes]) 32 | 33 | c = SingleLabelBinarySeachCV(train_edge_embs, train_edge_labels, multi_class="ovr") 34 | clf = LinearSVC(random_state=seed, max_iter=max_iter, multi_class="ovr", C=c) 35 | clf.fit(train_edge_embs, train_edge_labels) 36 | preds = clf.predict(test_edge_embs) 37 | auc.append(roc_auc_score(test_edge_labels, preds)) 38 | 39 | confidence = clf.decision_function(test_edge_embs) 40 | curr_mrr, conf_num = [], 0 41 | for each in test_idx: 42 | test_edge_conf = np.argsort(-confidence[conf_num:conf_num + len(edge_labels[seed_nodes[each]])]) 43 | rank = np.empty_like(test_edge_conf) 44 | rank[test_edge_conf] = np.arange(len(test_edge_conf)) 45 | curr_mrr.append(1 / (1 + np.min(rank[np.argwhere(edge_labels[seed_nodes[each]] == 1).flatten()]))) 46 | conf_num += len(rank) 47 | mrr.append(np.mean(curr_mrr)) 48 | assert conf_num == len(confidence) 49 | 50 | return np.mean(auc), np.mean(mrr) 51 | 52 | 53 | def lp_evaluate(test_file_path, emb_dict): 54 | 55 | posi, nega = defaultdict(set), defaultdict(set) 56 | with open(test_file_path, 'r') as 
test_file: 57 | for line in test_file: 58 | left, right, label = line[:-1].split('\t') 59 | if label == '1': 60 | posi[left].add(right) 61 | elif label == '0': 62 | nega[left].add(right) 63 | 64 | edge_embs, edge_labels = defaultdict(list), defaultdict(list) 65 | for left, rights in posi.items(): 66 | for right in rights: 67 | key = left + ',' + right 68 | if key in emb_dict: 69 | edge_embs[left].append(emb_dict[key]) 70 | else: 71 | edge_embs[left].append(emb_dict[left] * emb_dict[right]) 72 | edge_labels[left].append(1) 73 | for left, rights in nega.items(): 74 | for right in rights: 75 | key = left + ',' + right 76 | if key in emb_dict: 77 | edge_embs[left].append(emb_dict[key]) 78 | else: 79 | edge_embs[left].append(emb_dict[left] * emb_dict[right]) 80 | edge_labels[left].append(0) 81 | 82 | for node in edge_embs: 83 | edge_embs[node] = np.array(edge_embs[node]) 84 | edge_labels[node] = np.array(edge_labels[node]) 85 | 86 | auc, mrr = cross_validation(edge_embs, edge_labels) 87 | 88 | return auc, mrr 89 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/node_classification.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from collections import defaultdict 4 | from sklearn.svm import LinearSVC 5 | from sklearn.metrics import f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning 8 | from utils import SingleLabelBinarySeachCV, MultiLabelBinarySeachCV 9 | 10 | 11 | seed = 1 12 | max_iter = 300 13 | np.random.seed(seed) 14 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 15 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 16 | 17 | 18 | def nc_evaluate(dataset, supervised, label_file_path, label_test_path, emb_dict): 19 | 20 | if supervised == 'True': 21 | if dataset == 'Yelp': 22 | return semisupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict) 23 | elif dataset == 'DBLP' or dataset == 'Freebase' or dataset == 'PubMed': 24 | return semisupervised_single_class_single_label(label_file_path, label_test_path, emb_dict) 25 | elif supervised == 'False': 26 | if dataset == 'Yelp': 27 | return unsupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict) 28 | elif dataset == 'DBLP' or dataset == 'Freebase' or dataset == 'PubMed': 29 | return unsupervised_single_class_single_label(label_file_path, label_test_path, emb_dict) 30 | 31 | 32 | def semisupervised_single_class_single_label(label_file_path, label_test_path, emb_dict): 33 | 34 | train_labels, train_embeddings = [], [] 35 | with open(label_file_path, 'r') as label_file: 36 | for line in label_file: 37 | index, _, _, label = line.strip().split('\t') 38 | train_labels.append(label) 39 | train_embeddings.append(emb_dict[index]) 40 | train_labels, train_embeddings = np.array(train_labels).astype(int), np.array(train_embeddings) 41 | 42 | test_labels, test_embeddings = [], [] 43 | with open(label_test_path, 'r') as label_file: 44 | for line in label_file: 45 | index, _, _, label = line.strip().split('\t') 46 | test_labels.append(label) 47 | test_embeddings.append(emb_dict[index]) 48 | test_labels, test_embeddings = np.array(test_labels).astype(int), np.array(test_embeddings) 49 | 50 | c = SingleLabelBinarySeachCV(train_embeddings, train_labels, multi_class="ovr") 51 | clf = LinearSVC(random_state=seed, max_iter=max_iter, 
multi_class="ovr", C=c) 52 | clf.fit(train_embeddings, train_labels) 53 | preds = clf.predict(test_embeddings) 54 | 55 | macro = f1_score(test_labels, preds, average='macro') 56 | micro = f1_score(test_labels, preds, average='micro') 57 | 58 | return macro, micro 59 | 60 | 61 | def unsupervised_single_class_single_label(label_file_path, label_test_path, emb_dict): 62 | 63 | labels, embeddings = [], [] 64 | for file_path in [label_file_path, label_test_path]: 65 | with open(file_path, 'r') as label_file: 66 | for line in label_file: 67 | index, _, _, label = line.strip().split('\t') 68 | labels.append(label) 69 | embeddings.append(emb_dict[index]) 70 | labels, embeddings = np.array(labels).astype(int), np.array(embeddings) 71 | 72 | macro, micro = [], [] 73 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 74 | for train_idx, test_idx in skf.split(embeddings, labels): 75 | c = SingleLabelBinarySeachCV(embeddings[train_idx], labels[train_idx], multi_class="ovr") 76 | clf = LinearSVC(random_state=seed, max_iter=max_iter, multi_class="ovr", C=c) 77 | clf.fit(embeddings[train_idx], labels[train_idx]) 78 | preds = clf.predict(embeddings[test_idx]) 79 | 80 | macro.append(f1_score(labels[test_idx], preds, average='macro')) 81 | micro.append(f1_score(labels[test_idx], preds, average='micro')) 82 | print(macro) 83 | print(micro) 84 | return np.mean(macro), np.mean(micro) 85 | 86 | 87 | def semisupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict): 88 | 89 | train_embs = [] 90 | with open(label_file_path, 'r') as label_file: 91 | for line in label_file: 92 | index, _, nclass, label = line.strip().split('\t') 93 | train_embs.append(emb_dict[index]) 94 | train_embs = np.array(train_embs) 95 | 96 | test_embs = [] 97 | with open(label_test_path, 'r') as label_file: 98 | for line in label_file: 99 | index, _, nclass, label = line.strip().split('\t') 100 | test_embs.append(emb_dict[index]) 101 | test_embs = np.array(test_embs) 102 | 103 | label_count, labled_node_index = 0, 0 104 | binary_labels, label_dict, = [], {} 105 | with open(label_file_path, 'r') as label_file: 106 | for line in label_file: 107 | index, _, nclass, label = line.strip().split('\t') 108 | for each in label.split(','): 109 | if (nclass, each) not in label_dict: 110 | label_dict[(nclass, each)] = label_count 111 | label_count += 1 112 | binary_labels.append(np.zeros(len(train_embs), dtype=np.bool8)) 113 | binary_labels[label_dict[(nclass, each)]][labled_node_index] = 1 114 | labled_node_index += 1 115 | train_labels = np.vstack(binary_labels) 116 | 117 | label_count, labled_node_index = 0, 0 118 | binary_labels, label_dict, = [], {} 119 | with open(label_test_path, 'r') as label_file: 120 | for line in label_file: 121 | index, _, nclass, label = line.strip().split('\t') 122 | for each in label.split(','): 123 | if (nclass, each) not in label_dict: 124 | label_dict[(nclass, each)] = label_count 125 | label_count += 1 126 | binary_labels.append(np.zeros(len(test_embs), dtype=np.bool8)) 127 | binary_labels[label_dict[(nclass, each)]][labled_node_index] = 1 128 | labled_node_index += 1 129 | test_labels = np.vstack(binary_labels) 130 | 131 | weights, total_scores = [], [] 132 | for ntype, (train_label, test_label) in enumerate(zip(train_labels, test_labels)): 133 | c = MultiLabelBinarySeachCV(train_embs, train_label, multi_class="crammer_singer") 134 | clf = LinearSVC(random_state=seed, max_iter=max_iter, C=c, multi_class="crammer_singer") 135 | clf.fit(train_embs, train_label) 136 | preds = 
clf.predict(test_embs) 137 | scores = f1_score(test_label, preds, average='binary') 138 | weights.append(sum(test_label)) 139 | total_scores.append(scores) 140 | 141 | macro = sum(total_scores) / len(total_scores) 142 | micro = sum([score * weight for score, weight in zip(total_scores, weights)]) / sum(weights) 143 | 144 | return macro, micro 145 | 146 | 147 | def unsupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict): 148 | embs = [] 149 | for file_path in [label_file_path, label_test_path]: 150 | with open(file_path, 'r') as label_file: 151 | for line in label_file: 152 | index, _, nclass, label = line.strip().split('\t') 153 | embs.append(emb_dict[index]) 154 | embs = np.array(embs) 155 | 156 | label_count, labled_node_index = 0, 0 157 | binary_labels, label_dict, = [], {} 158 | for file_path in [label_file_path, label_test_path]: 159 | with open(file_path, 'r') as label_file: 160 | for line in label_file: 161 | index, _, nclass, label = line.strip().split('\t') 162 | for each in label.split(','): 163 | if (nclass, each) not in label_dict: 164 | label_dict[(nclass, each)] = label_count 165 | label_count += 1 166 | binary_labels.append(np.zeros(len(embs), dtype=np.int32)) 167 | binary_labels[label_dict[(nclass, each)]][labled_node_index] = 1 168 | labled_node_index += 1 169 | binary_labels = np.vstack(binary_labels) 170 | 171 | cs = [] 172 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 173 | for train_idx, test_idx in skf.split(embs, binary_labels[binary_labels.sum(axis=1).argmax()]): 174 | c = MultiLabelBinarySeachCV(embs[train_idx], binary_labels[:, train_idx], multi_class="crammer_singer") 175 | cs.append(c) 176 | weights, total_scores = [], [] 177 | for ntype, binary_label in enumerate(binary_labels): 178 | scores = [] 179 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 180 | for nsplit, (train_idx, test_idx) in enumerate(skf.split(embs, binary_label)): 181 | clf = LinearSVC(random_state=seed, max_iter=max_iter, C=cs[nsplit], multi_class="crammer_singer") 182 | clf.fit(embs[train_idx], binary_label[train_idx]) 183 | preds = clf.predict(embs[test_idx]) 184 | scores.append(f1_score(binary_label[test_idx], preds, average='binary')) 185 | 186 | weights.append(sum(binary_label)) 187 | total_scores.append(sum(scores) / 5) 188 | 189 | macro = sum(total_scores) / len(total_scores) 190 | micro = sum([score * weight for score, weight in zip(total_scores, weights)]) / sum(weights) 191 | 192 | return macro, micro 193 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from collections import defaultdict 4 | from sklearn.svm import LinearSVC 5 | from sklearn.metrics import f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning 8 | 9 | seed = 1 10 | max_iter = 100 11 | np.random.seed(seed) 12 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 13 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 14 | 15 | 16 | def SingleLabelBinarySeachCV(data, labels, multi_class="ovr"): 17 | best_c = 1.0 18 | c0 = np.power(10.0, -(labels.max() - labels.min() + 1)) 19 | c1 = 1 / c0 20 | cnt = 0 21 | max_cnt = 2 * (labels.max() - labels.min() + 1) - 1 22 | while cnt < max_cnt and np.abs(c0 - c1) > 1e-10: 23 |
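        # Note on this search: rather than a conventional grid search over C,
        # the loop brackets C between a very small value c0 (10^-K for K
        # classes, e.g. 1e-4 when labels run 0..3) and a very large value c1
        # (1/c0). Each round fits LinearSVC at both endpoints on a growing
        # random subsample with a growing iteration budget, scores both fits on
        # that same subsample by Macro-F1 + Micro-F1, keeps the better endpoint
        # as best_c, and moves the losing endpoint one decade toward the winner
        # (c1 /= 10 or c0 *= 10). The search stops after max_cnt rounds or once
        # the endpoints numerically meet; the callers in link_prediction.py and
        # node_classification.py then reuse the returned C for their final
        # LinearSVC fits.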
np.random.seed(cnt) 24 | index = np.random.choice(len(data), size=(int(len(data) * (cnt+1) / max_cnt), ), replace=False) 25 | cur_data, cur_labels = data[index], labels[index] 26 | clf0 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class=multi_class, C=c0) 27 | clf0.fit(cur_data, cur_labels) 28 | preds0 = clf0.predict(cur_data) 29 | macro0 = f1_score(cur_labels, preds0, average='macro') 30 | micro0 = f1_score(cur_labels, preds0, average='micro') 31 | 32 | clf1 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class=multi_class, C=c1) 33 | clf1.fit(cur_data, cur_labels) 34 | preds1 = clf1.predict(cur_data) 35 | macro1 = f1_score(cur_labels, preds1, average='macro') 36 | micro1 = f1_score(cur_labels, preds1, average='micro') 37 | 38 | if macro0 + micro0 > macro1 + micro1: 39 | best_c = c0 40 | c1 /= 10 41 | else: 42 | best_c = c1 43 | c0 *= 10 44 | cnt += 1 45 | return best_c 46 | 47 | 48 | def MultiLabelBinarySeachCV(data, labels, multi_class="crammer_singer"): 49 | best_c = 1.0 50 | c0 = np.power(10.0, -len(labels)) 51 | c1 = 1 / c0 52 | cnt = 0 53 | max_cnt = 2 * len(labels) - 1 54 | while cnt < max_cnt and np.abs(c0 - c1) > 1e-10: 55 | np.random.seed(cnt) 56 | index = np.random.choice(len(data), size=(int(len(data) * (cnt+1) / max_cnt), ), replace=False) 57 | cur_data, cur_labels = data[index], labels[:, index] 58 | weights0 = np.zeros((len(cur_data)), dtype=np.float32) 59 | scores0 = np.zeros((len(cur_data)), dtype=np.float32) 60 | for ntype, nlabels in enumerate(cur_labels): 61 | clf0 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class="crammer_singer", C=c0) 62 | clf0.fit(cur_data, nlabels) 63 | preds0 = clf0.predict(cur_data) 64 | scores0[ntype] = f1_score(nlabels, preds0, average='binary') 65 | weights0[ntype] = sum(nlabels) 66 | macro0 = scores0.sum() / scores0.shape[0] 67 | micro0 = (scores0 * weights0 / weights0.sum()).sum() 68 | 69 | weights1 = np.zeros((len(cur_data)), dtype=np.float32) 70 | scores1 = np.zeros((len(cur_data)), dtype=np.float32) 71 | for ntype, nlabels in enumerate(cur_labels): 72 | clf1 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class="crammer_singer", C=c1) 73 | clf1.fit(cur_data, nlabels) 74 | preds1 = clf1.predict(cur_data) 75 | scores1[ntype] = f1_score(nlabels, preds1, average='binary') 76 | weights1[ntype] = sum(nlabels) 77 | macro1 = scores1.sum() / scores1.shape[0] 78 | micro1 = (scores1 * weights1 / weights1.sum()).sum() 79 | 80 | if macro0 + micro0 > macro1 + micro1: 81 | best_c = c0 82 | c1 /= 10 83 | else: 84 | best_c = c1 85 | c0 *= 10 86 | cnt += 1 87 | return best_c 88 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/CompGCN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=2 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for 
graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/CompGCN/src/main.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import dgl 4 | import math 5 | import numpy as np 6 | import os 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from tqdm import tqdm 12 | 13 | import utils 14 | from model import * 15 | 16 | 17 | np.random.seed(12345) 18 | torch.manual_seed(12345) 19 | torch.cuda.manual_seed(12345) 20 | 21 | 22 | class CosineWarmupRestartScheduler(LambdaLR): 23 | def __init__( 24 | self, 25 | num_warmup_steps=600, 26 | num_schedule_steps=10000, 27 | num_cycles=2, 28 | min_percent=1e-3 29 | ): 30 | self.num_warmup_steps = num_warmup_steps 31 | self.num_schedule_steps = num_schedule_steps 32 | self.num_cycles = num_cycles 33 | self.min_percent = min_percent 34 | 35 | def set_optimizer(self, optimizer): 36 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 37 | 38 | def lr_lambda(self, current_step): 39 | if current_step < self.num_warmup_steps: 40 | return float(current_step) / float(max(1, self.num_warmup_steps)) 41 | progress = float(current_step - self.num_warmup_steps) / \ 42 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 43 | if progress >= 1.0: 44 | return self.min_percent 45 | return max(self.min_percent, 0.5 * (1.0 + math.cos(math.pi * ((float(self.num_cycles) * progress) % 1.0)))) 46 | 47 | 48 | def main(args): 49 | 50 | # load graph data 51 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start loading...", flush=True) 52 | if args.supervised == "True": 53 | train_pool, train_labels, nlabels, multi = utils.load_label(args.label) 54 | train_data, num_nodes, num_rels, train_indices, ntrain, node_attri = utils.load_supervised( 55 | args, args.link, args.node, train_pool 56 | ) 57 | elif args.supervised == "False": 58 | train_data, num_nodes, num_rels, node_attri = utils.load_unsupervised(args, args.link, args.node) 59 | nlabels = 0 60 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "finish loading...", flush=True) 61 | 62 | # check cuda 63 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 64 | if use_cuda: 65 | torch.cuda.set_device(args.gpu) 66 | print("check 1", flush=True) 67 | # create 
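    # Note: CosineWarmupRestartScheduler above stores its hyper-parameters first
    # and finishes construction in set_optimizer(), because LambdaLR.__init__
    # requires an optimizer instance that does not exist yet at that point.
    # This script uses CosineAnnealingLR further below instead; a sketch of how
    # the warmup/restart variant would be wired (step counts are assumptions) is:
    #
    #   scheduler = CosineWarmupRestartScheduler(
    #       num_warmup_steps=600,
    #       num_schedule_steps=args.n_epochs * len(dataloader),
    #       num_cycles=2,
    #   )
    #   scheduler.set_optimizer(optimizer)
    #   ...
    #   optimizer.step(); scheduler.step()   # once per batch, as in the loop below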
model 68 | model = TrainModel( 69 | node_attri, 70 | num_nodes, 71 | args.n_hidden, 72 | num_rels, 73 | nlabels, 74 | num_hidden_layers=args.n_layers, 75 | dropout=args.dropout, 76 | use_cuda=use_cuda, 77 | reg_param=args.regularization 78 | ) 79 | print("check 2", flush=True) 80 | if use_cuda: 81 | model.to("cuda:%d" % (args.gpu)) 82 | print("check 3", flush=True) 83 | """ 84 | # build adj list and calculate degrees for sampling 85 | degrees = utils.get_adj_and_degrees(num_nodes, train_data) 86 | """ 87 | # build graph 88 | graph = utils.build_graph_from_triplets(num_nodes, num_rels, train_data) 89 | graph.ndata[dgl.NID] = torch.arange(num_nodes, dtype=torch.long) 90 | graph.edata[dgl.EID] = torch.arange(len(train_data) * 2, dtype=torch.long) 91 | seed_nodes = list() 92 | if os.path.exists(args.node.replace("node.dat", "seed_node.dat")): 93 | with open(args.node.replace("node.dat", "seed_node.dat"), "r") as f: 94 | for line in f: 95 | seed_nodes.append(int(line)) 96 | seed_nodes = set(seed_nodes) 97 | if len(seed_nodes) > 0: 98 | dataloader = torch.utils.data.DataLoader( 99 | np.array([x for x in train_data if x[0] in seed_nodes or x[2] in seed_nodes]), 100 | batch_size=args.graph_batch_size, shuffle=True 101 | ) 102 | else: 103 | dataloader = torch.utils.data.DataLoader( 104 | train_data, 105 | batch_size=args.graph_batch_size, shuffle=True 106 | ) 107 | print("check 4", flush=True) 108 | # optimizer 109 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 110 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs * len(dataloader), eta_min=3e-6) 111 | optimizer.zero_grad() 112 | scheduler.step(0) 113 | 114 | # training loop 115 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start training...", flush=True) 116 | model.train() 117 | prev_loss = np.float32("inf") 118 | for epoch in range(args.n_epochs): 119 | losses = [] 120 | for batch in tqdm(dataloader): 121 | # perform edge neighborhood sampling to generate training graph and data 122 | if args.supervised == "True": 123 | subg, samples, matched_labels, matched_index = \ 124 | utils.generate_sampled_graph_and_labels_supervised( 125 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 126 | args.graph_split_size, 127 | train_indices, train_labels, multi, nlabels, ntrain, 128 | if_train=True, label_batch_size=args.label_batch_size 129 | ) 130 | if multi: 131 | matched_labels = torch.from_numpy(matched_labels).float() 132 | else: 133 | matched_labels = torch.from_numpy(matched_labels).long() 134 | if use_cuda: 135 | matched_labels = matched_labels.to("cuda:%d" % (args.gpu)) 136 | elif args.supervised == "False": 137 | subg, samples, labels = \ 138 | utils.generate_sampled_graph_and_labels_unsupervised( 139 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 140 | args.graph_split_size, args.negative_sample 141 | ) 142 | samples = torch.from_numpy(samples) 143 | labels = torch.from_numpy(labels) 144 | if use_cuda: 145 | samples = samples.to("cuda:%d" % (args.gpu)) 146 | labels = labels.to("cuda:%d" % (args.gpu)) 147 | else: 148 | raise ValueError 149 | 150 | # calculate norms and eigenvalues of the subgraph 151 | edge_norm = utils.compute_edgenorm(subg) 152 | if use_cuda: 153 | subg = subg.to("cuda:%d" % (args.gpu)) 154 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 155 | edge_type = subg.edata["type"] 156 | 157 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 158 | 159 | if args.supervised == 
"True": 160 | loss = model.get_supervised_loss(subg, embed, edge_type, pred, matched_labels, matched_index, multi) 161 | elif args.supervised == "False": 162 | loss = model.get_unsupervised_loss(subg, embed, edge_type, samples, labels) 163 | loss.backward() 164 | losses.append(loss.item()) 165 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients 166 | optimizer.step() 167 | optimizer.zero_grad() 168 | scheduler.step() 169 | loss = sum(losses) / len(losses) 170 | 171 | print( 172 | time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + 173 | "Epoch {:05d} | Loss {:.4f}".format(epoch, loss), 174 | flush=True 175 | ) 176 | if loss > prev_loss: 177 | break 178 | prev_loss = loss 179 | 180 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "training done", flush=True) 181 | 182 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start output...", flush=True) 183 | dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.graph_batch_size * 4, shuffle=False) 184 | model.eval() 185 | with torch.no_grad(): 186 | node_emb, node_sampled = model.model.node_emb.weight.detach().cpu().clone(), set() 187 | for batch in tqdm(dataloader): 188 | subg, samples, labels = \ 189 | utils.generate_sampled_graph_and_labels_unsupervised( 190 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 191 | args.graph_split_size, args.negative_sample 192 | ) 193 | 194 | # calculate norms and eigenvalues of the subgraph 195 | edge_norm = utils.compute_edgenorm(subg) 196 | nid = subg.ndata[dgl.NID] 197 | coef = (subg.ndata["in_deg"].float() + 1) / (graph.ndata["in_deg"][nid].float() + 1) 198 | coef = coef.view(-1, 1) 199 | if use_cuda: 200 | subg = subg.to("cuda:%d" % (args.gpu)) 201 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 202 | edge_type = subg.edata["type"] 203 | 204 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 205 | 206 | node_emb[nid] = node_emb[nid] * (1 - coef) + embed[0].detach().cpu() * coef 207 | # node_emb[nid].data.copy_(embed[0].detach().cpu()) 208 | node_sampled.update(nid.numpy()) 209 | 210 | print("{:5}% node embeddings are saved.".format(len(node_sampled) * 100 / num_nodes)) 211 | if len(seed_nodes) > 0: 212 | seed_nodes = np.array(sorted(seed_nodes)) 213 | utils.save(args, node_emb[seed_nodes].numpy(), index=seed_nodes) 214 | else: 215 | utils.save(args, node_emb.numpy()) 216 | 217 | return 218 | 219 | 220 | if __name__ == "__main__": 221 | parser = argparse.ArgumentParser(description="CompGCN") 222 | parser.add_argument( 223 | "--link", type=str, required=True, 224 | help="dataset to use" 225 | ) 226 | parser.add_argument( 227 | "--node", type=str, required=True, 228 | help="dataset to use" 229 | ) 230 | parser.add_argument( 231 | "--label", type=str, required=True, 232 | help="dataset to use" 233 | ) 234 | parser.add_argument( 235 | "--output", required=True, type=str, 236 | help="Output embedding file" 237 | ) 238 | parser.add_argument( 239 | "--dropout", type=float, default=0.2, 240 | help="dropout probability" 241 | ) 242 | parser.add_argument( 243 | "--n-hidden", type=int, default=50, 244 | help="number of hidden units" 245 | ) 246 | parser.add_argument( 247 | "--gpu", type=int, default=-1, 248 | help="gpu" 249 | ) 250 | parser.add_argument( 251 | "--lr", type=float, default=1e-2, 252 | help="learning rate" 253 | ) 254 | parser.add_argument( 255 | "--n-layers", type=int, default=2, 256 | help="number of propagation rounds" 257 
| ) 258 | parser.add_argument( 259 | "--n-epochs", type=int, default=2000, 260 | help="number of minimum training epochs" 261 | ) 262 | parser.add_argument( 263 | "--regularization", type=float, default=0.01, 264 | help="regularization weight" 265 | ) 266 | parser.add_argument( 267 | "--grad-norm", type=float, default=1.0, 268 | help="norm to clip gradient to" 269 | ) 270 | parser.add_argument( 271 | "--label-batch-size", type=int, default=512 272 | ) 273 | parser.add_argument( 274 | "--graph-batch-size", type=int, default=20000, 275 | help="number of edges to sample in each iteration" 276 | ) 277 | parser.add_argument( 278 | "--graph-split-size", type=float, default=0.5, 279 | help="portion of edges used as positive sample" 280 | ) 281 | parser.add_argument( 282 | "--negative-sample", type=int, default=5, 283 | help="number of negative samples per positive sample" 284 | ) 285 | parser.add_argument( 286 | "--sampler", type=str, default="neighbor", 287 | help="type of subgraph sampler: neighbor or randomwalk" 288 | ) 289 | parser.add_argument( 290 | "--sample-depth", type=int, default=6 291 | ) 292 | parser.add_argument( 293 | "--sample-width", type=int, default=128 294 | ) 295 | parser.add_argument( 296 | "--attributed", type=str, default="False" 297 | ) 298 | parser.add_argument( 299 | "--supervised", type=str, default="False" 300 | ) 301 | 302 | args = parser.parse_args() 303 | print(args, flush=True) 304 | main(args) 305 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/DMPNN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=1 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/DMPNN/src/main.py: 
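# ------------------------------------------------------------------------------
# Note on the embedding write-out in the CompGCN main.py above (the DMPNN and
# R-GIN main.py scripts below share the same logic): embeddings computed on each
# sampled subgraph are blended into the stored table rather than overwriting it,
#
#     coef          = (in_deg_in_subgraph + 1) / (in_deg_in_full_graph + 1)
#     node_emb[nid] = (1 - coef) * node_emb[nid] + coef * fresh_embedding
#
# so a node whose incoming edges are mostly covered by the sampled subgraph
# trusts the fresh embedding, while a poorly covered node keeps most of its old
# value. Minimal numeric sketch (toy numbers, assumed):
import numpy as np
old = np.full(4, 1.0)                  # stored embedding of one node
new = np.zeros(4)                      # embedding from the sampled subgraph
coef = (1 + 1) / (9 + 1)               # 1 of 9 in-edges sampled -> coef = 0.2
print((1 - coef) * old + coef * new)   # -> [0.8 0.8 0.8 0.8]
# ------------------------------------------------------------------------------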
-------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import dgl 4 | import math 5 | import numpy as np 6 | import os 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from tqdm import tqdm 12 | 13 | import utils 14 | from model import * 15 | 16 | 17 | np.random.seed(12345) 18 | torch.manual_seed(12345) 19 | torch.cuda.manual_seed(12345) 20 | 21 | 22 | class CosineWarmupRestartScheduler(LambdaLR): 23 | def __init__( 24 | self, 25 | num_warmup_steps=600, 26 | num_schedule_steps=10000, 27 | num_cycles=2, 28 | min_percent=1e-3 29 | ): 30 | self.num_warmup_steps = num_warmup_steps 31 | self.num_schedule_steps = num_schedule_steps 32 | self.num_cycles = num_cycles 33 | self.min_percent = min_percent 34 | 35 | def set_optimizer(self, optimizer): 36 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 37 | 38 | def lr_lambda(self, current_step): 39 | if current_step < self.num_warmup_steps: 40 | return float(current_step) / float(max(1, self.num_warmup_steps)) 41 | progress = float(current_step - self.num_warmup_steps) / \ 42 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 43 | if progress >= 1.0: 44 | return self.min_percent 45 | return max(self.min_percent, 0.5 * (1.0 + math.cos(math.pi * ((float(self.num_cycles) * progress) % 1.0)))) 46 | 47 | 48 | def main(args): 49 | 50 | # load graph data 51 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start loading...", flush=True) 52 | if args.supervised == "True": 53 | train_pool, train_labels, nlabels, multi = utils.load_label(args.label) 54 | train_data, num_nodes, num_rels, train_indices, ntrain, node_attri = utils.load_supervised( 55 | args, args.link, args.node, train_pool 56 | ) 57 | elif args.supervised == "False": 58 | train_data, num_nodes, num_rels, node_attri = utils.load_unsupervised(args, args.link, args.node) 59 | nlabels = 0 60 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "finish loading...", flush=True) 61 | 62 | # check cuda 63 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 64 | if use_cuda: 65 | torch.cuda.set_device(args.gpu) 66 | print("check 1", flush=True) 67 | # create model 68 | model = TrainModel( 69 | node_attri, 70 | num_nodes, 71 | args.n_hidden, 72 | num_rels, 73 | nlabels, 74 | num_hidden_layers=args.n_layers, 75 | dropout=args.dropout, 76 | use_cuda=use_cuda, 77 | reg_param=args.regularization 78 | ) 79 | print("check 2", flush=True) 80 | if use_cuda: 81 | model.to("cuda:%d" % (args.gpu)) 82 | print("check 3", flush=True) 83 | """ 84 | # build adj list and calculate degrees for sampling 85 | degrees = utils.get_adj_and_degrees(num_nodes, train_data) 86 | """ 87 | # build graph 88 | graph = utils.build_graph_from_triplets(num_nodes, num_rels, train_data) 89 | graph.ndata[dgl.NID] = torch.arange(num_nodes, dtype=torch.long) 90 | graph.edata[dgl.EID] = torch.arange(len(train_data) * 2, dtype=torch.long) 91 | seed_nodes = list() 92 | if os.path.exists(args.node.replace("node.dat", "seed_node.dat")): 93 | with open(args.node.replace("node.dat", "seed_node.dat"), "r") as f: 94 | for line in f: 95 | seed_nodes.append(int(line)) 96 | seed_nodes = set(seed_nodes) 97 | if len(seed_nodes) > 0: 98 | dataloader = torch.utils.data.DataLoader( 99 | np.array([x for x in train_data if x[0] in seed_nodes or x[2] in seed_nodes]), 100 | batch_size=args.graph_batch_size, shuffle=True 101 | ) 102 | else: 
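    # Note on the seed-node branch above: if a file named seed_node.dat sits next
    # to node.dat (one integer node id per line, e.g. "0", "17", "42" -- values
    # here are illustrative), training triplets are restricted to those whose
    # head or tail is a seed node, and at the end of main() only the seed nodes'
    # embeddings are written out via utils.save(..., index=seed_nodes). Without
    # that file, all triplets are used and the full embedding table is saved.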
103 | dataloader = torch.utils.data.DataLoader( 104 | train_data, 105 | batch_size=args.graph_batch_size, shuffle=True 106 | ) 107 | args.n_epochs = math.ceil(args.n_epochs * len(dataloader) * args.graph_batch_size / num_nodes) 108 | print("check 4", flush=True) 109 | # optimizer 110 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 111 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs * len(dataloader), eta_min=3e-6) 112 | optimizer.zero_grad() 113 | scheduler.step(0) 114 | 115 | # training loop 116 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start training...", flush=True) 117 | model.train() 118 | prev_loss = np.float32("inf") 119 | for epoch in range(args.n_epochs): 120 | losses = [] 121 | for batch in tqdm(dataloader): 122 | # perform edge neighborhood sampling to generate training graph and data 123 | if args.supervised == "True": 124 | subg, samples, matched_labels, matched_index = \ 125 | utils.generate_sampled_graph_and_labels_supervised( 126 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 127 | args.graph_split_size, 128 | train_indices, train_labels, multi, nlabels, ntrain, 129 | if_train=True, label_batch_size=args.label_batch_size 130 | ) 131 | if multi: 132 | matched_labels = torch.from_numpy(matched_labels).float() 133 | else: 134 | matched_labels = torch.from_numpy(matched_labels).long() 135 | if use_cuda: 136 | matched_labels = matched_labels.to("cuda:%d" % (args.gpu)) 137 | elif args.supervised == "False": 138 | subg, samples, labels = \ 139 | utils.generate_sampled_graph_and_labels_unsupervised( 140 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 141 | args.graph_split_size, args.negative_sample 142 | ) 143 | samples = torch.from_numpy(samples) 144 | labels = torch.from_numpy(labels) 145 | if use_cuda: 146 | samples = samples.to("cuda:%d" % (args.gpu)) 147 | labels = labels.to("cuda:%d" % (args.gpu)) 148 | else: 149 | raise ValueError 150 | 151 | # calculate norms and eigenvalues of the subgraph 152 | edge_norm = utils.compute_edgenorm(subg) 153 | if use_cuda: 154 | subg = subg.to("cuda:%d" % (args.gpu)) 155 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 156 | edge_type = subg.edata["type"] 157 | 158 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 159 | 160 | if args.supervised == "True": 161 | loss = model.get_supervised_loss(subg, embed, edge_type, pred, matched_labels, matched_index, multi) 162 | elif args.supervised == "False": 163 | loss = model.get_unsupervised_loss(subg, embed, edge_type, samples, labels) 164 | loss.backward() 165 | losses.append(loss.item()) 166 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients 167 | optimizer.step() 168 | optimizer.zero_grad() 169 | scheduler.step() 170 | loss = sum(losses) / len(losses) 171 | 172 | print( 173 | time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + 174 | "Epoch {:05d} | Loss {:.4f}".format(epoch, loss), 175 | flush=True 176 | ) 177 | if loss > prev_loss: 178 | break 179 | prev_loss = loss 180 | 181 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "training done", flush=True) 182 | 183 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start output...", flush=True) 184 | dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.graph_batch_size * 4, shuffle=False) 185 | model.eval() 186 | with torch.no_grad(): 187 | node_emb, node_sampled = 
model.model.node_emb.weight.detach().cpu().clone(), set() 188 | for batch in tqdm(dataloader): 189 | subg, samples, labels = \ 190 | utils.generate_sampled_graph_and_labels_unsupervised( 191 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 192 | args.graph_split_size, args.negative_sample 193 | ) 194 | 195 | # calculate norms and eigenvalues of the subgraph 196 | edge_norm = utils.compute_edgenorm(subg) 197 | nid = subg.ndata[dgl.NID] 198 | coef = (subg.ndata["in_deg"].float() + 1) / (graph.ndata["in_deg"][nid].float() + 1) 199 | coef = coef.view(-1, 1) 200 | if use_cuda: 201 | subg = subg.to("cuda:%d" % (args.gpu)) 202 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 203 | edge_type = subg.edata["type"] 204 | 205 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 206 | 207 | node_emb[nid] = node_emb[nid] * (1 - coef) + embed[0].detach().cpu() * coef 208 | # node_emb[nid].data.copy_(embed[0].detach().cpu()) 209 | node_sampled.update(nid.numpy()) 210 | 211 | print("{:5}% node embeddings are saved.".format(len(node_sampled) * 100 / num_nodes)) 212 | if len(seed_nodes) > 0: 213 | seed_nodes = np.array(sorted(seed_nodes)) 214 | utils.save(args, node_emb[seed_nodes].numpy(), index=seed_nodes) 215 | else: 216 | utils.save(args, node_emb.numpy()) 217 | 218 | return 219 | 220 | 221 | if __name__ == "__main__": 222 | parser = argparse.ArgumentParser(description="DMPNN") 223 | parser.add_argument( 224 | "--link", type=str, required=True, 225 | help="dataset to use" 226 | ) 227 | parser.add_argument( 228 | "--node", type=str, required=True, 229 | help="dataset to use" 230 | ) 231 | parser.add_argument( 232 | "--label", type=str, required=True, 233 | help="dataset to use" 234 | ) 235 | parser.add_argument( 236 | "--output", required=True, type=str, 237 | help="Output embedding file" 238 | ) 239 | parser.add_argument( 240 | "--dropout", type=float, default=0.2, 241 | help="dropout probability" 242 | ) 243 | parser.add_argument( 244 | "--n-hidden", type=int, default=50, 245 | help="number of hidden units" 246 | ) 247 | parser.add_argument( 248 | "--gpu", type=int, default=-1, 249 | help="gpu" 250 | ) 251 | parser.add_argument( 252 | "--lr", type=float, default=1e-2, 253 | help="learning rate" 254 | ) 255 | parser.add_argument( 256 | "--n-layers", type=int, default=2, 257 | help="number of propagation rounds" 258 | ) 259 | parser.add_argument( 260 | "--n-epochs", type=int, default=2000, 261 | help="number of minimum training epochs" 262 | ) 263 | parser.add_argument( 264 | "--regularization", type=float, default=0.01, 265 | help="regularization weight" 266 | ) 267 | parser.add_argument( 268 | "--grad-norm", type=float, default=1.0, 269 | help="norm to clip gradient to" 270 | ) 271 | parser.add_argument( 272 | "--label-batch-size", type=int, default=512 273 | ) 274 | parser.add_argument( 275 | "--graph-batch-size", type=int, default=20000, 276 | help="number of edges to sample in each iteration" 277 | ) 278 | parser.add_argument( 279 | "--graph-split-size", type=float, default=0.5, 280 | help="portion of edges used as positive sample" 281 | ) 282 | parser.add_argument( 283 | "--negative-sample", type=int, default=5, 284 | help="number of negative samples per positive sample" 285 | ) 286 | parser.add_argument( 287 | "--sampler", type=str, default="neighbor", 288 | help="type of subgraph sampler: neighbor or randomwalk" 289 | ) 290 | parser.add_argument( 291 | "--sample-depth", type=int, default=6 292 | ) 293 | parser.add_argument( 294 | 
"--sample-width", type=int, default=128 295 | ) 296 | parser.add_argument( 297 | "--attributed", type=str, default="False" 298 | ) 299 | parser.add_argument( 300 | "--supervised", type=str, default="False" 301 | ) 302 | 303 | args = parser.parse_args() 304 | print(args, flush=True) 305 | main(args) 306 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GCN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=3 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GCN/src/model.py: -------------------------------------------------------------------------------- 1 | import dgl 2 | import dgl.function as fn 3 | import math 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from dgl.nn.pytorch import RelGraphConv 9 | from utils import * 10 | 11 | 12 | class MultiHotEmbeddingLayer(nn.Module): 13 | def __init__(self, num_emb, emb_dim, base=2): 14 | super(MultiHotEmbeddingLayer, self).__init__() 15 | self.num_emb = num_emb 16 | enc_len = get_enc_len(num_emb - 1, base) 17 | self.encoding = nn.Embedding(num_emb, enc_len * base) 18 | self.embedding = nn.Parameter(torch.Tensor(enc_len * base, emb_dim)) 19 | 20 | with torch.no_grad(): 21 | self.encoding.weight.data.copy_( 22 | torch.from_numpy(int2multihot(np.arange(0, num_emb), enc_len, base)).float() 23 | ) 24 | 25 | scale = 1 / (emb_dim * enc_len)**0.5 26 | nn.init.uniform_(self.embedding, -scale, scale) 27 | self.encoding.weight.requires_grad = False 28 | 29 | def forward(self, g, x): 30 | enc = self.encoding(x.squeeze()) 31 | emb = torch.matmul(enc.view(-1, self.embedding.size(0)), self.embedding) 32 | return 
emb 33 | 34 | @property 35 | def weight(self): 36 | return torch.matmul(self.encoding.weight, self.embedding) 37 | 38 | 39 | class EmbeddingLayer(nn.Module): 40 | def __init__(self, num_emb, emb_dim): 41 | super(EmbeddingLayer, self).__init__() 42 | self.embedding = nn.Embedding(num_emb, emb_dim) 43 | scale = 1 / (emb_dim)**0.5 44 | nn.init.uniform_(self.embedding.weight, -scale, scale) 45 | 46 | def forward(self, g, x): 47 | return self.embedding(x.squeeze()) 48 | 49 | @property 50 | def weight(self): 51 | return self.embedding.weight 52 | 53 | 54 | class EmbeddingLayerAttri(nn.Module): 55 | def __init__(self, attri): 56 | super(EmbeddingLayerAttri, self).__init__() 57 | self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(attri)) 58 | 59 | def forward(self, g, x): 60 | return self.embedding(x.squeeze()) 61 | 62 | @property 63 | def weight(self): 64 | return self.embedding.weight 65 | 66 | 67 | class BaseModel(nn.Module): 68 | def __init__( 69 | self, 70 | node_attri, 71 | rel_attri, 72 | num_nodes, 73 | h_dim, 74 | out_dim, 75 | num_rels, 76 | num_hidden_layers=1, 77 | dropout=0, 78 | use_cuda=False 79 | ): 80 | super(BaseModel, self).__init__() 81 | self.num_nodes = num_nodes 82 | self.h_dim = h_dim 83 | self.out_dim = out_dim 84 | self.num_rels = num_rels 85 | self.num_hidden_layers = num_hidden_layers 86 | self.dropout = dropout 87 | self.use_cuda = use_cuda 88 | 89 | # create conjgcn layers 90 | self.build_model(node_attri, rel_attri) 91 | 92 | def build_model(self, node_attri, rel_attri): 93 | self.node_emb, self.rel_emb = self.build_input_layer(node_attri, rel_attri) 94 | self.layers = nn.ModuleList() 95 | # h2h 96 | for idx in range(self.num_hidden_layers): 97 | h2h = self.build_hidden_layer(idx) 98 | self.layers.append(h2h) 99 | # h2o 100 | h2o = self.build_output_layer() 101 | if h2o is not None: 102 | self.layers.append(h2o) 103 | 104 | def build_input_layer(self, node_attri, rel_attri): 105 | return None, None 106 | 107 | def build_hidden_layer(self, idx): 108 | raise NotImplementedError 109 | 110 | def build_output_layer(self): 111 | return None 112 | 113 | def forward(self, g, h, r, norm): 114 | raise NotImplementedError 115 | 116 | 117 | class RGCN(BaseModel): 118 | def build_input_layer(self, node_attri, rel_attri): 119 | if node_attri is not None: 120 | return EmbeddingLayerAttri(node_attri), None 121 | return EmbeddingLayer(self.num_nodes, self.h_dim), None 122 | 123 | def build_hidden_layer(self, idx): 124 | if idx == 0: 125 | in_dim = self.h_dim 126 | else: 127 | in_dim = self.out_dim 128 | if idx < self.num_hidden_layers - 1: 129 | act = nn.Tanh() 130 | else: 131 | act = None 132 | return RelGraphConv( 133 | in_dim, 134 | self.out_dim, 135 | self.num_rels, 136 | regularizer="basis", 137 | num_bases=-1, 138 | bias=True, 139 | activation=act, 140 | self_loop=True, 141 | dropout=self.dropout 142 | ) 143 | 144 | def forward(self, g, h, r, norm): 145 | h = self.node_emb(g, h) 146 | for layer in self.layers: 147 | h = layer(g, h, r, norm) 148 | return h 149 | 150 | 151 | class TrainModel(nn.Module): 152 | def __init__( 153 | self, 154 | node_attri, 155 | num_nodes, 156 | o_dim, 157 | num_rels, 158 | nlabel, 159 | num_hidden_layers=1, 160 | dropout=0, 161 | use_cuda=False, 162 | reg_param=0 163 | ): 164 | super(TrainModel, self).__init__() 165 | 166 | i_dim = o_dim if node_attri is None else node_attri.shape[1] 167 | self.model = RGCN( 168 | node_attri, None, num_nodes, i_dim, o_dim, num_rels * 2, num_hidden_layers, dropout, use_cuda 169 | ) 170 | self.reg_param = 
reg_param 171 | 172 | if nlabel == 0: 173 | self.supervised = False 174 | self.w_relation = nn.Parameter(torch.Tensor(num_rels, o_dim)) 175 | nn.init.xavier_uniform_(self.w_relation, gain=nn.init.calculate_gain('relu')) 176 | else: 177 | self.supervised = True 178 | self.node_fc = nn.Linear(o_dim, nlabel) 179 | nn.init.xavier_normal_(self.node_fc.weight, gain=nn.init.calculate_gain("sigmoid")) 180 | nn.init.zeros_(self.node_fc.bias) 181 | 182 | # self.edge_fc = nn.Linear(o_dim, num_rels * 2) 183 | self.edge_fc = nn.Linear(o_dim, o_dim) 184 | nn.init.xavier_normal_(self.edge_fc.weight, gain=nn.init.calculate_gain('sigmoid')) 185 | nn.init.zeros_(self.edge_fc.bias) 186 | 187 | def calc_score(self, embedding, triplets): 188 | if isinstance(embedding, (tuple, list)): 189 | node_emb = embedding[0] 190 | else: 191 | node_emb = embedding 192 | s = node_emb[triplets[:, 0]] 193 | r = self.w_relation[triplets[:, 1]] 194 | o = node_emb[triplets[:, 2]] 195 | score = torch.sum(s * r * o, dim=1) 196 | return score 197 | 198 | def forward(self, g, h, edge_type, edge_norm): 199 | output = self.model.forward(g, h, edge_type, edge_norm) 200 | if self.supervised: 201 | if isinstance(output, (tuple, list)): 202 | pred = self.node_fc(output[0]) 203 | else: 204 | pred = self.node_fc(output) 205 | else: 206 | pred = None 207 | 208 | return output, pred 209 | 210 | def unsupervised_regularization_loss(self, embedding, edge_type=None): 211 | reg = torch.mean(self.w_relation.pow(2)) 212 | if isinstance(embedding, (tuple, list)): 213 | for emb in embedding: 214 | reg = reg + torch.mean(emb.pow(2)) 215 | elif isinstance(embedding, torch.Tensor): 216 | reg = reg + torch.mean(embedding.pow(2)) 217 | else: 218 | raise ValueError 219 | if edge_type is not None: 220 | if isinstance(embedding, (tuple, list)): 221 | for emb in embedding: 222 | if emb.size(0) == edge_type.size(0): 223 | mask = edge_type < self.w_relation.size(0) 224 | # reg = reg + F.cross_entropy(self.edge_fc(emb[mask]), edge_type[mask]) 225 | emb_diff = self.edge_fc(emb[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 226 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 227 | elif isinstance(embedding, torch.Tensor): 228 | if embedding.size(0) == edge_type.size(0): 229 | mask = edge_type < self.w_relation.size(0) 230 | # reg = reg + F.cross_entropy(self.edge_fc(embedding[mask]), edge_type[mask]) 231 | emb_diff = self.edge_fc(embedding[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 232 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 233 | 234 | return reg 235 | 236 | def get_unsupervised_loss(self, g, embedding, edge_type, triplets, labels): 237 | # triplets is a list of data samples (positive and negative) 238 | # each row in the triplets is a 3-tuple of (source, relation, destination) 239 | score = self.calc_score(embedding, triplets) 240 | predict_loss = F.binary_cross_entropy_with_logits(score, labels) 241 | reg_loss = self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 242 | return predict_loss + self.reg_param * reg_loss 243 | 244 | def supervised_regularization_loss(self, embedding, edge_type=None): 245 | return self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 246 | 247 | def get_supervised_loss(self, g, embedding, edge_type, pred, matched_labels, matched_index, multi): 248 | # triplets is a list of data samples (positive and negative) 249 | # each row in the triplets is a 3-tuple of (source, relation, destination) 250 | if multi: 251 | predict_loss = 
F.binary_cross_entropy(torch.sigmoid(pred[matched_index]), matched_labels) 252 | else: 253 | predict_loss = F.nll_loss(F.log_softmax(pred[matched_index]), matched_labels) 254 | reg_loss = self.supervised_regularization_loss(embedding, edge_type=edge_type) 255 | return predict_loss + self.reg_param * reg_loss 256 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GIN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=2 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GIN/src/main.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import dgl 4 | import math 5 | import numpy as np 6 | import os 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from tqdm import tqdm 12 | 13 | import utils 14 | from model import * 15 | 16 | 17 | np.random.seed(12345) 18 | torch.manual_seed(12345) 19 | torch.cuda.manual_seed(12345) 20 | 21 | 22 | class CosineWarmupRestartScheduler(LambdaLR): 23 | def __init__( 24 | self, 25 | num_warmup_steps=600, 26 | num_schedule_steps=10000, 27 | num_cycles=2, 28 | min_percent=1e-3 29 | ): 30 | self.num_warmup_steps = num_warmup_steps 31 | self.num_schedule_steps = num_schedule_steps 32 | self.num_cycles = num_cycles 33 | self.min_percent = min_percent 34 | 35 | def set_optimizer(self, optimizer): 36 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 37 | 38 | def lr_lambda(self, current_step): 39 | if current_step < self.num_warmup_steps: 40 | return float(current_step) / float(max(1, self.num_warmup_steps)) 41 | progress = 
float(current_step - self.num_warmup_steps) / \ 42 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 43 | if progress >= 1.0: 44 | return self.min_percent 45 | return max(self.min_percent, 0.5 * (1.0 + math.cos(math.pi * ((float(self.num_cycles) * progress) % 1.0)))) 46 | 47 | 48 | def main(args): 49 | 50 | # load graph data 51 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start loading...", flush=True) 52 | if args.supervised == "True": 53 | train_pool, train_labels, nlabels, multi = utils.load_label(args.label) 54 | train_data, num_nodes, num_rels, train_indices, ntrain, node_attri = utils.load_supervised( 55 | args, args.link, args.node, train_pool 56 | ) 57 | elif args.supervised == "False": 58 | train_data, num_nodes, num_rels, node_attri = utils.load_unsupervised(args, args.link, args.node) 59 | nlabels = 0 60 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "finish loading...", flush=True) 61 | 62 | # check cuda 63 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 64 | if use_cuda: 65 | torch.cuda.set_device(args.gpu) 66 | print("check 1", flush=True) 67 | # create model 68 | model = TrainModel( 69 | node_attri, 70 | num_nodes, 71 | args.n_hidden, 72 | num_rels, 73 | nlabels, 74 | num_hidden_layers=args.n_layers, 75 | dropout=args.dropout, 76 | use_cuda=use_cuda, 77 | reg_param=args.regularization 78 | ) 79 | print("check 2", flush=True) 80 | if use_cuda: 81 | model.to("cuda:%d" % (args.gpu)) 82 | print("check 3", flush=True) 83 | """ 84 | # build adj list and calculate degrees for sampling 85 | degrees = utils.get_adj_and_degrees(num_nodes, train_data) 86 | """ 87 | # build graph 88 | graph = utils.build_graph_from_triplets(num_nodes, num_rels, train_data) 89 | graph.ndata[dgl.NID] = torch.arange(num_nodes, dtype=torch.long) 90 | graph.edata[dgl.EID] = torch.arange(len(train_data) * 2, dtype=torch.long) 91 | seed_nodes = list() 92 | if os.path.exists(args.node.replace("node.dat", "seed_node.dat")): 93 | with open(args.node.replace("node.dat", "seed_node.dat"), "r") as f: 94 | for line in f: 95 | seed_nodes.append(int(line)) 96 | seed_nodes = set(seed_nodes) 97 | if len(seed_nodes) > 0: 98 | dataloader = torch.utils.data.DataLoader( 99 | np.array([x for x in train_data if x[0] in seed_nodes or x[2] in seed_nodes]), 100 | batch_size=args.graph_batch_size, shuffle=True 101 | ) 102 | else: 103 | dataloader = torch.utils.data.DataLoader( 104 | train_data, 105 | batch_size=args.graph_batch_size, shuffle=True 106 | ) 107 | print("check 4", flush=True) 108 | # optimizer 109 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 110 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs * len(dataloader), eta_min=3e-6) 111 | optimizer.zero_grad() 112 | scheduler.step(0) 113 | 114 | # training loop 115 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start training...", flush=True) 116 | model.train() 117 | prev_loss = np.float32("inf") 118 | for epoch in range(args.n_epochs): 119 | losses = [] 120 | for batch in tqdm(dataloader): 121 | # perform edge neighborhood sampling to generate training graph and data 122 | if args.supervised == "True": 123 | subg, samples, matched_labels, matched_index = \ 124 | utils.generate_sampled_graph_and_labels_supervised( 125 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 126 | args.graph_split_size, 127 | train_indices, train_labels, multi, nlabels, ntrain, 128 | if_train=True, 
label_batch_size=args.label_batch_size 129 | ) 130 | if multi: 131 | matched_labels = torch.from_numpy(matched_labels).float() 132 | else: 133 | matched_labels = torch.from_numpy(matched_labels).long() 134 | if use_cuda: 135 | matched_labels = matched_labels.to("cuda:%d" % (args.gpu)) 136 | elif args.supervised == "False": 137 | subg, samples, labels = \ 138 | utils.generate_sampled_graph_and_labels_unsupervised( 139 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 140 | args.graph_split_size, args.negative_sample 141 | ) 142 | samples = torch.from_numpy(samples) 143 | labels = torch.from_numpy(labels) 144 | if use_cuda: 145 | samples = samples.to("cuda:%d" % (args.gpu)) 146 | labels = labels.to("cuda:%d" % (args.gpu)) 147 | else: 148 | raise ValueError 149 | 150 | # calculate norms and eigenvalues of the subgraph 151 | edge_norm = utils.compute_edgenorm(subg) 152 | if use_cuda: 153 | subg = subg.to("cuda:%d" % (args.gpu)) 154 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 155 | edge_type = subg.edata["type"] 156 | 157 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 158 | 159 | if args.supervised == "True": 160 | loss = model.get_supervised_loss(subg, embed, edge_type, pred, matched_labels, matched_index, multi) 161 | elif args.supervised == "False": 162 | loss = model.get_unsupervised_loss(subg, embed, edge_type, samples, labels) 163 | loss.backward() 164 | losses.append(loss.item()) 165 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients 166 | optimizer.step() 167 | optimizer.zero_grad() 168 | scheduler.step() 169 | loss = sum(losses) / len(losses) 170 | 171 | print( 172 | time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + 173 | "Epoch {:05d} | Loss {:.4f}".format(epoch, loss), 174 | flush=True 175 | ) 176 | if loss > prev_loss: 177 | break 178 | prev_loss = loss 179 | 180 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "training done", flush=True) 181 | 182 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start output...", flush=True) 183 | dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.graph_batch_size * 4, shuffle=False) 184 | model.eval() 185 | with torch.no_grad(): 186 | node_emb, node_sampled = model.model.node_emb.weight.detach().cpu().clone(), set() 187 | for batch in tqdm(dataloader): 188 | subg, samples, labels = \ 189 | utils.generate_sampled_graph_and_labels_unsupervised( 190 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 191 | args.graph_split_size, args.negative_sample 192 | ) 193 | 194 | # calculate norms and eigenvalues of the subgraph 195 | edge_norm = utils.compute_edgenorm(subg) 196 | nid = subg.ndata[dgl.NID] 197 | coef = (subg.ndata["in_deg"].float() + 1) / (graph.ndata["in_deg"][nid].float() + 1) 198 | coef = coef.view(-1, 1) 199 | if use_cuda: 200 | subg = subg.to("cuda:%d" % (args.gpu)) 201 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 202 | edge_type = subg.edata["type"] 203 | 204 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 205 | 206 | node_emb[nid] = node_emb[nid] * (1 - coef) + embed[0].detach().cpu() * coef 207 | # node_emb[nid].data.copy_(embed[0].detach().cpu()) 208 | node_sampled.update(nid.numpy()) 209 | 210 | print("{:5}% node embeddings are saved.".format(len(node_sampled) * 100 / num_nodes)) 211 | if len(seed_nodes) > 0: 212 | seed_nodes = np.array(sorted(seed_nodes)) 213 | 
utils.save(args, node_emb[seed_nodes].numpy(), index=seed_nodes) 214 | else: 215 | utils.save(args, node_emb.numpy()) 216 | 217 | return 218 | 219 | 220 | if __name__ == "__main__": 221 | parser = argparse.ArgumentParser(description="R-GIN") 222 | parser.add_argument( 223 | "--link", type=str, required=True, 224 | help="dataset to use" 225 | ) 226 | parser.add_argument( 227 | "--node", type=str, required=True, 228 | help="dataset to use" 229 | ) 230 | parser.add_argument( 231 | "--label", type=str, required=True, 232 | help="dataset to use" 233 | ) 234 | parser.add_argument( 235 | "--output", required=True, type=str, 236 | help="Output embedding file" 237 | ) 238 | parser.add_argument( 239 | "--dropout", type=float, default=0.2, 240 | help="dropout probability" 241 | ) 242 | parser.add_argument( 243 | "--n-hidden", type=int, default=50, 244 | help="number of hidden units" 245 | ) 246 | parser.add_argument( 247 | "--gpu", type=int, default=-1, 248 | help="gpu" 249 | ) 250 | parser.add_argument( 251 | "--lr", type=float, default=1e-2, 252 | help="learning rate" 253 | ) 254 | parser.add_argument( 255 | "--n-layers", type=int, default=2, 256 | help="number of propagation rounds" 257 | ) 258 | parser.add_argument( 259 | "--n-epochs", type=int, default=2000, 260 | help="number of minimum training epochs" 261 | ) 262 | parser.add_argument( 263 | "--regularization", type=float, default=0.01, 264 | help="regularization weight" 265 | ) 266 | parser.add_argument( 267 | "--grad-norm", type=float, default=1.0, 268 | help="norm to clip gradient to" 269 | ) 270 | parser.add_argument( 271 | "--label-batch-size", type=int, default=512 272 | ) 273 | parser.add_argument( 274 | "--graph-batch-size", type=int, default=20000, 275 | help="number of edges to sample in each iteration" 276 | ) 277 | parser.add_argument( 278 | "--graph-split-size", type=float, default=0.5, 279 | help="portion of edges used as positive sample" 280 | ) 281 | parser.add_argument( 282 | "--negative-sample", type=int, default=5, 283 | help="number of negative samples per positive sample" 284 | ) 285 | parser.add_argument( 286 | "--sampler", type=str, default="neighbor", 287 | help="type of subgraph sampler: neighbor or randomwalk" 288 | ) 289 | parser.add_argument( 290 | "--sample-depth", type=int, default=6 291 | ) 292 | parser.add_argument( 293 | "--sample-width", type=int, default=128 294 | ) 295 | parser.add_argument( 296 | "--attributed", type=str, default="False" 297 | ) 298 | parser.add_argument( 299 | "--supervised", type=str, default="False" 300 | ) 301 | 302 | args = parser.parse_args() 303 | print(args, flush=True) 304 | main(args) 305 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GIN/src/model.py: -------------------------------------------------------------------------------- 1 | import dgl 2 | import dgl.function as fn 3 | import math 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from dgl.nn.pytorch import RelGraphConv 9 | from utils import * 10 | 11 | 12 | class MultiHotEmbeddingLayer(nn.Module): 13 | def __init__(self, num_emb, emb_dim, base=2): 14 | super(MultiHotEmbeddingLayer, self).__init__() 15 | self.num_emb = num_emb 16 | enc_len = get_enc_len(num_emb - 1, base) 17 | self.encoding = nn.Embedding(num_emb, enc_len * base) 18 | self.embedding = nn.Parameter(torch.Tensor(enc_len * base, emb_dim)) 19 | 20 | with torch.no_grad(): 21 | self.encoding.weight.data.copy_( 22 | 
torch.from_numpy(int2multihot(np.arange(0, num_emb), enc_len, base)).float() 23 | ) 24 | 25 | scale = 1 / (emb_dim * enc_len)**0.5 26 | nn.init.uniform_(self.embedding, -scale, scale) 27 | self.encoding.weight.requires_grad = False 28 | 29 | def forward(self, g, x): 30 | enc = self.encoding(x.squeeze()) 31 | emb = torch.matmul(enc.view(-1, self.embedding.size(0)), self.embedding) 32 | return emb 33 | 34 | @property 35 | def weight(self): 36 | return torch.matmul(self.encoding.weight, self.embedding) 37 | 38 | 39 | class EmbeddingLayer(nn.Module): 40 | def __init__(self, num_emb, emb_dim): 41 | super(EmbeddingLayer, self).__init__() 42 | self.embedding = nn.Embedding(num_emb, emb_dim) 43 | scale = 1 / (emb_dim)**0.5 44 | nn.init.uniform_(self.embedding.weight, -scale, scale) 45 | 46 | def forward(self, g, x): 47 | return self.embedding(x.squeeze()) 48 | 49 | @property 50 | def weight(self): 51 | return self.embedding.weight 52 | 53 | 54 | class EmbeddingLayerAttri(nn.Module): 55 | def __init__(self, attri): 56 | super(EmbeddingLayerAttri, self).__init__() 57 | self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(attri)) 58 | 59 | def forward(self, g, x): 60 | return self.embedding(x.squeeze()) 61 | 62 | @property 63 | def weight(self): 64 | return self.embedding.weight 65 | 66 | 67 | class BaseModel(nn.Module): 68 | def __init__( 69 | self, 70 | node_attri, 71 | rel_attri, 72 | num_nodes, 73 | h_dim, 74 | out_dim, 75 | num_rels, 76 | num_hidden_layers=1, 77 | dropout=0, 78 | use_cuda=False 79 | ): 80 | super(BaseModel, self).__init__() 81 | self.num_nodes = num_nodes 82 | self.h_dim = h_dim 83 | self.out_dim = out_dim 84 | self.num_rels = num_rels 85 | self.num_hidden_layers = num_hidden_layers 86 | self.dropout = dropout 87 | self.use_cuda = use_cuda 88 | 89 | # create conjgcn layers 90 | self.build_model(node_attri, rel_attri) 91 | 92 | def build_model(self, node_attri, rel_attri): 93 | self.node_emb, self.rel_emb = self.build_input_layer(node_attri, rel_attri) 94 | self.layers = nn.ModuleList() 95 | # h2h 96 | for idx in range(self.num_hidden_layers): 97 | h2h = self.build_hidden_layer(idx) 98 | self.layers.append(h2h) 99 | # h2o 100 | h2o = self.build_output_layer() 101 | if h2o is not None: 102 | self.layers.append(h2o) 103 | 104 | def build_input_layer(self, node_attri, rel_attri): 105 | return None, None 106 | 107 | def build_hidden_layer(self, idx): 108 | raise NotImplementedError 109 | 110 | def build_output_layer(self): 111 | return None 112 | 113 | def forward(self, g, h, r, norm): 114 | raise NotImplementedError 115 | 116 | 117 | class RelGraphIso(RelGraphConv): 118 | def __init__( 119 | self, 120 | in_feat, 121 | out_feat, 122 | num_rels, 123 | regularizer="basis", 124 | num_bases=None, 125 | bias=True, 126 | activation=None, 127 | self_loop=True, 128 | low_mem=False, 129 | dropout=0.0, 130 | layer_norm=False 131 | ): 132 | super().__init__( 133 | in_feat, 134 | out_feat, 135 | num_rels, 136 | regularizer=regularizer, 137 | num_bases=num_bases, 138 | bias=bias, 139 | activation=activation, 140 | self_loop=self_loop, 141 | low_mem=low_mem, 142 | dropout=dropout, 143 | layer_norm=layer_norm 144 | ) 145 | self.out_layer = nn.Linear(out_feat, out_feat) 146 | 147 | nn.init.xavier_uniform_(self.out_layer.weight, gain=nn.init.calculate_gain('relu')) 148 | nn.init.zeros_(self.out_layer.bias) 149 | 150 | def forward(self, g, feat, etypes, norm=None): 151 | o = super().forward(g, feat, etypes, norm=None) 152 | o = self.out_layer(o) 153 | if self.activation: 154 | o = 
self.activation(o) 155 | o = self.dropout(o) 156 | 157 | return o 158 | 159 | 160 | class RGIN(BaseModel): 161 | def build_input_layer(self, node_attri, rel_attri): 162 | if node_attri is not None: 163 | return EmbeddingLayerAttri(node_attri), None 164 | return EmbeddingLayer(self.num_nodes, self.h_dim), None 165 | 166 | def build_hidden_layer(self, idx): 167 | if idx == 0: 168 | in_dim = self.h_dim 169 | else: 170 | in_dim = self.out_dim 171 | if idx < self.num_hidden_layers - 1: 172 | act = nn.Tanh() 173 | else: 174 | act = None 175 | return RelGraphIso( 176 | in_dim, 177 | self.out_dim, 178 | self.num_rels, 179 | "basis", 180 | self.num_rels, 181 | activation=act, 182 | self_loop=True, 183 | dropout=self.dropout 184 | ) 185 | 186 | def forward(self, g, h, r, norm): 187 | h = self.node_emb(g, h) 188 | has_norm = False 189 | if "norm" in g.edata: 190 | has_norm = True 191 | norm = g.edata.pop("norm") 192 | for layer in self.layers: 193 | h = layer(g, h, r, None) 194 | if has_norm: 195 | g.edata["norm"] = norm 196 | return h 197 | 198 | 199 | class TrainModel(nn.Module): 200 | def __init__( 201 | self, 202 | node_attri, 203 | num_nodes, 204 | o_dim, 205 | num_rels, 206 | nlabel, 207 | num_hidden_layers=1, 208 | dropout=0, 209 | use_cuda=False, 210 | reg_param=0 211 | ): 212 | super(TrainModel, self).__init__() 213 | 214 | i_dim = o_dim if node_attri is None else node_attri.shape[1] 215 | self.model = RGIN( 216 | node_attri, None, num_nodes, i_dim, o_dim, num_rels * 2, num_hidden_layers, dropout, use_cuda 217 | ) 218 | self.reg_param = reg_param 219 | 220 | if nlabel == 0: 221 | self.supervised = False 222 | self.w_relation = nn.Parameter(torch.Tensor(num_rels, o_dim)) 223 | nn.init.xavier_uniform_(self.w_relation, gain=nn.init.calculate_gain('relu')) 224 | else: 225 | self.supervised = True 226 | self.node_fc = nn.Linear(o_dim, nlabel) 227 | nn.init.xavier_normal_(self.node_fc.weight, gain=nn.init.calculate_gain("sigmoid")) 228 | nn.init.zeros_(self.node_fc.bias) 229 | 230 | # self.edge_fc = nn.Linear(o_dim, num_rels * 2) 231 | self.edge_fc = nn.Linear(o_dim, o_dim) 232 | nn.init.xavier_normal_(self.edge_fc.weight, gain=nn.init.calculate_gain('sigmoid')) 233 | nn.init.zeros_(self.edge_fc.bias) 234 | 235 | def calc_score(self, embedding, triplets): 236 | if isinstance(embedding, (tuple, list)): 237 | node_emb = embedding[0] 238 | else: 239 | node_emb = embedding 240 | s = node_emb[triplets[:, 0]] 241 | r = self.w_relation[triplets[:, 1]] 242 | o = node_emb[triplets[:, 2]] 243 | score = torch.sum(s * r * o, dim=1) 244 | return score 245 | 246 | def forward(self, g, h, edge_type, edge_norm): 247 | output = self.model.forward(g, h, edge_type, edge_norm) 248 | if self.supervised: 249 | if isinstance(output, (tuple, list)): 250 | pred = self.node_fc(output[0]) 251 | else: 252 | pred = self.node_fc(output) 253 | else: 254 | pred = None 255 | 256 | return output, pred 257 | 258 | def unsupervised_regularization_loss(self, embedding, edge_type=None): 259 | reg = torch.mean(self.w_relation.pow(2)) 260 | if isinstance(embedding, (tuple, list)): 261 | for emb in embedding: 262 | reg = reg + torch.mean(emb.pow(2)) 263 | elif isinstance(embedding, torch.Tensor): 264 | reg = reg + torch.mean(embedding.pow(2)) 265 | else: 266 | raise ValueError 267 | if edge_type is not None: 268 | if isinstance(embedding, (tuple, list)): 269 | for emb in embedding: 270 | if emb.size(0) == edge_type.size(0): 271 | mask = edge_type < self.w_relation.size(0) 272 | # reg = reg + F.cross_entropy(self.edge_fc(emb[mask]), 
edge_type[mask]) 273 | emb_diff = self.edge_fc(emb[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 274 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 275 | elif isinstance(embedding, torch.Tensor): 276 | if embedding.size(0) == edge_type.size(0): 277 | mask = edge_type < self.w_relation.size(0) 278 | # reg = reg + F.cross_entropy(self.edge_fc(embedding[mask]), edge_type[mask]) 279 | emb_diff = self.edge_fc(embedding[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 280 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 281 | 282 | return reg 283 | 284 | def get_unsupervised_loss(self, g, embedding, edge_type, triplets, labels): 285 | # triplets is a list of data samples (positive and negative) 286 | # each row in the triplets is a 3-tuple of (source, relation, destination) 287 | score = self.calc_score(embedding, triplets) 288 | predict_loss = F.binary_cross_entropy_with_logits(score, labels) 289 | reg_loss = self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 290 | return predict_loss + self.reg_param * reg_loss 291 | 292 | def supervised_regularization_loss(self, embedding, edge_type=None): 293 | return self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 294 | 295 | def get_supervised_loss(self, g, embedding, edge_type, pred, matched_labels, matched_index, multi): 296 | # triplets is a list of data samples (positive and negative) 297 | # each row in the triplets is a 3-tuple of (source, relation, destination) 298 | if multi: 299 | predict_loss = F.binary_cross_entropy(torch.sigmoid(pred[matched_index]), matched_labels) 300 | else: 301 | predict_loss = F.nll_loss(F.log_softmax(pred[matched_index]), matched_labels) 302 | reg_loss = self.supervised_regularization_loss(embedding, edge_type=edge_type) 303 | return predict_loss + self.reg_param * reg_loss 304 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/README.md: -------------------------------------------------------------------------------- 1 | ## Models: Message-Passing 2 | 3 | **DMPNN: Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching** 4 | ``` 5 | @inproceedings{liu2022graph, 6 | title={Graph convolutional networks with dual message passing for subgraph isomorphism counting and matching}, 7 | author={Liu, Xin and Song, Yangqiu}, 8 | booktitle={AAAI}, 9 | year={2022} 10 | } 11 | ``` 12 | 13 | *Source: https://github.com/HKUST-KnowComp/DualMessagePassing/blob/master/src/rgin.py* 14 | 15 | **R-GIN: Neural subgraph isomorphism counting** 16 | ``` 17 | @inproceedings{liu2020neural, 18 | title={Neural subgraph isomorphism counting}, 19 | author={Liu, Xin and Pan, Haojie and He, Mutian and Song, Yangqiu and Jiang, Xin and Shang, Lifeng}, 20 | booktitle={SIGKDD}, 21 | pages={1959--1969}, 22 | year={2020} 23 | } 24 | ``` 25 | 26 | *Source: https://github.com/HKUST-KnowComp/NeuralSubgraphCounting/blob/master/src/rgin.py* 27 | 28 | **CompGCN: Neural subgraph isomorphism counting** 29 | ``` 30 | @inproceedings{vashishth2019composition, 31 | title={Composition-based multi-relational graph convolutional networks}, 32 | author={Vashishth, Shikhar and Sanyal, Soumya and Nitin, Vikram and Talukdar, Partha}, 33 | booktitle={ICLR}, 34 | year={2020} 35 | } 36 | ``` 37 | 38 | *Source: https://github.com/dmlc/dgl/blob/master/examples/pytorch/compGCN/models.py* 39 | 40 | **R-GCN: Modeling Relational Data with Graph Convolutional Networks** 41 | ``` 42 | 
@inproceedings{schlichtkrull2018modeling, 43 | title={Modeling relational data with graph convolutional networks}, 44 | author={Schlichtkrull, Michael and Kipf, Thomas N and Bloem, Peter and Van Den Berg, Rianne and Titov, Ivan and Welling, Max}, 45 | booktitle={ESWC}, 46 | pages={593--607}, 47 | year={2018}, 48 | organization={Springer} 49 | } 50 | ``` 51 | 52 | *Source: https://github.com/dmlc/dgl/blob/master/examples/pytorch/rgcn/model.py* 53 | 54 | ### Deployment 55 | 56 | This implementation relies on 2 external packages: 57 | - [PyTorch] 58 | - [DGL] 59 | 60 | ### Input 61 | 62 | *Stage 2: Transform* prepares 3 input files stored in ```data/{dataset}```: 63 | - ```node.dat```: This file is only needed for attributed training; each line is formatted as ```{node_id}\t{node_attributes}``` where entries in ```{node_attributes}``` are separated by ```,```. 64 | - ```link.dat```: The first line specifies ```{number_of_nodes} {number_of_link_types}```. Each following line is formatted as ```{head_node_id} {link_type} {tail_node_id}```. 65 | - ```label.dat```: This file is only needed for semi-supervised training. Each line is formatted as ```{node_id}\t{node_label}```. 66 | 67 | ### Run 68 | 69 | Users need to specify the target dataset and the set of training parameters in ```run.sh```.
70 | Run ```bash run.sh``` to start training. 71 | 72 | ### Output 73 | 74 | This implementation generates 1 output file stored in ```data/{dataset}```: 75 | - ```emb.dat```: The first line specifies the parameters used in training. Each following line describes the id and the embeddings of a node. The id and the embeddings are separated by ```\t```. Entries in the embeddings are separated by ``` ```. -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/README.md: -------------------------------------------------------------------------------- 1 | # Unsupervised Node Classification 2 | 3 | This part is modified from [HNE](https://github.com/yangji9181/HNE). 4 | 5 | ## Reproduction 6 | 7 | ### Stage 1: Data 8 | 9 | We conduct experiments on 2 HIN benchmark datasets: ```PubMed``` and ```Yelp```. 10 | Please refer to the ```Data``` folder for more details. 11 | 12 | ### Stage 2: Transform 13 | 14 | This stage transforms a dataset from its original format to the training input format. 15 | 16 | Users need to specify the target dataset, the target model, and the training settings. 17 | 18 | Please refer to the ```Transform``` folder for more details. 19 | 20 | ### Stage 3: Model 21 | 22 | We add ```DMPNN``` and 2 more heterogeneous Message-Passing baseline implementations (```CompGCN``` and ```R-GIN```). 23 | 24 | Please refer to the ```Model``` folder for more details. 25 | 26 | ### Stage 4: Evaluate 27 | 28 | This stage evaluates the output embeddings based on specific tasks. 29 | 30 | Users need to specify the target dataset, the target model, and the evaluation tasks. 31 | 32 | Please refer to the ```Evaluate``` folder for more details. 33 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Transform/README.md: -------------------------------------------------------------------------------- 1 | ## Transform 2 | 3 | This stage transforms a dataset from its original format to the training input format. 4 | 5 | Users need to specify the following parameters in ```transform.sh```: 6 | - **dataset**: choose from ```PubMed``` and ```Yelp```; 7 | - **model**: choose from ```DMPNN```, ```CompGCN```, and ```R-GIN``` (more baselines can be found [here](https://github.com/yangji9181/HNE/tree/master/Model)); 8 | - **attributed**: choose ```False``` for unattributed training; 9 | - **supervised**: choose ```False``` for unsupervised training. 10 | 11 | Run ```bash transform.sh``` to complete *Stage 2: Transform*. 12 | 13 | We also generate a seed-node file (```seed_node.dat```, stored in the same folder as ```node.dat```) that lists the ids of all labeled nodes and of the nodes appearing in the links to be predicted, as sketched below.
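For illustration only, a minimal sketch of the generated files (all node ids and link types below are made up): a toy ```link.dat``` for a graph with 6 nodes and 2 link types starts with the ```{number_of_nodes} {number_of_link_types}``` header and then lists one ```{head_node_id} {link_type} {tail_node_id}``` triple per line:

```
6 2
0 0 1
1 1 3
2 0 5
```

The corresponding ```seed_node.dat``` is read by the training scripts when present and simply lists one node id per line:

```
1
3
5
```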
-------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Transform/transform.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transform_model import * 3 | 4 | 5 | def parse_args(): 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-dataset', required=True, type=str, help='Targeting dataset.', 9 | choices=['DBLP','Freebase','PubMed','Yelp']) 10 | parser.add_argument('-model', required=True, type=str, help='Targeting model.', 11 | choices=['metapath2vec-ESim','PTE','HIN2Vec','AspEm','HEER','R-GCN','HAN','MAGNN','HGT','TransE','DistMult','ComplEx','ConvE','R-GIN','CompGCN','DMPNN']) 12 | parser.add_argument('-attributed', required=True, type=str, help='Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support attributed training.', 13 | choices=['True','False']) 14 | parser.add_argument('-supervised', required=True, type=str, help='Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support semi-supervised training.', 15 | choices=['True','False']) 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def check(args): 21 | 22 | if args.attributed=='True': 23 | if args.model not in ['DMPNN', 'CompGCN', 'R-GIN', 'R-GCN','HAN', 'MAGNN', 'HGT']: 24 | print(f'{args.model} does not support attributed training!') 25 | print('Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support attributed training!') 26 | return False 27 | if args.dataset not in ['DBLP', 'PubMed']: 28 | print(f'{args.dataset} does not support attributed training!') 29 | print('Only DBLP and PubMed support attributed training!') 30 | return False 31 | 32 | if args.supervised=='True': 33 | if args.model not in ['DMPNN', 'CompGCN', 'R-GIN', 'R-GCN','HAN', 'MAGNN', 'HGT']: 34 | print(f'{args.model} does not support semi-supervised training!') 35 | print('Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support semi-supervised training!') 36 | return False 37 | 38 | return True 39 | 40 | 41 | def main(): 42 | 43 | args = parse_args() 44 | 45 | if not check(args): 46 | return 47 | 48 | print('Transforming {} to {} input format for {}, {} training!' 
49 | .format(args.dataset, args.model, 50 | 'attributed' if args.attributed=='True' else 'unattributed', 51 | 'semi-supervised' if args.supervised=='True' else 'unsupervised')) 52 | 53 | if args.model=='metapath2vec-ESim': metapath2vec_esim_convert(args.dataset) 54 | elif args.model=='PTE': pte_convert(args.dataset) 55 | elif args.model=='HIN2Vec': hin2vec_convert(args.dataset) 56 | elif args.model=='AspEm': aspem_convert(args.dataset) 57 | elif args.model=='HEER': heer_convert(args.dataset) 58 | elif args.model=='R-GCN': rgcn_convert(args.dataset, args.attributed, args.supervised) 59 | elif args.model=='HAN': han_convert(args.dataset, args.attributed, args.supervised) 60 | elif args.model=='MAGNN': magnn_convert(args.dataset, args.attributed, args.supervised) 61 | elif args.model=='HGT': hgt_convert(args.dataset, args.attributed, args.supervised) 62 | elif args.model=='TransE': transe_convert(args.dataset) 63 | elif args.model=='DistMult': distmult_convert(args.dataset) 64 | elif args.model=='ComplEx': complex_convert(args.dataset) 65 | elif args.model=='ConvE': conve_convert(args.dataset) 66 | elif args.model=='R-GIN': rgin_convert(args.dataset, args.attributed, args.supervised) 67 | elif args.model=='CompGCN': compgcn_convert(args.dataset, args.attributed, args.supervised) 68 | elif args.model=='DMPNN': dmpnn_convert(args.dataset, args.attributed, args.supervised) 69 | 70 | print('Data transformation finished!') 71 | 72 | return 73 | 74 | 75 | if __name__=='__main__': 76 | main() -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Transform/transform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: Only 'DMPNN', 'CompGCN', 'R-GIN', 'R-GCN', 'HAN', 'MAGNN', and 'HGT' support attributed='True' or supervised='True' 4 | # Note: Only 'DBLP' and 'PubMed' contain node attributes. 5 | 6 | dataset='PubMed' # choose from 'DBLP', 'Yelp', 'Freebase', and 'PubMed' 7 | model='R-GCN' # choose from 'metapath2vec-ESim', 'PTE', 'HIN2Vec', 'AspEm', 'HEER', 'R-GCN', 'HAN', 'MAGNN', 'HGT', 'TransE', 'DistMult', 'ComplEx', 'ConvE', 'R-GIN', 'CompGCN', and 'DMPNN' 8 | attributed='False' # choose 'True' or 'False' 9 | supervised='False' # choose 'True' or 'False' 10 | 11 | for dataset in 'PubMed' 'Yelp' 12 | do 13 | for model in 'R-GCN' 'DMPNN' 'CompGCN' 'R-GIN' 14 | do 15 | mkdir -p ../Model/${model}/data 16 | mkdir -p ../Model/${model}/data/${dataset} 17 | python transform.py -dataset ${dataset} -model ${model} -attributed ${attributed} -supervised ${supervised} 18 | done 19 | done 20 | 21 | 22 | --------------------------------------------------------------------------------