├── .gitignore ├── LICENSE ├── README.md ├── SubgraphCountingMatching ├── README.md ├── config.py ├── constants.py ├── dataset.py ├── evaluate.py ├── models │ ├── __init__.py │ ├── basemodel.py │ ├── cnn.py │ ├── compgcn.py │ ├── container.py │ ├── dmplrp.py │ ├── dmpnn.py │ ├── embed.py │ ├── filter.py │ ├── lrp.py │ ├── pred.py │ ├── rgcn.py │ ├── rgin.py │ ├── rnn.py │ └── txl.py ├── train.py └── utils │ ├── __init__.py │ ├── act.py │ ├── anneal.py │ ├── cyclical.py │ ├── dl.py │ ├── graph.py │ ├── init.py │ ├── io.py │ ├── log.py │ ├── sampler.py │ └── scheduler.py └── UnsupervisedNodeClassification ├── Data └── README.md ├── Evaluate ├── README.md ├── evaluate.py ├── evaluate.sh ├── link_prediction.py ├── node_classification.py └── utils.py ├── Model ├── CompGCN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py ├── DMPNN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py ├── R-GCN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py ├── R-GIN │ ├── run.sh │ └── src │ │ ├── main.py │ │ ├── model.py │ │ └── utils.py └── README.md ├── README.md └── Transform ├── README.md ├── transform.py ├── transform.sh └── transform_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Data 2 | Data 3 | data 4 | dumps 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | .vscode 113 | .idea 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Sean Liu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching 2 | 3 | This repository is an official implementation of the paper Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching. 4 | 5 | ## Introduction 6 | 7 | We propose dual message passing neural networks (DMPNNs) to enhance the substructure representation learning in an asynchronous way for subgraph isomorphism counting and matching as well as unsupervised node classification. 8 | 9 | ## Reproduction 10 | 11 | ### Package Dependencies 12 | * tqdm 13 | * numpy 14 | * pandas 15 | * scipy 16 | * numba >= 0.54.0 17 | * python-igraph == 0.9.11 18 | * torch >= 1.7.0 19 | * dgl >= 0.6.0 20 | 21 | Please refer to `SubgraphCountingMatching` and `UnsupervisedNodeClassification` for detailed reproduction instructions. 22 | 23 | 24 | ### Citation 25 | ```bibtex 26 | @inproceedings{liu2022graph, 27 | author = {Xin Liu and Yangqiu Song}, 28 | title = {Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching}, 29 | booktitle = {AAAI}, 30 | year = {2022} 31 | } 32 | ``` 33 | 34 | ### Miscellaneous 35 | Please send any questions about the code and/or the algorithm to .
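The dependencies in "Package Dependencies" above can be installed with pip, for example. The commands below are only a sketch (package names and versions mirror the list above; for GPU training, substitute the CUDA-matched builds of `torch` and `dgl`):

```bash
# Sketch of an environment setup -- versions follow the "Package Dependencies" list above.
# Swap torch/dgl for the CUDA-specific wheels matching your driver if you train on GPU.
pip install tqdm numpy pandas scipy "numba>=0.54.0" "python-igraph==0.9.11"
pip install "torch>=1.7.0" "dgl>=0.6.0"
```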
36 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/README.md: -------------------------------------------------------------------------------- 1 | # Subgraph Isomorphism Counting and Matching 2 | 3 | This part is modified from [NeuralSubgraphCounting](https://github.com/HKUST-KnowComp/NeuralSubgraphCounting) 4 | 5 | ## Reproduction 6 | 7 | ### Stage 1: Download 8 | 9 | We conduct experiments on 4 subgraph isomorphism benchmark datasets: ```Erdos-Renyi```, ```Regular```, ```Complex```, and ```MUTAG```. 10 | 11 | Please download data from [OneDrive](https://hkustconnect-my.sharepoint.com/:f:/g/personal/xliucr_connect_ust_hk/ErzdTZguJnFBok2QKUr3yAYBHaReWOYOOAEca0uGzgBlyQ?e=UTs21h). 12 | 13 | ### Stage 2: Training 14 | 15 | We add ```DMPNN``` and ```CompGCN``` for heterogeneous Message-Passing implementations. 16 | We also add ```DMPLRP``` and ```LRP``` for local relational pooling implementations. 17 | 18 | * In order to add reversed edges, please set `--add_rev True`. 19 | * In order to jointly learn counting and matching, please set `--node_pred True --match_weights node` (for graph models), `--edge_pred True --match_weights edge` (for sequence models), or `--node_pred True --edge_pred True --match_weights node,edge` (for CompGCN, DMPNN, and DMPLRP, but no further improvement). 20 | 21 | ##### For Erdos-Renyi 22 | ```bash 23 | python train.py \ 24 | --pattern_dir data/Erdos-Renyi/patterns \ 25 | --graph_dir data/Erdos-Renyi/graphs \ 26 | --metadata_dir data/Erdos-Renyi/metadata \ 27 | --save_data_dir data/Erdos-Renyi/datasets \ 28 | --save_model_dir dumps/Erdos-Renyi \ 29 | --add_rev True \ 30 | --hid_dim 64 --node_pred True --edge_pred False \ 31 | --match_weights node \ 32 | --enc_net Multihot --enc_base 2 \ 33 | --emb_net Equivariant --share_emb_net True \ 34 | --rep_net DMPNN \ 35 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 36 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 37 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 38 | --max_npv 4 --max_npe 10 --max_npvl 1 --max_npel 1 \ 39 | --max_ngv 10 --max_nge 48 --max_ngvl 1 --max_ngel 1 \ 40 | --train_grad_steps 1 --train_batch_size 64 \ 41 | --train_log_steps 10 --eval_batch_size 64 \ 42 | --lr 1e-3 --train_epochs 100 \ 43 | --seed 0 --gpu_id 0 44 | ``` 45 | 46 | ##### For Regular 47 | ```bash 48 | python train.py \ 49 | --pattern_dir data/Regular/patterns \ 50 | --graph_dir data/Regular/graphs \ 51 | --metadata_dir data/Regular/metadata \ 52 | --save_data_dir data/Regular/datasets \ 53 | --save_model_dir dumps/Regular \ 54 | --add_rev True \ 55 | --hid_dim 64 --node_pred True --edge_pred False \ 56 | --match_weights node \ 57 | --enc_net Multihot --enc_base 2 \ 58 | --emb_net Equivariant --share_emb_net True \ 59 | --rep_net DMPNN \ 60 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 61 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 62 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 63 | --max_npv 4 --max_npe 10 --max_npvl 1 --max_npel 1 \ 64 | --max_ngv 30 --max_nge 90 --max_ngvl 1 --max_ngel 1 \ 65 | --train_grad_steps 1 --train_batch_size 64 \ 66 | --train_log_steps 10 --eval_batch_size 64 \ 67 | --lr 1e-3 --train_epochs 100 \ 68 | --seed 0 --gpu_id 0 69 | ``` 70 | 71 | ##### For Complex 72 | ```bash 73 | python train.py \ 74 | --pattern_dir data/Complex/patterns \ 75 | --graph_dir data/Complex/graphs \ 76 | --metadata_dir data/Complex/metadata_withoutloop \ 77 | --save_data_dir
data/Complex/datasets \ 78 | --save_model_dir dumps/Complex \ 79 | --add_rev True \ 80 | --hid_dim 64 --node_pred True --edge_pred False \ 81 | --match_weights node \ 82 | --enc_net Multihot --enc_base 2 \ 83 | --emb_net Equivariant --share_emb_net True \ 84 | --rep_net DMPNN \ 85 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 86 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 87 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 88 | --max_npv 8 --max_npe 8 --max_npvl 8 --max_npel 8 \ 89 | --max_ngv 64 --max_nge 256 --max_ngvl 16 --max_ngel 16 \ 90 | --train_grad_steps 1 --train_batch_size 512 \ 91 | --train_log_steps 100 --eval_batch_size 512 \ 92 | --lr 1e-3 --train_epochs 100 \ 93 | --seed 0 --gpu_id 0 94 | ``` 95 | 96 | ##### For MUTAG 97 | ```bash 98 | python train.py \ 99 | --pattern_dir data/MUTAG/patterns \ 100 | --graph_dir data/MUTAG/graphs \ 101 | --metadata_dir data/MUTAG/metadata \ 102 | --save_data_dir data/MUTAG/datasets \ 103 | --save_model_dir dumps/MUTAG \ 104 | --add_rev True \ 105 | --hid_dim 64 --node_pred True --edge_pred False \ 106 | --match_weights node \ 107 | --enc_net Multihot --enc_base 2 \ 108 | --emb_net Equivariant --share_emb_net True \ 109 | --rep_net DMPNN \ 110 | --rep_num_pattern_layers 3 --rep_num_graph_layers 3 \ 111 | --rep_residual True --rep_dropout 0.0 --share_rep_net True \ 112 | --pred_net SumPredictNet --pred_hid_dim 64 --pred_dropout 0.0 \ 113 | --max_npv 4 --max_npe 3 --max_npvl 2 --max_npel 2 \ 114 | --max_ngv 28 --max_nge 66 --max_ngvl 7 --max_ngel 4 \ 115 | --train_grad_steps 1 --train_batch_size 32 \ 116 | --train_log_steps 10 --eval_batch_size 32 \ 117 | --lr 1e-3 --train_epochs 200 \ 118 | --seed 0 --gpu_id 0 119 | ``` 120 | 121 | ### Stage 3: Evaluation 122 | 123 | ```bash 124 | python evaluate.py \ 125 | --pattern_dir data/MUTAG/patterns \ 126 | --graph_dir data/MUTAG/graphs \ 127 | --metadata_dir data/MUTAG/metadata \ 128 | --save_data_dir data/MUTAG/datasets \ 129 | --load_model_dir dumps/MUTAG/DMPNN_SumPredictNet_2021_12_09_14_11_52 \ 130 | --eval_batch_size 64 131 | ``` 132 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/constants.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import os 3 | import re 4 | 5 | INF = 1e30 6 | _INF = -1e30 7 | EPS = 1e-8 8 | PI = 3.141592653589793 9 | 10 | LEAKY_RELU_A = 1 / 5.5 11 | 12 | LOOPFLAG = "is_loop" 13 | REVFLAG = "is_reversed" 14 | NORM = "norm" 15 | INDEGREE = "in_deg" 16 | INNORM = "in_norm" 17 | OUTDEGREE = "out_deg" 18 | OUTNORM = "out_norm" 19 | NODEID = "id" 20 | EDGEID = "id" 21 | NODELABEL = "label" 22 | EDGELABEL = "label" 23 | NODEEIGENV = "node_eigenv" 24 | EDGEEIGENV = "edge_eigenv" 25 | NODEFEAT = "node_feat" 26 | EDGEFEAT = "edge_feat" 27 | NODETYPE = "node_type" 28 | EDGETYPE = "edge_type" 29 | NODEMSG = "node_msg" 30 | EDGEMSG = "edge_msg" 31 | NODEAGG = "node_agg" 32 | EDGEAGG = "edge_agg" 33 | NODEOUTPUT = "node_out" 34 | EDGEOUTPUT = "edge_out" 35 | 36 | INIT_STEPS = 600 37 | SCHEDULE_STEPS = 10000 38 | NUM_CYCLES = 2 39 | MIN_PERCENT = 1e-3 40 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/evaluate.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import gc 3 | import math 4 | import numpy as np 5 | import os 6 | import pickle 7 | import random 8 | import time 9 | import torch as th 10 | 
import torch.nn as nn 11 | import torch.nn.functional as F 12 | import warnings 13 | from collections import OrderedDict 14 | from copy import deepcopy 15 | from functools import partial 16 | from itertools import chain 17 | from sklearn.metrics import roc_auc_score 18 | from tensorboardX import SummaryWriter 19 | from torch.utils.data import DataLoader 20 | from constants import * 21 | from dataset import * 22 | 23 | from torch.optim import AdamW 24 | 25 | from config import get_eval_config 26 | from utils.graph import compute_norm, compute_largest_eigenvalues, convert_to_dual_graph, get_dual_subisomorphisms 27 | from utils.log import init_logger, close_logger, generate_log_line, generate_best_line, get_best_epochs 28 | from utils.io import load_data, load_config, save_config, save_results 29 | from utils.scheduler import map_scheduler_str_to_scheduler 30 | from utils.sampler import BucketSampler, CircurriculumSampler 31 | from utils.anneal import anneal_fn 32 | from utils.cyclical import cyclical_fn 33 | from models import * 34 | 35 | from train import process_model_config, load_model 36 | from train import load_edgeseq_datasets, load_graphadj_datasets 37 | from train import remove_loops, add_reversed_edges, convert_to_dual_data 38 | from train import calculate_degrees, calculate_norms, calculate_eigenvalues 39 | from train import evaluate_epoch 40 | 41 | warnings.filterwarnings("ignore") 42 | 43 | 44 | if __name__ == "__main__": 45 | config = get_eval_config() 46 | 47 | random.seed(config["seed"]) 48 | th.manual_seed(config["seed"]) 49 | np.random.seed(config["seed"]) 50 | 51 | ts = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") 52 | if not config["load_model_dir"] or not os.path.exists(config["load_model_dir"]): 53 | raise FileNotFoundError 54 | model_name = "_".join(os.path.split(config["load_model_dir"])[1].split("_")[:2]) 55 | 56 | if config["save_data_dir"]: 57 | os.makedirs(config["save_data_dir"], exist_ok=True) 58 | 59 | # set device 60 | if isinstance(config["gpu_id"], int) and config["gpu_id"] >= 0: 61 | device = th.device("cuda:%d" % (config["gpu_id"])) 62 | else: 63 | device = th.device("cpu") 64 | 65 | # set logger and writer 66 | logger = init_logger(log_file=os.path.join(config["load_model_dir"], "eval_log_%s.txt" % (ts)), log_tag=model_name) 67 | logger.info("evaluation config: ", str(config)) 68 | 69 | # create/load model 70 | model, best_epochs = load_model(config["load_model_dir"]) 71 | for metric, epochs in best_epochs.items(): 72 | for data_type in epochs: 73 | logger.info( 74 | generate_best_line( 75 | data_type, 76 | epochs[data_type][0], 77 | epochs[data_type][0], 78 | **{ 79 | metric: "{:.3f}".format(epochs[data_type][1]) 80 | } 81 | ) 82 | ) 83 | model_config = load_config(os.path.join(config["load_model_dir"], "config.json"), as_dict=True) 84 | for k, v in model_config.items(): 85 | if k not in config: 86 | config[k] = v 87 | model.expand(pred_return_weights=config["match_weights"], **process_model_config(config)) 88 | model = model.to(device) 89 | logger.info(model) 90 | logger.info("number of parameters: %d" % (sum(p.numel() for p in model.parameters() if p.requires_grad))) 91 | 92 | # load data 93 | if config["rep_net"] in ["CNN", "RNN", "TXL"]: 94 | datasets = load_edgeseq_datasets( 95 | pattern_dir=config["pattern_dir"], 96 | graph_dir=config["graph_dir"], 97 | metadata_dir=config["metadata_dir"], 98 | save_data_dir=config["save_data_dir"], 99 | num_workers=config["num_workers"], 100 | logger=logger 101 | ) 102 | else: 103 | datasets = 
load_graphadj_datasets( 104 | pattern_dir=config["pattern_dir"], 105 | graph_dir=config["graph_dir"], 106 | metadata_dir=config["metadata_dir"], 107 | save_data_dir=config["save_data_dir"], 108 | num_workers=config["num_workers"], 109 | logger=logger 110 | ) 111 | 112 | # remove loops 113 | if "withoutloop" in config["metadata_dir"] or "withoutloop" in config["save_data_dir"]: 114 | for data_type in datasets: 115 | remove_loops(datasets[data_type]) 116 | 117 | max_ngv = config["max_ngv"] 118 | max_nge = config["max_nge"] 119 | max_ngvl = config["max_ngvl"] 120 | max_ngel = config["max_ngel"] 121 | if config["share_emb_net"]: 122 | max_npv = max_ngv 123 | max_npe = max_nge 124 | max_npvl = max_ngvl 125 | max_npel = max_ngel 126 | else: 127 | max_npv = config["max_npv"] 128 | max_npe = config["max_npe"] 129 | max_npvl = config["max_npvl"] 130 | max_npel = config["max_npel"] 131 | 132 | # compute the p_len and g_len for original data 133 | for data_type in datasets: 134 | if isinstance(datasets[data_type], EdgeSeqDataset): 135 | for x in datasets[data_type]: 136 | x["g_len"] = len(x["graph"]) 137 | x["p_len"] = len(x["pattern"]) 138 | elif isinstance(datasets[data_type], GraphAdjDataset): 139 | for x in datasets[data_type]: 140 | x["g_len"] = len(x["graph"]) 141 | x["p_len"] = len(x["pattern"]) 142 | if NODEID not in x["graph"].ndata: 143 | x["graph"].ndata[NODEID] = th.arange(x["graph"].number_of_nodes()) 144 | if EDGEID not in x["graph"].edata: 145 | x["graph"].edata[EDGEID] = th.arange(x["graph"].number_of_edges()) 146 | if NODEID not in x["pattern"].ndata: 147 | x["pattern"].ndata[NODEID] = th.arange(x["pattern"].number_of_nodes()) 148 | if EDGEID not in x["pattern"].edata: 149 | x["pattern"].edata[EDGEID] = th.arange(x["pattern"].number_of_edges()) 150 | 151 | # add E reversed edges 152 | if config["add_rev"]: 153 | if logger: 154 | logger.info("adding reversed edges...") 155 | for data_type in datasets: 156 | add_reversed_edges(datasets[data_type], max_npe, max_npel, max_nge, max_ngel) 157 | max_npe *= 2 158 | max_npel *= 2 159 | max_nge *= 2 160 | max_ngel *= 2 161 | 162 | # convert graphs to conj_graphs 163 | if config["convert_dual"]: 164 | if logger: 165 | logger.info("converting dual graphs and isomorphisms...") 166 | for data_type in datasets: 167 | convert_to_dual_data(datasets[data_type]) 168 | avg_gd = math.ceil(max_nge / max_ngv) 169 | avg_pd = math.ceil(max_npe / max_npv) 170 | 171 | max_ngv, max_nge = max_nge, (avg_gd * avg_gd) * max_ngv - max_ngv 172 | max_npv, max_npe = max_npe, (avg_pd * avg_pd) * max_npv - max_npv 173 | max_ngvl, max_ngel = max_ngel, max_ngvl 174 | max_npvl, max_npel = max_npel, max_npvl 175 | 176 | # calculate the degrees, norms, and lambdas 177 | if logger: 178 | logger.info("calculating degress...") 179 | for data_type in datasets: 180 | calculate_degrees(datasets[data_type]) 181 | calculate_norms(datasets[data_type], self_loop=True) 182 | calculate_eigenvalues(datasets[data_type]) 183 | 184 | if config["rep_net"].endswith("LRP"): 185 | lrp_datasets = OrderedDict() 186 | share_memory = "small" not in config["graph_dir"] 187 | cache = dict() if share_memory else None 188 | for data_type in datasets: 189 | LRPDataset.seq_len = config["lrp_seq_len"] 190 | lrp_datasets[data_type] = LRPDataset( 191 | datasets[data_type], 192 | cache=cache, 193 | num_workers=config["num_workers"], 194 | share_memory=share_memory 195 | ) 196 | for x in lrp_datasets[data_type]: 197 | x["g_len"] = len(x["graph"]) 198 | x["p_len"] = len(x["pattern"]) 199 | del cache 200 | del 
datasets 201 | gc.collect() 202 | datasets = lrp_datasets 203 | 204 | # set records 205 | eval_metrics = {"train": None, "dev": None, "test": None} 206 | 207 | logger.info("-" * 80) 208 | for data_type, dataset in datasets.items(): 209 | sampler = BucketSampler( 210 | dataset, 211 | group_by=["g_len", "p_len"], 212 | batch_size=config["eval_batch_size"], 213 | shuffle=False, 214 | seed=config["seed"], 215 | drop_last=False 216 | ) 217 | data_loader = DataLoader( 218 | dataset, 219 | batch_sampler=sampler, 220 | collate_fn=partial(dataset.batchify, return_weights=config["match_weights"]), 221 | ) 222 | eval_metric, eval_results = evaluate_epoch( 223 | model, data_type, data_loader, device, config, 0, logger, None 224 | ) 225 | save_results( 226 | eval_results, os.path.join(config["load_model_dir"], "eval_%s_results_%s.json" % (data_type, ts)) 227 | ) 228 | 229 | eval_metrics[data_type] = eval_metric 230 | 231 | for data_type in eval_metrics: 232 | if eval_metrics[data_type] is not None: 233 | logger.info( 234 | generate_best_line( 235 | data_type, 236 | 0, 237 | 0, 238 | **{ 239 | "eval-" + config["eval_metric"]: "{:.3f}".format(eval_metrics[data_type]) 240 | } 241 | ) 242 | ) 243 | logger.info("=" * 80) 244 | 245 | close_logger(logger) 246 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .basemodel import BaseModel, EdgeSeqModel, GraphAdjModel, GraphAdjModelV2 2 | from .cnn import CNN 3 | from .rnn import RNN 4 | from .txl import TransformerXL 5 | from .rgcn import RGCN 6 | from .rgin import RGIN 7 | from .compgcn import CompGCN 8 | from .dmpnn import DMPNN 9 | from .lrp import LRP 10 | from .dmplrp import DMPLRP 11 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/cnn.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .basemodel import EdgeSeqModel 6 | from .container import * 7 | # from ..utils.act import map_activation_str_to_layer 8 | # from ..utils.init import init_weight, init_module 9 | from utils.act import map_activation_str_to_layer 10 | from utils.init import init_weight, init_module 11 | 12 | 13 | class CNNLayer(nn.Module): 14 | def __init__( 15 | self, 16 | in_channels, 17 | out_channels, 18 | kernel_size, 19 | padding=-1, 20 | stride=1, 21 | groups=1, 22 | dilation=1, 23 | batch_norm=True, 24 | act_func="relu", 25 | dropout=0.0 26 | ): 27 | super(CNNLayer, self).__init__() 28 | self.in_channels = in_channels 29 | self.out_channels = out_channels 30 | if padding == -1: 31 | padding = kernel_size//2 32 | 33 | self.conv = nn.Conv1d( 34 | in_channels, out_channels, 35 | kernel_size=kernel_size, padding=padding, 36 | stride=stride, groups=groups, dilation=dilation 37 | ) 38 | self.act = map_activation_str_to_layer(act_func, inplace=True) 39 | self.pool = nn.MaxPool1d( 40 | kernel_size=kernel_size//stride, stride=1, padding=padding 41 | ) 42 | if batch_norm: 43 | self.bn = nn.BatchNorm1d(out_channels) 44 | else: 45 | self.bn = None 46 | self.drop = nn.Dropout(dropout) 47 | 48 | # init 49 | init_module(self.conv, init="normal", activation=act_func) 50 | 51 | def forward(self, x): 52 | o = self.conv(x) 53 | o = self.act(o) 54 | o = self.pool(o) 55 | if self.bn is not None: 56 | o = self.bn(o) 57 | o = self.drop(o) 58 | 59 
| return o 60 | 61 | def get_output_dim(self): 62 | return self.out_channels 63 | 64 | def extra_repr(self): 65 | "" 66 | 67 | 68 | class CNN(EdgeSeqModel): 69 | def __init__(self, **kw): 70 | super(CNN, self).__init__(**kw) 71 | 72 | def create_rep_net(self, type, **kw): 73 | if type == "graph": 74 | num_layers = kw.get("rep_num_graph_layers", 1) 75 | elif type == "pattern": 76 | if self.share_rep_net: 77 | return self.g_rep_net 78 | num_layers = kw.get("rep_num_pattern_layers", 1) 79 | act_func = kw.get("rep_act_func", "relu") 80 | dropout = kw.get("rep_dropout", 0.0) 81 | 82 | batch_norm = kw.get("rep_cnn_batch_norm", True) 83 | kernel_sizes = kw.get("rep_cnn_kernel_sizes", 2) 84 | paddings = kw.get("rep_cnn_paddings", -1) 85 | strides = kw.get("rep_cnn_strides", 1) 86 | 87 | if isinstance(kernel_sizes, int): 88 | kernel_sizes = [kernel_sizes] * num_layers 89 | if isinstance(paddings, int): 90 | paddings = [paddings] * num_layers 91 | if isinstance(strides, int): 92 | strides = [strides] * num_layers 93 | 94 | cnn = ModuleList() 95 | for i in range(num_layers): 96 | cnn.add_module( 97 | "%s_cnn_(%d)" % (type, i), 98 | CNNLayer( 99 | self.hid_dim, 100 | self.hid_dim, 101 | kernel_size=kernel_sizes[i], 102 | padding=paddings[i], 103 | stride=strides[i], 104 | batch_norm=batch_norm, 105 | act_func=act_func, 106 | dropout=dropout 107 | ) 108 | ) 109 | 110 | return ModuleDict({"cnn": cnn}) 111 | 112 | def get_pattern_rep(self, p_emb, mask=None): 113 | if mask is None: 114 | outputs = [p_emb.transpose(1, 2)] 115 | for layer in self.p_rep_net["cnn"]: 116 | o = layer(outputs[-1]) 117 | if self.rep_residual and o.size() == outputs[-1].size(): 118 | outputs.append(outputs[-1] + o) 119 | else: 120 | outputs.append(o) 121 | for i in range(len(outputs)): 122 | outputs[i] = outputs[i].transpose(1, 2) 123 | else: 124 | gate = mask.float().transpose(1, 2) 125 | outputs = [p_emb.transpose(1, 2) * gate] 126 | for layer in self.p_rep_net["cnn"]: 127 | gate = F.max_pool1d( 128 | gate, 129 | kernel_size=layer.conv.kernel_size, 130 | stride=layer.conv.stride, 131 | padding=layer.conv.padding, 132 | dilation=layer.conv.dilation 133 | ) 134 | gate = F.max_pool1d( 135 | gate, 136 | kernel_size=layer.pool.kernel_size, 137 | stride=layer.pool.stride, 138 | padding=layer.pool.padding, 139 | dilation=layer.pool.dilation 140 | ) 141 | o = layer(outputs[-1]) 142 | o = o * gate 143 | if self.rep_residual and o.size() == outputs[-1].size(): 144 | outputs.append(outputs[-1] + o) 145 | else: 146 | outputs.append(o) 147 | for i in range(len(outputs)): 148 | outputs[i] = outputs[i].transpose(1, 2) 149 | 150 | return outputs[-1] 151 | 152 | def get_graph_rep(self, g_emb, mask=None, gate=None): 153 | if mask is None and gate is None: 154 | outputs = [g_emb.transpose(1, 2)] 155 | for layer in self.g_rep_net["cnn"]: 156 | o = layer(outputs[-1]) 157 | if self.rep_residual and o.size() == outputs[-1].size(): 158 | outputs.append(outputs[-1] + o) 159 | else: 160 | outputs.append(o) 161 | for i in range(len(outputs)): 162 | outputs[i] = outputs[i].transpose(1, 2) 163 | else: 164 | gate = ((mask.float() if mask is not None else 1) * (gate if gate is not None else 1)).transpose(1, 2) 165 | outputs = [g_emb.transpose(1, 2) * gate] 166 | for layer in self.g_rep_net["cnn"]: 167 | gate = F.max_pool1d( 168 | gate, 169 | kernel_size=layer.conv.kernel_size, 170 | stride=layer.conv.stride, 171 | padding=layer.conv.padding, 172 | dilation=layer.conv.dilation 173 | ) 174 | gate = F.max_pool1d( 175 | gate, 176 | 
kernel_size=layer.pool.kernel_size, 177 | stride=layer.pool.stride, 178 | padding=layer.pool.padding, 179 | dilation=layer.pool.dilation 180 | ) 181 | o = layer(outputs[-1]) 182 | o = o * gate 183 | if self.rep_residual and o.size() == outputs[-1].size(): 184 | outputs.append(outputs[-1] + o) 185 | else: 186 | outputs.append(o) 187 | for i in range(len(outputs)): 188 | outputs[i] = outputs[i].transpose(1, 2) 189 | 190 | return outputs[-1] 191 | 192 | def refine_edge_weights(self, weights, use_max=False): 193 | if weights is None: 194 | return None 195 | dim = weights.dim() 196 | dtype = weights.dtype 197 | if dim == 2: 198 | weights = weights.unsqueeze(-1) 199 | weights = weights.transpose(1, 2).float() 200 | if use_max: 201 | for layer in self.g_rep_net["cnn"]: 202 | if isinstance(layer, CNNLayer): 203 | weights = F.max_pool1d( 204 | weights, 205 | kernel_size=layer.conv.kernel_size, 206 | stride=layer.conv.stride, 207 | padding=layer.conv.padding, 208 | dilation=layer.conv.dilation, 209 | ) 210 | weights = F.max_pool1d( 211 | weights, 212 | kernel_size=layer.pool.kernel_size, 213 | stride=layer.pool.stride, 214 | padding=layer.pool.padding, 215 | dilation=layer.pool.dilation, 216 | ) 217 | else: 218 | for layer in self.g_rep_net["cnn"]: 219 | if isinstance(layer, CNNLayer): 220 | weights = sum(layer.conv.kernel_size) * F.avg_pool1d( 221 | weights, 222 | kernel_size=layer.conv.kernel_size, 223 | stride=layer.conv.stride, 224 | padding=layer.conv.padding, 225 | # dilation=layer.conv.dilation, 226 | ) 227 | weights = F.max_pool1d( 228 | weights, 229 | kernel_size=layer.pool.kernel_size, 230 | stride=layer.pool.stride, 231 | padding=layer.pool.padding, 232 | dilation=layer.pool.dilation, 233 | ) 234 | weights = weights.transpose(1, 2) 235 | if dim == 2: 236 | weights = weights.squeeze(-1) 237 | return weights.to(dtype) 238 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/dmpnn.py: -------------------------------------------------------------------------------- 1 | import dgl.function as fn 2 | import torch as th 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .basemodel import GraphAdjModelV2 7 | from .container import * 8 | # from ..constants import * 9 | # from ..utils.act import map_activation_str_to_layer 10 | # from ..utils.init import init_weight, init_module 11 | from constants import * 12 | from utils.act import map_activation_str_to_layer 13 | from utils.init import init_weight, init_module 14 | 15 | 16 | class DMPLayer(nn.Module): 17 | def __init__( 18 | self, 19 | input_dim, 20 | hidden_dim, 21 | init_neigenv=4.0, # empirical value of triangles 22 | init_eeigenv=4.0, # empirical value of triangles 23 | bias=True, 24 | num_mlp_layers=2, 25 | batch_norm=True, 26 | act_func="relu", 27 | dropout=0.0 28 | ): 29 | super(DMPLayer, self).__init__() 30 | self.input_dim = input_dim 31 | self.hidden_dim = hidden_dim 32 | 33 | self.in_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 34 | self.out_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 35 | self.src_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 36 | self.dst_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 37 | self.nloop_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 38 | self.eloop_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 39 | if bias: 40 | self.nbias = nn.Parameter(th.Tensor(hidden_dim)) 41 | self.ebias = nn.Parameter(th.Tensor(hidden_dim)) 42 | else: 43 | 
self.register_parameter("nbias", None) 44 | self.register_parameter("ebias", None) 45 | self.nmlp = [] 46 | for i in range(num_mlp_layers): 47 | self.nmlp.append(nn.Linear(hidden_dim, hidden_dim)) 48 | if i != num_mlp_layers - 1: 49 | if batch_norm: 50 | self.nmlp.append(nn.BatchNorm1d(hidden_dim)) 51 | self.nmlp.append(map_activation_str_to_layer(act_func)) 52 | self.nmlp = Sequential(*self.nmlp) 53 | self.emlp = [] 54 | for i in range(num_mlp_layers): 55 | self.emlp.append(nn.Linear(hidden_dim, hidden_dim)) 56 | if i != num_mlp_layers - 1: 57 | if batch_norm: 58 | self.emlp.append(nn.BatchNorm1d(hidden_dim)) 59 | self.emlp.append(map_activation_str_to_layer(act_func)) 60 | self.emlp = Sequential(*self.emlp) 61 | self.act = map_activation_str_to_layer(act_func) 62 | self.drop = nn.Dropout(dropout) 63 | 64 | # init 65 | init_weight(self.in_weight, activation=act_func, init="uniform") 66 | init_weight(self.out_weight, activation=act_func, init="uniform") 67 | init_weight(self.src_weight, activation=act_func, init="uniform") 68 | init_weight(self.dst_weight, activation=act_func, init="uniform") 69 | init_weight(self.nloop_weight, activation=act_func, init="uniform") 70 | init_weight(self.eloop_weight, activation=act_func, init="uniform") 71 | for module in self.nmlp.modules(): 72 | init_module(module, activation=act_func, init="uniform") 73 | for module in self.emlp.modules(): 74 | init_module(module, activation=act_func, init="uniform") 75 | if bias: 76 | nn.init.zeros_(self.nbias) 77 | nn.init.zeros_(self.ebias) 78 | 79 | # reparamerization tricks 80 | with th.no_grad(): 81 | self.in_weight.data.div_(init_neigenv) 82 | self.out_weight.data.div_(init_neigenv) 83 | self.nloop_weight.data.div_(init_neigenv) 84 | self.src_weight.data.div_(init_eeigenv) 85 | self.dst_weight.data.div_(init_eeigenv) 86 | self.eloop_weight.data.div_(init_eeigenv) 87 | 88 | # register functions 89 | self.node_init_func = self._node_init_func 90 | self.edge_init_func = self._edge_init_func 91 | self.node_message_func = self._node_message_func 92 | self.node_reduce_func = fn.sum(msg=NODEMSG, out=NODEAGG) 93 | self.node_update_func = self._node_update_func 94 | self.edge_update_func = self._edge_update_func 95 | 96 | def _node_init_func(self, graph, node_feat=None): 97 | if node_feat is not None: 98 | graph.ndata[NODEFEAT] = node_feat 99 | 100 | if OUTDEGREE not in graph.ndata: 101 | graph.ndata[OUTDEGREE] = graph.out_degrees() 102 | 103 | return graph.ndata[NODEFEAT] 104 | 105 | def _edge_init_func(self, graph, edge_feat=None): 106 | if edge_feat is not None: 107 | graph.edata[EDGEFEAT] = edge_feat 108 | 109 | return graph.edata[EDGEFEAT] 110 | 111 | def _node_message_func(self, edges): 112 | edge_msg = th.matmul(edges.dst[NODEFEAT], self.dst_weight) - th.matmul(edges.src[NODEFEAT], self.src_weight) 113 | node_msg = -th.matmul(edges.data[EDGEFEAT], self.in_weight) 114 | 115 | # no need to half them further 116 | if REVFLAG in edges.data: 117 | rmask = edges.data[REVFLAG].view(-1, 1) 118 | mask = ~(rmask) 119 | 120 | rev_edge_msg = th.matmul(edges.src[NODEFEAT], self.dst_weight) - th.matmul(edges.dst[NODEFEAT], self.src_weight) 121 | rev_node_msg = th.matmul(edges.data[EDGEFEAT], self.out_weight) 122 | 123 | edge_msg = edge_msg.masked_fill(rmask, 0.0) + rev_edge_msg.masked_fill(mask, 0.0) 124 | node_msg = node_msg.masked_fill(rmask, 0.0) + rev_node_msg.masked_fill(mask, 0.0) 125 | 126 | edges.data[EDGEAGG] = edge_msg 127 | return {NODEMSG: node_msg} 128 | 129 | def _node_update_func(self, nodes): 130 | agg = 
nodes.data[NODEAGG] 131 | out = th.matmul(nodes.data[NODEFEAT], self.nloop_weight) + agg 132 | if self.nbias is not None: 133 | out = out + self.nbias 134 | if len(self.nmlp) > 0: 135 | out = self.nmlp(out) 136 | else: 137 | out = self.act(out) 138 | out = self.drop(out) 139 | 140 | return {NODEOUTPUT: out} 141 | 142 | def _edge_update_func(self, edges): 143 | agg = edges.data[EDGEAGG] 144 | d = edges.dst[OUTDEGREE].unsqueeze(-1).float() 145 | d = (1 + d).log2() # avoid nan ... 146 | add = 2 * (1 + d) * th.matmul(edges.data[EDGEFEAT], (self.src_weight - self.dst_weight)) 147 | out = th.matmul(edges.data[EDGEFEAT], self.eloop_weight) + add + agg 148 | if self.ebias is not None: 149 | out = out + self.ebias 150 | if len(self.emlp) > 0: 151 | out = self.emlp(out) 152 | else: 153 | out = self.act(out) 154 | out = self.drop(out) 155 | 156 | return {EDGEOUTPUT: out} 157 | 158 | def forward(self, graph, node_feat, edge_feat): 159 | # g = graph.local_var() 160 | g = graph 161 | self.node_init_func(g, node_feat) 162 | self.edge_init_func(g, edge_feat) 163 | g.update_all(self.node_message_func, self.node_reduce_func, self.node_update_func) 164 | g.apply_edges(self.edge_update_func) 165 | 166 | return g.ndata.pop(NODEOUTPUT), g.edata.pop(EDGEOUTPUT) 167 | 168 | def extra_repr(self): 169 | summary = [ 170 | "in=%s, out=%s" % (self.input_dim, self.hidden_dim), 171 | ] 172 | 173 | return "\n".join(summary) 174 | 175 | def get_output_dim(self): 176 | return self.hidden_dim 177 | 178 | 179 | class DMPNN(GraphAdjModelV2): 180 | def __init__(self, **kw): 181 | super(DMPNN, self).__init__(**kw) 182 | 183 | def create_rep_net(self, type, **kw): 184 | if type == "graph": 185 | num_layers = kw.get("rep_num_graph_layers", 1) 186 | elif type == "pattern": 187 | if self.share_rep_net: 188 | return self.g_rep_net 189 | num_layers = kw.get("rep_num_pattern_layers", 1) 190 | init_neigenv = kw.get("init_neigenv", 4.0) 191 | init_eeigenv = kw.get("init_eeigenv", 4.0) 192 | num_mlp_layers = kw.get("rep_dmpnn_num_mlp_layers", 2) 193 | batch_norm = kw.get("rep_dmpnn_batch_norm", False) 194 | act_func = kw.get("rep_act_func", "relu") 195 | dropout = kw.get("rep_dropout", 0.0) 196 | 197 | dmpnn = ModuleList() 198 | for i in range(num_layers): 199 | dmpnn.add_module( 200 | "%s_dmpnn_(%d)" % (type, i), 201 | DMPLayer( 202 | self.hid_dim, 203 | self.hid_dim, 204 | init_neigenv=init_neigenv, 205 | init_eeigenv=init_eeigenv, 206 | num_mlp_layers=num_mlp_layers, 207 | batch_norm=batch_norm, 208 | act_func=act_func, 209 | dropout=dropout 210 | ) 211 | ) 212 | 213 | return ModuleDict({"dmpnn": dmpnn}) 214 | 215 | def get_pattern_rep(self, pattern, p_v_emb, p_e_emb, v_mask=None, e_mask=None): 216 | if v_mask is not None: 217 | p_v_zero_mask = ~(v_mask) 218 | v_outputs = [p_v_emb.masked_fill(p_v_zero_mask, 0.0)] 219 | else: 220 | p_v_zero_mask = None 221 | v_outputs = [p_v_emb] 222 | 223 | if e_mask is not None: 224 | p_e_zero_mask = ~(e_mask) 225 | e_outputs = [p_e_emb.masked_fill(p_e_zero_mask, 0.0)] 226 | else: 227 | p_e_zero_mask = None 228 | e_outputs = [p_e_emb] 229 | 230 | for layer in self.p_rep_net["dmpnn"]: 231 | v, e = layer(pattern, v_outputs[-1], e_outputs[-1]) 232 | if p_v_zero_mask is not None: 233 | v = v.masked_fill(p_v_zero_mask, 0.0) 234 | if p_e_zero_mask is not None: 235 | e = e.masked_fill(p_e_zero_mask, 0.0) 236 | if self.rep_residual and v_outputs[-1].size() == v.size() and e_outputs[-1].size() == e.size(): 237 | v_outputs.append(v_outputs[-1] + v) 238 | e_outputs.append(e_outputs[-1] + e) 239 | else: 240 | 
v_outputs.append(v) 241 | e_outputs.append(e) 242 | 243 | return v_outputs[-1], e_outputs[-1] 244 | 245 | def get_graph_rep(self, graph, g_v_emb, g_e_emb, v_mask=None, e_mask=None, v_gate=None, e_gate=None): 246 | if v_mask is not None or v_gate is not None: 247 | if v_gate is None: 248 | v_gate = v_mask.float() 249 | elif v_mask is not None: 250 | v_gate = v_mask.float() * v_gate 251 | v_outputs = [g_v_emb * v_gate] 252 | else: 253 | v_outputs = [g_v_emb] 254 | 255 | if e_mask is not None or e_gate is not None: 256 | if e_gate is None: 257 | e_gate = e_mask.float() 258 | elif e_mask is not None: 259 | e_gate = e_mask.float() * e_gate 260 | e_outputs = [g_e_emb * e_gate] 261 | else: 262 | e_outputs = [g_e_emb] 263 | 264 | for layer in self.g_rep_net["dmpnn"]: 265 | v, e = layer(graph, v_outputs[-1], e_outputs[-1]) 266 | if v_gate is not None: 267 | v = v * v_gate 268 | if e_gate is not None: 269 | e = e * e_gate 270 | if self.rep_residual and v_outputs[-1].size() == v.size() and e_outputs[-1].size() == e.size(): 271 | v_outputs.append(v_outputs[-1] + v) 272 | e_outputs.append(e_outputs[-1] + e) 273 | else: 274 | v_outputs.append(v) 275 | e_outputs.append(e) 276 | 277 | return v_outputs[-1], e_outputs[-1] 278 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/embed.py: -------------------------------------------------------------------------------- 1 | 2 | import numba 3 | import numpy as np 4 | import torch as th 5 | import torch.nn as nn 6 | 7 | 8 | @numba.jit(numba.int64[:](numba.int64[:], numba.int64), nopython=True) 9 | def _get_enc_len(x, base=10): 10 | lens = np.zeros((len(x), ), dtype=np.int64) 11 | for i, n in enumerate(x): 12 | cnt = 0 13 | while n > 0: 14 | n = n // base 15 | cnt += 1 16 | # avoid 0 length 17 | if cnt == 0: 18 | cnt = 1 19 | lens[i] = cnt 20 | return lens 21 | 22 | 23 | def get_enc_len(x, base=10): 24 | if isinstance(x, int): 25 | return _get_enc_len(np.array([x], dtype=np.int64), base)[0] 26 | elif isinstance(x, float): 27 | return _get_enc_len(np.array([int(x)], dtype=np.int64), base)[0] 28 | if isinstance(x, th.Tensor): 29 | x = x.numpy() 30 | elif not isinstance(x, np.ndarray): 31 | x = np.array(x) 32 | x = x.astype(np.int64) 33 | x_shape = x.shape 34 | 35 | return _get_enc_len(x.reshape(-1), base).reshape(*x_shape) 36 | 37 | 38 | @numba.jit( 39 | numba.int64[:, :](numba.int64[:], numba.int64, numba.int64), 40 | nopython=True, 41 | nogil=True 42 | ) 43 | def _int2anybase(x, len_x, base): 44 | numbers = np.zeros((len(x), len_x), dtype=np.int64) 45 | for i, n in enumerate(x): 46 | n = n % base**len_x 47 | idx = len_x - 1 48 | while n: 49 | numbers[i, idx] = n % base 50 | n = n // base 51 | idx -= 1 52 | 53 | return numbers 54 | 55 | 56 | def int2anybase(x, len_x, base=10): 57 | if isinstance(x, int): 58 | return _int2anybase(np.array([x], dtype=np.int64), len_x, base)[0] 59 | elif isinstance(x, float): 60 | return _int2anybase(np.array([int(x)], dtype=np.int64), len_x, base)[0] 61 | if isinstance(x, th.Tensor): 62 | x = x.numpy() 63 | elif not isinstance(x, np.ndarray): 64 | x = np.array(x) 65 | x = x.astype(np.int64) 66 | 67 | return _int2anybase(x, len_x, base) 68 | 69 | 70 | @numba.jit( 71 | numba.int64[:, :](numba.int64[:], numba.int64, numba.int64), 72 | nopython=True, 73 | nogil=True 74 | ) 75 | def _int2multihot(x, len_x, base): 76 | rep = np.zeros((len(x), len_x * base), dtype=np.int64) 77 | for i, n in enumerate(x): 78 | n = n % base**len_x 79 | idx = (len_x - 1) * base 80 | while n: 81 | 
rep[i, idx + n % base] = 1 82 | n = n // base 83 | idx -= base 84 | while idx >= 0: 85 | rep[i, idx] = 1 86 | idx -= base 87 | return rep 88 | 89 | 90 | def int2multihot(x, len_x, base=10): 91 | if isinstance(x, int): 92 | return _int2multihot(np.array([x], dtype=np.int64), len_x, base)[0] 93 | elif isinstance(x, float): 94 | return _int2multihot(np.array([int(x)], dtype=np.int64), len_x, base)[0] 95 | if isinstance(x, th.Tensor): 96 | x = x.numpy() 97 | elif not isinstance(x, np.ndarray): 98 | x = np.array(x) 99 | x = x.astype(np.int64) 100 | 101 | return _int2multihot(x, len_x, base) 102 | 103 | 104 | 105 | class Embedding(nn.Embedding): 106 | def __init__(self, num_embeddings, embedding_dim, **kw): 107 | super(Embedding, self).__init__(num_embeddings, embedding_dim, **kw) 108 | 109 | def forward(self, x): 110 | if x.dtype == th.long: 111 | emb = super(Embedding, self).forward(x) 112 | elif x.dtype == th.float and x.size(-1) == self.num_embeddings: 113 | x_size = x.size() 114 | emb = th.matmul(x.view(-1, x_size[-1]), self.weight) 115 | emb = emb.view(x_size[:-1] + (self.embedding_dim, )) 116 | else: 117 | raise NotImplementedError 118 | return emb 119 | 120 | def get_output_dim(self): 121 | return self.embedding_dim 122 | 123 | 124 | class NormalEmbedding(Embedding): 125 | def __init__(self, num_embeddings, embedding_dim, **kw): 126 | super(NormalEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 127 | 128 | # init 129 | nn.init.normal_(self.weight, 0.0, 1.0) 130 | if self.padding_idx is not None: 131 | with th.no_grad(): 132 | self.weight[self.padding_idx].fill_(0) 133 | 134 | 135 | class UniformEmbedding(Embedding): 136 | def __init__(self, num_embeddings, embedding_dim, **kw): 137 | super(UniformEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 138 | 139 | # init 140 | nn.init.uniform_(self.weight, -1.0, 1.0) 141 | if self.padding_idx is not None: 142 | with th.no_grad(): 143 | self.weight[self.padding_idx].fill_(0) 144 | 145 | 146 | class OrthogonalEmbedding(Embedding): 147 | def __init__(self, num_embeddings, embedding_dim, **kw): 148 | super(OrthogonalEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 149 | 150 | # init 151 | nn.init.orthogonal_(self.weight) 152 | if self.padding_idx is not None: 153 | with th.no_grad(): 154 | self.weight[self.padding_idx].fill_(0) 155 | 156 | 157 | """ 158 | Ravanbakhsh, S.; Schneider, J.; and Poczos, B. 159 | Equivariance Through Parameter-Sharing. 160 | In Proceedings of International Conference on Machine Learning, volume 70, of JMLR: W&CP, August 2017. 
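As implemented in the class below, every row of the embedding matrix is a circular shift of one shared learnable vector (weight[i] = th.roll(row_vec, i, 0)), so the whole table is generated from a single set of parameters, following the parameter-sharing scheme of the reference above.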
161 | """ 162 | class EquivariantEmbedding(Embedding): 163 | def __init__(self, num_embeddings, embedding_dim, **kw): 164 | super(EquivariantEmbedding, self).__init__(num_embeddings, embedding_dim, **kw) 165 | 166 | self.row_vec = nn.Parameter(th.Tensor(self.embedding_dim, )) 167 | 168 | # init 169 | self.allow_forward = True 170 | nn.init.normal_(self.row_vec, 0.0, 1.0) 171 | with th.no_grad(): 172 | for i in range(num_embeddings): 173 | self.weight[i].data.copy_(th.roll(self.row_vec, i, 0)) 174 | 175 | def forward(self, x): 176 | if not self.allow_forward: 177 | with th.no_grad(): 178 | for i in range(self.num_embeddings): 179 | self.weight[i] = th.roll(self.row_vec, i, 0) 180 | self.allow_forward = True 181 | 182 | if x.dtype == th.long: 183 | emb = super(EquivariantEmbedding, self).forward(x) 184 | elif x.dtype == th.float and x.size(-1) == self.num_embeddings: 185 | x_size = x.size() 186 | emb = th.mm(x.view(-1, x_size[-1]), self.weight) 187 | emb = emb.view(x_size[:-1] + (self.embedding_dim, )) 188 | else: 189 | raise NotImplementedError 190 | return emb 191 | 192 | def backward(self, x): 193 | self.allow_forward = False 194 | return super(EquivariantEmbedding, self).backward(x) 195 | 196 | 197 | class MultihotEmbedding(Embedding): 198 | def __init__(self, max_n=1024, base=2): 199 | self.max_n = max_n 200 | self.base = base 201 | 202 | enc_len = get_enc_len(max_n-1, base) 203 | super(MultihotEmbedding, self).__init__(max_n, 2*enc_len) 204 | with th.no_grad(): 205 | self.weight.data.copy_(th.from_numpy(int2multihot(np.arange(0, max_n), enc_len, base)).float()) 206 | 207 | def extra_repr(self): 208 | return "base=%d, max_n=%d, enc_dim=%d" % (self.base, self.max_n, self.weight.shape[1]) 209 | 210 | 211 | class PositionEmbedding(Embedding): 212 | def __init__(self, embedding_dim, max_len=512, scale=1): 213 | 214 | freq_seq = th.arange(0, embedding_dim, 2.0, dtype=th.float) 215 | inv_freq = th.pow(10000, (freq_seq / embedding_dim)).reciprocal() 216 | sinusoid_inp = th.ger(th.arange(0, max_len, 1.0), inv_freq) 217 | super(PositionEmbedding, self).__init__(max_len, embedding_dim) 218 | with th.no_grad(): 219 | self.weight.data.copy_(th.cat([th.sin(sinusoid_inp), th.cos(sinusoid_inp)], dim=-1) * scale) 220 | 221 | def extra_repr(self): 222 | return "embedding_dim=%d, max_len=%d" % (self.weight.shape[1], self.weight.shape[0]) 223 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/filter.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ScalarFilter(nn.Module): 7 | def __init__(self): 8 | super(ScalarFilter, self).__init__() 9 | 10 | def forward(self, p_x, g_x): 11 | """ 12 | input should be scalar: bsz x l1, bsz x l2 13 | return bsz x l2 14 | """ 15 | matrix = g_x.unsqueeze(2) - p_x.unsqueeze(1) # bsz x l1 x l2 16 | return th.max(matrix == 0, dim=2)[0] 17 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/rgin.py: -------------------------------------------------------------------------------- 1 | import dgl.function as fn 2 | import torch as th 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .basemodel import GraphAdjModel 7 | from .container import * 8 | # from ..constants import * 9 | # from ..utils.act import map_activation_str_to_layer 10 | # from ..utils.init import init_weight, init_module 11 | 
from constants import * 12 | from utils.act import map_activation_str_to_layer 13 | from utils.init import init_weight, init_module 14 | 15 | 16 | class RGINLayer(nn.Module): 17 | def __init__( 18 | self, 19 | input_dim, 20 | hidden_dim, 21 | num_rels=1, 22 | regularizer="basis", 23 | num_bases=-1, 24 | num_mlp_layers=2, 25 | self_loop=True, 26 | bias=True, 27 | batch_norm=False, 28 | act_func="relu", 29 | dropout=0.0, 30 | ): 31 | super(RGINLayer, self).__init__() 32 | assert regularizer in ["none", "basis", "bdd"] 33 | 34 | self.input_dim = input_dim 35 | self.hidden_dim = hidden_dim 36 | self.num_rels = num_rels 37 | self.regularizer = regularizer 38 | if regularizer == "none" or num_bases is None or num_bases > num_rels or num_bases <= 0: 39 | self.num_bases = num_rels 40 | else: 41 | self.num_bases = num_bases 42 | if self_loop: 43 | self.loop_weight = nn.Parameter(th.Tensor(input_dim, hidden_dim)) 44 | else: 45 | self.register_parameter("loop_weight", None) 46 | if bias: 47 | self.bias = nn.Parameter(th.Tensor(hidden_dim)) 48 | else: 49 | self.register_parameter("bias", None) 50 | self.mlp = [] 51 | for i in range(num_mlp_layers): 52 | self.mlp.append(nn.Linear(hidden_dim, hidden_dim)) 53 | if i != num_mlp_layers - 1: 54 | if batch_norm: 55 | self.mlp.append(nn.BatchNorm1d(hidden_dim)) 56 | self.mlp.append(map_activation_str_to_layer(act_func)) 57 | self.mlp = Sequential(*self.mlp) 58 | self.act = map_activation_str_to_layer(act_func) 59 | self.drop = nn.Dropout(dropout) 60 | 61 | if regularizer == "none" or regularizer == "basis": 62 | # add basis weights 63 | self.weight = nn.Parameter(th.Tensor(self.num_bases, self.input_dim, self.hidden_dim)) 64 | if self.num_bases < self.num_rels: 65 | # linear combination coefficients 66 | self.w_comp = nn.Parameter(th.Tensor(self.num_rels, self.num_bases)) 67 | else: 68 | self.register_parameter("w_comp", None) 69 | elif regularizer == "bdd": 70 | if input_dim % self.num_bases != 0 or hidden_dim % self.num_bases != 0: 71 | raise ValueError('Feature size must be a multiplier of num_bases (%d).' 
% self.num_bases) 72 | # add block diagonal weights 73 | submat_in = input_dim // self.num_bases 74 | submat_out = hidden_dim // self.num_bases 75 | 76 | # assuming input_dim and hidden_dim are both divisible by num_bases 77 | self.weight = nn.Parameter(th.Tensor(self.num_rels, self.num_bases * submat_in * submat_out)) 78 | self.register_parameter("w_comp", None) 79 | else: 80 | raise ValueError("Regularizer must be either 'basis' or 'bdd'") 81 | 82 | # init 83 | init_weight(self.weight, activation=act_func, init="uniform") 84 | if self.w_comp is not None: 85 | init_weight(self.w_comp, activation=act_func, init="uniform") 86 | if self_loop: 87 | init_weight(self.loop_weight, activation=act_func, init="uniform") 88 | nn.init.zeros_(self.bias) 89 | 90 | self.node_init_func = self._node_init_func 91 | self.edge_init_func = self._edge_init_func 92 | if regularizer == "none" or regularizer == "basis": 93 | self.node_message_func = self._basis_message_func 94 | elif regularizer == "bdd": 95 | self.node_message_func = self._bdd_message_func 96 | else: 97 | raise ValueError("Regularizer must be either 'basis' or 'bdd'") 98 | self.node_reduce_func = fn.sum(msg=NODEMSG, out=NODEAGG) 99 | self.node_update_func = self._node_update_func 100 | self.edge_update_func = None 101 | 102 | def _basis_message_func(self, edges): 103 | if self.num_bases < self.num_rels: 104 | # generate all weights from bases 105 | weight = self.weight.view(self.num_bases, self.input_dim * self.hidden_dim) 106 | weight = th.matmul(self.w_comp, weight).view(self.num_rels, self.input_dim, self.hidden_dim) 107 | else: 108 | weight = self.weight 109 | weight = weight.index_select(0, edges.data[EDGETYPE]) 110 | msg = th.bmm(edges.src[NODEFEAT].unsqueeze(1), weight).squeeze(1) 111 | 112 | return {NODEMSG: msg} 113 | 114 | def _bdd_message_func(self, edges): 115 | submat_in = self.input_dim // self.num_bases 116 | submat_out = self.hidden_dim // self.num_bases 117 | weight = self.weight.index_select(0, edges.data[EDGETYPE]).view(-1, submat_in, submat_out) 118 | msg = th.bmm(edges.src[NODEFEAT].view(-1, 1, submat_in), weight).view(-1, self.hidden_dim) 119 | 120 | return {NODEMSG: msg} 121 | 122 | @property 123 | def self_loop(self): 124 | return hasattr(self, "loop_weight") and self.loop_weight is not None 125 | 126 | def _node_init_func(self, graph, node_feat=None): 127 | if node_feat is not None: 128 | graph.ndata[NODEFEAT] = node_feat 129 | return node_feat 130 | 131 | def _edge_init_func(self, graph, edge_type=None): 132 | if edge_type is not None: 133 | graph.edata[EDGETYPE] = edge_type 134 | 135 | return edge_type 136 | 137 | def _node_update_func(self, nodes): 138 | agg = nodes.data[NODEAGG] 139 | 140 | if self.self_loop: 141 | loop_msg = th.matmul(nodes.data[NODEFEAT], self.loop_weight) 142 | out = agg + loop_msg 143 | else: 144 | out = agg 145 | if self.bias is not None: 146 | out = out + self.bias 147 | if len(self.mlp) > 0: 148 | out = self.mlp(out) 149 | else: 150 | out = self.act(out) 151 | out = self.act(out) 152 | out = self.drop(out) 153 | 154 | return {NODEOUTPUT: out} 155 | 156 | def forward(self, g, node_feat, edge_type): 157 | self.node_init_func(g, node_feat) 158 | self.edge_init_func(g, edge_type) 159 | g.update_all(self.node_message_func, self.node_reduce_func, self.node_update_func) 160 | return g.ndata.pop(NODEOUTPUT), edge_type 161 | 162 | def get_output_dim(self): 163 | return self.hidden_dim 164 | 165 | def extra_repr(self): 166 | summary = [ 167 | "in=%d, out=%d," % (self.input_dim, self.hidden_dim), 168 | 
"num_rels=%d, regularizer=%s, num_bases=%d," % (self.num_rels, self.regularizer, self.num_bases), 169 | "edge_norm=%s, self_loop=%s, bias=%s," % (self.edge_norm, self.self_loop, self.bias is not None), 170 | ] 171 | 172 | return "\n".join(summary) 173 | 174 | 175 | class RGIN(GraphAdjModel): 176 | def __init__(self, **kw): 177 | super(RGIN, self).__init__(**kw) 178 | 179 | def create_rep_net(self, type, **kw): 180 | if type == "graph": 181 | num_layers = kw.get("rep_num_graph_layers", 1) 182 | num_rels = self.max_ngel 183 | elif type == "pattern": 184 | if self.share_rep_net: 185 | return self.g_rep_net 186 | num_layers = kw.get("rep_num_pattern_layers", 1) 187 | num_rels = self.max_npel 188 | regularizer = kw.get("rep_rgin_regularizer", "basis") 189 | num_bases = kw.get("rep_rgin_num_bases", -1) 190 | num_mlp_layers = kw.get("rep_rgin_num_mlp_layers", 2) 191 | batch_norm = kw.get("rep_rgin_batch_norm", False) 192 | act_func = kw.get("rep_act_func", "relu") 193 | dropout = kw.get("rep_dropout", 0.0) 194 | 195 | rgin = ModuleList() 196 | for i in range(num_layers): 197 | rgin.add_module( 198 | "%s_rgin_(%d)" % (type, i), 199 | RGINLayer( 200 | self.hid_dim, 201 | self.hid_dim, 202 | num_rels=num_rels, 203 | regularizer=regularizer, 204 | num_bases=num_bases, 205 | num_mlp_layers=num_mlp_layers, 206 | batch_norm=batch_norm, 207 | act_func=act_func, 208 | dropout=dropout 209 | ) 210 | ) 211 | 212 | return ModuleDict({"rgin": rgin}) 213 | 214 | def get_pattern_rep(self, pattern, p_emb, mask=None): 215 | if mask is not None: 216 | p_zero_mask = ~(mask) 217 | outputs = [p_emb.masked_fill(p_zero_mask, 0.0)] 218 | etype = pattern.edata["label"] 219 | for layer in self.p_rep_net["rgin"]: 220 | o, etype = layer(pattern, outputs[-1], etype) 221 | outputs.append(o.masked_fill(p_zero_mask, 0.0)) 222 | else: 223 | outputs = [p_emb] 224 | etype = pattern.edata["label"] 225 | for layer in self.p_rep_net["rgin"]: 226 | o, etype = layer(pattern, outputs[-1], etype) 227 | if self.rep_residual and outputs[-1].size() == o.size(): 228 | outputs.append(outputs[-1] + o) 229 | else: 230 | outputs.append(o) 231 | 232 | return outputs[-1] 233 | 234 | def get_graph_rep(self, graph, g_emb, mask=None, gate=None): 235 | if mask is None and gate is None: 236 | outputs = [g_emb] 237 | etype = graph.edata["label"] 238 | for layer in self.g_rep_net["rgin"]: 239 | o, etype = layer(graph, outputs[-1], etype) 240 | if self.rep_residual and outputs[-1].size() == o.size(): 241 | outputs.append(outputs[-1] + o) 242 | else: 243 | outputs.append(o) 244 | else: 245 | if gate is None: 246 | gate = mask.float() 247 | elif mask is not None: 248 | gate = mask.float() * gate 249 | 250 | outputs = [g_emb * gate] 251 | etype = graph.edata["label"] 252 | for layer in self.g_rep_net["rgin"]: 253 | o, etype = layer(graph, outputs[-1], etype) 254 | o = o * gate 255 | if self.rep_residual and outputs[-1].size() == o.size(): 256 | outputs.append(outputs[-1] + o) 257 | else: 258 | outputs.append(o) 259 | 260 | return outputs[-1] 261 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/models/rnn.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .basemodel import EdgeSeqModel 6 | from .container import * 7 | # from ..utils.act import map_activation_str_to_layer 8 | # from ..utils.init import init_weight, init_module 9 | from utils.act import 
map_activation_str_to_layer 10 | from utils.init import init_weight, init_module 11 | 12 | 13 | class RNNLayer(nn.Module): 14 | def __init__(self, rep_rnn_type, input_dim, hid_dim, layer_norm=False, bidirectional=False, dropout=0.0): 15 | super(RNNLayer, self).__init__() 16 | self.input_dim = input_dim 17 | self.hid_dim = hid_dim 18 | if rep_rnn_type == "LSTM": 19 | self.layer = nn.LSTM( 20 | input_dim, hid_dim//2 if bidirectional else hid_dim, 21 | bidirectional=bidirectional, batch_first=True 22 | ) 23 | elif rep_rnn_type == "GRU": 24 | self.layer = nn.GRU( 25 | input_dim, hid_dim//2 if bidirectional else hid_dim, 26 | bidirectional=bidirectional, batch_first=True 27 | ) 28 | elif rep_rnn_type == "RNN": 29 | self.layer = nn.RNN( 30 | input_dim, hid_dim//2 if bidirectional else hid_dim, 31 | bidirectional=bidirectional, batch_first=True 32 | ) 33 | if layer_norm: 34 | self.ln = nn.LayerNorm(hid_dim) 35 | else: 36 | self.ln = None 37 | self.drop = nn.Dropout(dropout) 38 | 39 | # init 40 | init_module(self.layer) 41 | 42 | def forward(self, x): 43 | o = self.layer(x)[0] 44 | if self.ln is not None: 45 | o = self.ln(o) 46 | o = self.drop(o) 47 | 48 | return o 49 | 50 | def get_output_dim(self): 51 | return self.hid_dim - self.hid_dim % 2 52 | 53 | def extra_repr(self): 54 | "" 55 | 56 | 57 | class RNN(EdgeSeqModel): 58 | def __init__(self, **kw): 59 | super(RNN, self).__init__(**kw) 60 | 61 | def create_rep_net(self, type, **kw): 62 | if type == "graph": 63 | num_layers = kw.get("rep_num_graph_layers", 1) 64 | elif type == "pattern": 65 | if self.share_rep_net: 66 | return self.g_rep_net 67 | num_layers = kw.get("rep_num_pattern_layers", 1) 68 | rep_rnn_type = kw.get("rep_rnn_type", "LSTM") 69 | bidirectional = kw.get("rep_rnn_bidirectional", False) 70 | dropout = kw.get("rep_dropout", 0.0) 71 | 72 | rnn = ModuleList() 73 | for i in range(num_layers): 74 | rnn.add_module( 75 | "%s_rnn_(%d)" % (type, i), 76 | RNNLayer( 77 | rep_rnn_type, 78 | self.hid_dim, self.hid_dim, 79 | bidirectional=bidirectional, 80 | dropout=dropout 81 | ) 82 | ) 83 | 84 | return ModuleDict({"rnn": rnn}) 85 | 86 | def get_pattern_rep(self, p_emb, mask=None): 87 | if mask is not None: 88 | p_zero_mask = ~(mask) 89 | outputs = [p_emb.masked_fill(p_zero_mask, 0.0)] 90 | for layer in self.p_rep_net["rnn"]: 91 | o = layer(outputs[-1]) 92 | outputs.append(o.masked_fill(p_zero_mask, 0.0)) 93 | else: 94 | outputs = [p_emb] 95 | for layer in self.p_rep_net["rnn"]: 96 | o = layer(outputs[-1]) 97 | if self.rep_residual and outputs[-1].size() == o.size(): 98 | outputs.append(outputs[-1] + o) 99 | else: 100 | outputs.append(o) 101 | 102 | return outputs[-1] 103 | 104 | def get_graph_rep(self, g_emb, mask=None, gate=None): 105 | if mask is None and gate is None: 106 | outputs = [g_emb] 107 | for layer in self.g_rep_net["rnn"]: 108 | o = layer(outputs[-1]) 109 | if self.rep_residual and outputs[-1].size() == o.size(): 110 | outputs.append(outputs[-1] + o) 111 | else: 112 | outputs.append(o) 113 | else: 114 | gate = ((mask.float() if mask is not None else 1) * (gate if gate is not None else 1)) 115 | outputs = [g_emb * gate] 116 | for layer in self.g_rep_net["rnn"]: 117 | o = layer(outputs[-1]) 118 | o = o * gate 119 | if self.rep_residual and outputs[-1].size() == o.size(): 120 | outputs.append(outputs[-1] + o) 121 | else: 122 | outputs.append(o) 123 | 124 | return outputs[-1] 125 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .act import * 2 | from .anneal import * 3 | from .cyclical import * 4 | from .dl import * 5 | from .init import * 6 | from .io import * 7 | from .log import * 8 | from .sampler import * 9 | from .scheduler import * 10 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/anneal.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | PI = 3.141592653589793 5 | INIT_STEPS = 600 6 | SCHEDULE_STEPS = 10000 7 | NUM_CYCLES = 2 8 | MIN_PERCENT = 1e-3 9 | 10 | 11 | def anneal_fn( 12 | fn, 13 | current_step, 14 | num_init_steps=INIT_STEPS, 15 | num_anneal_steps=SCHEDULE_STEPS, 16 | num_cycles=NUM_CYCLES, 17 | value1=0.0, 18 | value2=1.0 19 | ): 20 | if current_step < num_init_steps: 21 | return anneal_fn( 22 | fn, 23 | current_step, 24 | num_init_steps=0, 25 | num_anneal_steps=num_init_steps * 2, 26 | num_cycles=1, 27 | value1=value2, 28 | value2=value1 29 | ) 30 | if current_step > num_anneal_steps: 31 | return value2 32 | 33 | if not fn or fn == "none" or fn == "constant": 34 | return value2 35 | 36 | progress = float(num_cycles * (current_step - num_init_steps)) / max(1, num_anneal_steps - num_init_steps) % 1 37 | 38 | if fn == "linear": 39 | if progress < 0.5: 40 | return float(value1 + (value2 - value1) * progress * 2) 41 | else: 42 | return value2 43 | elif fn == "cosine": 44 | if progress < 0.5: 45 | return float(value1 + (value2 - value1) * (1 - math.cos(PI * progress * 2)) / 2) 46 | else: 47 | return value2 48 | else: 49 | raise NotImplementedError 50 | 51 | 52 | class AnnealManner: 53 | def __init__( 54 | self, 55 | fn, 56 | current_step=0, 57 | num_init_steps=INIT_STEPS, 58 | num_anneal_steps=SCHEDULE_STEPS, 59 | num_cycles=NUM_CYCLES, 60 | value1=0.0, 61 | value2=1.0 62 | ): 63 | self.fn = fn 64 | self.num_init_steps = num_init_steps 65 | self.num_anneal_steps = num_anneal_steps 66 | self.num_cycles = num_cycles 67 | self.value1 = value1 68 | self.value2 = value2 69 | 70 | self._step_count = current_step 71 | 72 | def step(self, step=None): 73 | value = anneal_fn( 74 | self.fn, 75 | self._step_count, 76 | num_init_steps=self.num_init_steps, 77 | num_anneal_steps=self.num_anneal_steps, 78 | num_cycles=self.num_cycles, 79 | value1=self.value1, 80 | value2=self.value2 81 | ) 82 | 83 | if step is not None: 84 | self._step_count = step 85 | else: 86 | self._step_count += 1 87 | 88 | return value 89 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/cyclical.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | PI = 3.141592653589793 5 | INIT_STEPS = 600 6 | SCHEDULE_STEPS = 10000 7 | NUM_CYCLES = 2 8 | MIN_PERCENT = 1e-3 9 | 10 | 11 | def cyclical_fn( 12 | fn, 13 | current_step, 14 | num_init_steps=INIT_STEPS, 15 | num_cyclical_steps=SCHEDULE_STEPS, 16 | num_cycles=NUM_CYCLES, 17 | value1=0.0, 18 | value2=1.0 19 | ): 20 | if current_step < num_init_steps: 21 | return cyclical_fn( 22 | fn, 23 | current_step, 24 | num_init_steps=0, 25 | num_cyclical_steps=num_init_steps * 2, 26 | num_cycles=1, 27 | value1=value2, 28 | value2=value1 29 | ) 30 | if current_step > num_cyclical_steps: 31 | return value2 32 | 33 | if not fn or fn == "none" or fn == "constant": 34 | return value2 35 | 36 | progress = float(num_cycles * (current_step - num_init_steps)) / max(1, 
num_cyclical_steps - num_init_steps) % 1 37 | if fn == "linear": 38 | if progress < 0.5: 39 | return float(value1 + (value2 - value1) * (progress * 2)) 40 | else: 41 | return float(value2 + (value1 - value2) * (progress * 2 - 1)) 42 | elif fn == "cosine": 43 | return float(value1 + (value2 - value1) * (1 - math.cos(PI * progress * 2)) / 2) 44 | else: 45 | raise NotImplementedError 46 | 47 | 48 | class CyclicalManner: 49 | def __init__( 50 | self, 51 | fn, 52 | current_step=0, 53 | num_init_steps=INIT_STEPS, 54 | num_cyclical_steps=SCHEDULE_STEPS, 55 | num_cycles=NUM_CYCLES, 56 | value1=0.0, 57 | value2=1.0 58 | ): 59 | self.fn = fn 60 | self.num_init_steps = num_init_steps 61 | self.num_cyclical_steps = num_cyclical_steps 62 | self.num_cycles = num_cycles 63 | self.value1 = value1 64 | self.value2 = value2 65 | 66 | self._step_count = current_step 67 | 68 | def step(self, step=None): 69 | value = cyclical_fn( 70 | self.fn, 71 | self._step_count, 72 | num_init_steps=self.num_init_steps, 73 | num_cyclical_steps=self.num_cyclical_steps, 74 | num_cycles=self.num_cycles, 75 | value1=self.value1, 76 | value2=self.value2 77 | ) 78 | 79 | if step is not None: 80 | self._step_count = step 81 | else: 82 | self._step_count += 1 83 | 84 | return value 85 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/dl.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import igraph as ig 3 | import json 4 | import math 5 | import numpy as np 6 | import torch as th 7 | import torch.nn as nn 8 | import os 9 | from argparse import Namespace 10 | from collections import OrderedDict, namedtuple 11 | from multiprocessing import Pool 12 | from tqdm import tqdm 13 | 14 | from .act import * 15 | 16 | 17 | def segment_data(data, max_len, pre_pad=False): 18 | pad_len = max_len - data.size(1) % max_len 19 | if pad_len != max_len: 20 | pad_size = list(data.size()) 21 | pad_size[1] = pad_len 22 | zero_pad = th.zeros(pad_size, device=data.device, dtype=data.dtype, requires_grad=False) 23 | if pre_pad: 24 | data = th.cat([zero_pad, data], dim=1) 25 | else: 26 | data = th.cat([data, zero_pad], dim=1) 27 | return th.split(data, max_len, dim=1) 28 | 29 | 30 | def segment_length(data_len, max_len): 31 | bsz = data_len.size(0) 32 | list_len = math.ceil(data_len.max().float() / max_len) 33 | segment_lens = th.arange( 34 | 0, max_len * list_len, max_len, dtype=data_len.dtype, device=data_len.device, requires_grad=False 35 | ).view(1, list_len) 36 | diff = data_len.view(-1, 1) - segment_lens 37 | fill_max = diff > max_len 38 | fill_zero = diff < 0 39 | segment_lens = diff.masked_fill(fill_max, max_len) 40 | segment_lens.masked_fill_(fill_zero, 0) 41 | return th.split(segment_lens.view(bsz, -1), 1, dim=1) 42 | 43 | 44 | def split_ids(x_ids): 45 | if x_ids[0] == x_ids[-1]: 46 | return th.LongTensor([x_ids.size(0)]).to(x_ids.device) 47 | diff = th.roll(x_ids, -1, 0) - x_ids 48 | return th.masked_select(th.arange(1, x_ids.size(0) + 1, device=x_ids.device), diff.bool()) 49 | 50 | 51 | def split_and_batchify_graph_feats(batched_graph_feats, graph_sizes, pre_pad=False): 52 | bsz = graph_sizes.size(0) 53 | dtype, device = batched_graph_feats.dtype, batched_graph_feats.device 54 | 55 | min_size, max_size = graph_sizes.min(), graph_sizes.max() 56 | if min_size == max_size: 57 | feats = batched_graph_feats.view(bsz, max_size, -1) 58 | mask = th.ones((bsz, max_size), dtype=th.bool, device=device) 59 | return feats, mask 60 | else: 
61 | feats = [] 62 | mask = th.zeros((bsz, max_size), dtype=th.bool, device=device, requires_grad=False) 63 | 64 | graph_sizes_list = graph_sizes.view(-1).tolist() 65 | idx = 0 66 | if pre_pad: 67 | for i, l in enumerate(graph_sizes_list): 68 | if l < max_size: 69 | feats.append(th.zeros((max_size - l, ) + batched_graph_feats.size()[1:], dtype=dtype, device=device)) 70 | feats.append(batched_graph_feats[idx:idx + l]) 71 | mask[i, -l:].fill_(1) 72 | idx += l 73 | else: 74 | for i, l in enumerate(graph_sizes_list): 75 | feats.append(batched_graph_feats[idx:idx + l]) 76 | if l < max_size: 77 | feats.append(th.zeros((max_size - l, ) + batched_graph_feats.size()[1:], dtype=dtype, device=device)) 78 | mask[i, :l].fill_(1) 79 | idx += l 80 | feats = th.cat(feats, 0).view(bsz, max_size, -1) 81 | return feats, mask 82 | 83 | 84 | def batch_convert_list_to_tensor(batch_list, max_seq_len=-1, pad_id=0, pre_pad=False): 85 | batch_tensor = [th.tensor(v) for v in batch_list] 86 | return batch_convert_tensor_to_tensor(batch_tensor, max_seq_len=max_seq_len, pad_id=pad_id, pre_pad=pre_pad) 87 | 88 | 89 | def batch_convert_tensor_to_tensor(batch_tensor, max_seq_len=-1, pad_id=0, pre_pad=False): 90 | batch_lens = [len(v) for v in batch_tensor] 91 | if max_seq_len == -1: 92 | max_seq_len = max(batch_lens) 93 | 94 | result = th.empty( 95 | [len(batch_tensor), max_seq_len] + list(batch_tensor[0].size())[1:], 96 | dtype=batch_tensor[0].dtype, 97 | device=batch_tensor[0].device 98 | ).fill_(pad_id) 99 | for i, t in enumerate(batch_tensor): 100 | len_t = batch_lens[i] 101 | if len_t < max_seq_len: 102 | if pre_pad: 103 | result[i, -len_t:].data.copy_(t) 104 | else: 105 | result[i, :len_t].data.copy_(t) 106 | elif len_t == max_seq_len: 107 | result[i].data.copy_(t) 108 | else: 109 | result[i].data.copy_(t[:max_seq_len]) 110 | return result 111 | 112 | 113 | def batch_convert_len_to_mask(batch_lens, max_seq_len=-1, pre_pad=False): 114 | if max_seq_len == -1: 115 | max_seq_len = max(batch_lens) 116 | mask = th.ones( 117 | (len(batch_lens), max_seq_len), 118 | dtype=th.bool, 119 | device=batch_lens[0].device if isinstance(batch_lens[0], th.Tensor) else th.device("cpu") 120 | ) 121 | if pre_pad: 122 | for i, l in enumerate(batch_lens): 123 | mask[i, :-l].fill_(0) 124 | else: 125 | for i, l in enumerate(batch_lens): 126 | mask[i, l:].fill_(0) 127 | return mask 128 | 129 | 130 | def batch_convert_mask_to_start_and_end(mask): 131 | cumsum = mask.cumsum(dim=-1) * 2 132 | start_indices = cumsum.masked_fill(mask == 0, mask.size(-1)).min(dim=-1)[1] 133 | end_indices = cumsum.max(dim=-1)[1] 134 | 135 | return start_indices, end_indices 136 | 137 | 138 | def convert_dgl_graph_to_edgeseq(graph, x_emb, x_len, e_emb): 139 | uid, vid, eid = graph.all_edges(form="all", order="srcdst") 140 | e = e_emb[eid] 141 | if x_emb is not None: 142 | u, v = x_emb[uid], x_emb[vid] 143 | e = th.cat([u, v, e], dim=1) 144 | e_len = th.tensor(graph.batch_num_edges, dtype=x_len.dtype, device=x_len.device).view(x_len.size()) 145 | return e, e_len 146 | 147 | 148 | def mask_seq_by_len(x, len_x): 149 | x_size = list(x.size()) 150 | if x_size[1] == len_x.max(): 151 | mask = batch_convert_len_to_mask(len_x) 152 | mask_size = x_size[0:2] + [1] * (len(x_size) - 2) 153 | x = x * mask.view(*mask_size) 154 | return x 155 | 156 | 157 | def expand_dimensions(old_module, new_module, pre_pad=True): 158 | with th.no_grad(): 159 | # if type(old_module) != type(new_module): 160 | # raise ValueError("Error: the two input should have the same type.") 161 | if 
isinstance(old_module, th.Tensor) or isinstance(old_module, nn.Parameter): 162 | nn.init.zeros_(new_module) 163 | old_size = old_module.size() 164 | if pre_pad: 165 | if len(old_size) == 1: 166 | new_module.data[-old_size[0]:].copy_(old_module) 167 | elif len(old_size) == 2: 168 | new_module.data[-old_size[0]:, -old_size[1]:].copy_(old_module) 169 | elif len(old_size) == 3: 170 | new_module.data[-old_size[0]:, -old_size[1]:, -old_size[2]:].copy_(old_module) 171 | elif len(old_size) == 4: 172 | new_module.data[-old_size[0]:, -old_size[1]:, -old_size[2]:, -old_size[3]:].copy_(old_module) 173 | else: 174 | raise NotImplementedError 175 | else: 176 | if len(old_size) == 1: 177 | new_module.data[:old_size[0]].copy_(old_module) 178 | elif len(old_size) == 2: 179 | new_module.data[:old_size[0], :old_size[1]].copy_(old_module) 180 | elif len(old_size) == 3: 181 | new_module.data[:old_size[0], :old_size[1], :old_size[2]].copy_(old_module) 182 | elif len(old_size) == 4: 183 | new_module.data[:old_size[0], :old_size[1], :old_size[2], :old_size[3]].copy_(old_module) 184 | else: 185 | raise NotImplementedError 186 | return 187 | 188 | old_param_d = dict(old_module.named_parameters()) 189 | for name, params in new_module.named_parameters(): 190 | if name in old_param_d: 191 | expand_dimensions(old_param_d[name], params, pre_pad) 192 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/init.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import igraph as ig 3 | import json 4 | import math 5 | import numpy as np 6 | import torch as th 7 | import torch.nn as nn 8 | import os 9 | from argparse import Namespace 10 | from collections import OrderedDict, namedtuple 11 | from multiprocessing import Pool 12 | from tqdm import tqdm 13 | 14 | from .act import * 15 | 16 | 17 | def calculate_gain(activation): 18 | if isinstance(activation, str): 19 | if activation in ["none", "maximum", "minimum"]: 20 | nonlinearity = "linear" 21 | elif activation in ["relu", "relu6", "elu", "selu", "celu", "gelu"]: 22 | nonlinearity = "relu" 23 | elif activation in ["leaky_relu", "prelu"]: 24 | nonlinearity = "leaky_relu" 25 | elif activation in ["softmax", "sparsemax", "gumbel_softmax"]: 26 | nonlinearity = "sigmoid" 27 | elif activation in ["sigmoid", "tanh"]: 28 | nonlinearity = activation 29 | else: 30 | raise NotImplementedError 31 | elif isinstance(activation, nn.Module): 32 | if isinstance(activation, (Identity, Maximum, Minimum)): 33 | nonlinearity = "linear" 34 | elif isinstance(activation, (ReLU, ReLU6, ELU, SELU, CELU, GELU)): 35 | nonlinearity = "relu" 36 | elif isinstance(activation, (LeakyReLU, PReLU)): 37 | nonlinearity = "leaky_relu" 38 | elif isinstance(activation, (Softmax, Sparsemax, GumbelSoftmax)): 39 | nonlinearity = "sigmoid" 40 | elif isinstance(activation, Sigmoid): 41 | nonlinearity = "sigmoid" 42 | elif isinstance(activation, Tanh): 43 | nonlinearity = "tanh" 44 | else: 45 | raise NotImplementedError 46 | else: 47 | raise ValueError 48 | 49 | return nn.init.calculate_gain(nonlinearity, LEAKY_RELU_A) 50 | 51 | 52 | def calculate_fan_in_and_fan_out(x): 53 | if x.dim() < 2: 54 | x = x.unsqueeze(-1) 55 | num_input_fmaps = x.size(1) 56 | num_output_fmaps = x.size(0) 57 | receptive_field_size = 1 58 | if x.dim() > 2: 59 | receptive_field_size = x[0][0].numel() 60 | fan_in = num_input_fmaps * receptive_field_size 61 | fan_out = num_output_fmaps * receptive_field_size 62 | 63 | return fan_in, 
fan_out 64 | 65 | 66 | def zero_init(x, gain=1.0): 67 | return nn.init.zeros_(x) 68 | 69 | 70 | def xavier_uniform_init(x, gain=1.0): 71 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 72 | std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) 73 | a = 1.7320508075688772 * std 74 | 75 | return nn.init.uniform_(x, -a, a) 76 | 77 | 78 | def kaiming_normal_init(x, gain=1.0): 79 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 80 | std = gain / math.sqrt(fan_in) 81 | return nn.init.normal_(x, 0, std) 82 | 83 | 84 | def orthogonal_init(x, gain=1.0): 85 | return nn.init.orthogonal_(x, gain=1.0) 86 | 87 | 88 | def equivariant_init(x, gain=1.0): 89 | with th.no_grad(): 90 | x_size = tuple(x.size()) 91 | if len(x_size) == 1: 92 | kaiming_normal_init(x, gain=gain) 93 | elif len(x_size) == 2: 94 | kaiming_normal_init(x[0], gain=gain) 95 | vec = x[0] 96 | for i in range(1, x.size(0)): 97 | x[i].data.copy_(th.roll(vec, i, 0)) 98 | else: 99 | x = x.view(x_size[:-2] + (-1, )) 100 | equivariant_init(x, gain=gain) 101 | x = x.view(x_size) 102 | return x 103 | 104 | 105 | def identity_init(x, gain=1.0): 106 | with th.no_grad(): 107 | x_size = tuple(x.size()) 108 | if len(x_size) == 1: 109 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 110 | std = gain * (2.0 / float(fan_in + fan_out)) 111 | nn.init.ones_(x) 112 | x += th.randn_like(x) * std**2 113 | elif len(x_size) == 2: 114 | fan_in, fan_out = calculate_fan_in_and_fan_out(x) 115 | std = gain * (2.0 / float(fan_in + fan_out)) 116 | nn.init.eye_(x) 117 | x += th.randn_like(x) * std**2 118 | else: 119 | x = x.view(x_size[:-2] + (-1, )) 120 | identity_init(x, gain=gain) 121 | x = x.view(x_size) 122 | return x 123 | 124 | 125 | def init_weight(x, activation="none", init="uniform"): 126 | gain = calculate_gain(activation) 127 | if init == "zero": 128 | init_func = zero_init 129 | elif init == "identity": 130 | init_func = identity_init 131 | elif init == "uniform": 132 | init_func = xavier_uniform_init 133 | elif init == "normal": 134 | init_func = kaiming_normal_init 135 | elif init == "orthogonal": 136 | init_func = orthogonal_init 137 | elif init == "equivariant": 138 | init_func = equivariant_init 139 | else: 140 | raise ValueError("init=%s is not supported now." % (init)) 141 | 142 | if isinstance(x, th.Tensor): 143 | init_func(x, gain=gain) 144 | 145 | 146 | def init_module(x, activation="none", init="uniform"): 147 | gain = calculate_gain(activation) 148 | if init == "zero": 149 | init_func = zero_init 150 | elif init == "identity": 151 | init_func = identity_init 152 | elif init == "uniform": 153 | init_func = xavier_uniform_init 154 | elif init == "normal": 155 | init_func = kaiming_normal_init 156 | elif init == "orthogonal": 157 | init_func = orthogonal_init 158 | elif init == "equivariant": 159 | init_func = equivariant_init 160 | else: 161 | raise ValueError("init=%s is not supported now." 
% (init)) 162 | 163 | if isinstance(x, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)): 164 | init_func(x.weight, gain=gain) 165 | if hasattr(x, "bias") and x.bias is not None: 166 | nn.init.zeros_(x.bias) 167 | elif isinstance(x, nn.Embedding): 168 | with th.no_grad(): 169 | if init == "uniform": 170 | nn.init.uniform_(x.weight, -1.0, 1.0) 171 | elif init == "normal": 172 | nn.init.normal_(x.weight, 0.0, 1.0) 173 | elif init == "orthogonal": 174 | nn.init.orthogonal_(x.weight, gain=math.sqrt(calculate_fan_in_and_fan_out(x.weight)[0]) * 1.0) 175 | elif init == "identity": 176 | nn.init.eye_(x.weight) 177 | elif init == "equivariant": 178 | nn.init.normal_(x.weight[0], 0.0, 1.0) 179 | vec = x.weight[0] 180 | for i in range(1, x.weight.size(0)): 181 | x.weight[i].data.copy_(th.roll(vec, i, 0)) 182 | if x.padding_idx is not None: 183 | x.weight[x.padding_idx].fill_(0) 184 | elif isinstance(x, nn.RNNBase): 185 | for layer_weights in x._all_weights: 186 | for w in layer_weights: 187 | if "weight" in w: 188 | init_func(getattr(x, w)) 189 | elif "bias" in w: 190 | nn.init.zeros_(getattr(x, w)) 191 | elif isinstance(x, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.LayerNorm, nn.GroupNorm)): 192 | nn.init.ones_(x.weight) 193 | nn.init.zeros_(x.bias) 194 | 195 | 196 | def change_dropout_rate(model, dropout): 197 | for name, child in model.named_children(): 198 | if isinstance(child, nn.Dropout): 199 | child.p = dropout 200 | change_dropout_rate(child, dropout) 201 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/io.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import igraph as ig 3 | import json 4 | import math 5 | import numpy as np 6 | import torch as th 7 | import torch.nn as nn 8 | import os 9 | from argparse import Namespace 10 | from collections import OrderedDict, namedtuple 11 | from multiprocessing import Pool 12 | from tqdm import tqdm 13 | 14 | from .act import * 15 | 16 | csv.field_size_limit(500 * 1024 * 1024) 17 | 18 | 19 | def get_subdirs(dirpath, leaf_only=True): 20 | subdirs = list() 21 | is_leaf = True 22 | for filename in os.listdir(dirpath): 23 | filename = os.path.join(dirpath, filename) 24 | if os.path.isdir(filename): 25 | is_leaf = False 26 | subdirs.extend(get_subdirs(filename, leaf_only=leaf_only)) 27 | if not leaf_only or is_leaf: 28 | subdirs.append(dirpath) 29 | return subdirs 30 | 31 | 32 | def get_files(dirpath): 33 | files = list() 34 | for filename in os.listdir(dirpath): 35 | filename = os.path.join(dirpath, filename) 36 | if os.path.isdir(filename): 37 | files.extend(get_files(filename)) 38 | else: 39 | files.append(filename) 40 | return files 41 | 42 | 43 | def _read_graphs_from_dir(dirpath): 44 | graphs = dict() 45 | for filename in os.listdir(dirpath): 46 | if not os.path.isdir(os.path.join(dirpath, filename)): 47 | names = os.path.splitext(os.path.basename(filename)) 48 | if names[1] != ".gml": 49 | continue 50 | try: 51 | graph = ig.read(os.path.join(dirpath, filename)) 52 | graph.vs["id"] = list(map(int, graph.vs["id"])) 53 | graph.vs["label"] = list(map(int, graph.vs["label"])) 54 | graph.es["label"] = list(map(int, graph.es["label"])) 55 | graph.es["key"] = list(map(int, graph.es["key"])) 56 | graphs[names[0]] = graph 57 | except BaseException as e: 58 | print(e) 59 | break 60 | return graphs 61 | 62 | 63 | def read_graphs_from_dir(dirpath, num_workers=4): 64 | graphs = dict() 65 | subdirs = get_subdirs(dirpath) 66 | with 
Pool(num_workers if num_workers > 0 else os.cpu_count()) as pool: 67 | results = list() 68 | for subdir in subdirs: 69 | results.append((subdir, pool.apply_async(_read_graphs_from_dir, args=(subdir, )))) 70 | pool.close() 71 | 72 | for subdir, x in tqdm(results): 73 | x = x.get() 74 | graphs[os.path.basename(subdir)] = x 75 | dirpath = os.path.basename(dirpath) 76 | if dirpath in graphs and (dirpath == "graphs" or "G_" not in dirpath): 77 | graphs.update(graphs.pop(dirpath)) 78 | return graphs 79 | 80 | 81 | def read_patterns_from_dir(dirpath, num_workers=4): 82 | patterns = dict() 83 | subdirs = get_subdirs(dirpath) 84 | with Pool(num_workers if num_workers > 0 else os.cpu_count()) as pool: 85 | results = list() 86 | for subdir in subdirs: 87 | results.append((subdir, pool.apply_async(_read_graphs_from_dir, args=(subdir, )))) 88 | pool.close() 89 | 90 | for subdir, x in tqdm(results): 91 | x = x.get() 92 | patterns.update(x) 93 | dirpath = os.path.basename(dirpath) 94 | if dirpath in patterns and (dirpath == "patterns" or "P_" not in dirpath): 95 | patterns.update(patterns.pop(dirpath)) 96 | return patterns 97 | 98 | 99 | def _read_metadata_from_csv(csv_file): 100 | meta = dict() 101 | try: 102 | with open(csv_file, "r", newline="") as f: 103 | csv_reader = csv.reader(f, delimiter=",") 104 | header = next(csv_reader) 105 | gid_idx = header.index("g_id") 106 | cnt_idx = header.index("counts") 107 | iso_idx = header.index("subisomorphisms") 108 | for row in csv_reader: 109 | meta[row[gid_idx]] = { 110 | "counts": int(row[cnt_idx]), 111 | "subisomorphisms": np.asarray(eval(row[iso_idx]), dtype=np.int64) 112 | } 113 | except BaseException as e: 114 | print(csv_file, e) 115 | return meta 116 | 117 | 118 | def read_metadata_from_dir(dirpath, num_workers=4): 119 | meta = dict() 120 | files = get_files(dirpath) 121 | with Pool(num_workers if num_workers > 0 else os.cpu_count()) as pool: 122 | results = list() 123 | for filename in files: 124 | if filename.endswith(".csv"): 125 | results.append( 126 | ( 127 | os.path.splitext(os.path.basename(filename))[0], 128 | pool.apply_async(_read_metadata_from_csv, args=(filename, )) 129 | ) 130 | ) 131 | pool.close() 132 | 133 | for p_id, x in tqdm(results): 134 | x = x.get() 135 | if p_id not in meta: 136 | meta[p_id] = x 137 | else: 138 | meta[p_id].update(x) 139 | dirpath = os.path.basename(dirpath) 140 | if dirpath in meta and dirpath == "metadata": 141 | meta.update(meta.pop(dirpath)) 142 | return meta 143 | 144 | 145 | def load_data(pattern_dir, graph_dir, metadata_dir, num_workers=4): 146 | patterns = read_patterns_from_dir(pattern_dir, num_workers=num_workers) 147 | graphs = read_graphs_from_dir(graph_dir, num_workers=num_workers) 148 | meta = read_metadata_from_dir(metadata_dir, num_workers=num_workers) 149 | 150 | if os.path.exists(os.path.join(metadata_dir, "train.txt")): 151 | train_indices = set([int(x) for x in open(os.path.join(metadata_dir, "train.txt"))]) 152 | else: 153 | train_indices = None 154 | if os.path.exists(os.path.join(metadata_dir, "dev.txt")): 155 | dev_indices = set([int(x) for x in open(os.path.join(metadata_dir, "dev.txt"))]) 156 | else: 157 | dev_indices = None 158 | if os.path.exists(os.path.join(metadata_dir, "test.txt")): 159 | test_indices = set([int(x) for x in open(os.path.join(metadata_dir, "test.txt"))]) 160 | else: 161 | test_indices = None 162 | 163 | train_data, dev_data, test_data = list(), list(), list() 164 | shared_graph = True 165 | for p, pattern in patterns.items(): 166 | # each pattern corresponds to 
specific graphs 167 | if p in graphs: 168 | shared_graph = False 169 | for g, graph in graphs[p].items(): 170 | x = dict() 171 | x["id"] = ("%s-%s" % (p, g)) 172 | x["pattern"] = pattern 173 | x["graph"] = graph 174 | x["subisomorphisms"] = meta[p][g]["subisomorphisms"] 175 | x["counts"] = meta[p][g]["counts"] 176 | 177 | g_idx = int(g.rsplit("_", 1)[-1]) 178 | if train_indices is not None: 179 | if g_idx in train_indices: 180 | train_data.append(x) 181 | elif g_idx % 10 > 1: 182 | train_data.append(x) 183 | if dev_indices is not None: 184 | if g_idx in dev_indices: 185 | dev_data.append(x) 186 | elif g_idx % 10 == 0: 187 | dev_data.append(x) 188 | if test_indices is not None: 189 | if g_idx in test_indices: 190 | test_data.append(x) 191 | elif g_idx % 10 == 1: 192 | test_data.append(x) 193 | # patterns share graphs 194 | else: 195 | for g, graph in graphs.items(): 196 | x = dict() 197 | x["id"] = ("%s-%s" % (p, g)) 198 | x["pattern"] = pattern 199 | x["graph"] = graph 200 | x["subisomorphisms"] = meta[p][g]["subisomorphisms"] 201 | x["counts"] = meta[p][g]["counts"] 202 | 203 | g_idx = int(g.rsplit("_", 1)[-1]) 204 | if train_indices is not None: 205 | if g_idx in train_indices: 206 | train_data.append(x) 207 | elif g_idx % 3 > 1: 208 | train_data.append(x) 209 | if dev_indices is not None: 210 | if g_idx in dev_indices: 211 | dev_data.append(x) 212 | elif g_idx % 3 == 0: 213 | dev_data.append(x) 214 | if test_indices is not None: 215 | if g_idx in test_indices: 216 | test_data.append(x) 217 | elif g_idx % 3 == 1: 218 | test_data.append(x) 219 | 220 | return OrderedDict({"train": train_data, "dev": dev_data, "test": test_data}), shared_graph 221 | 222 | 223 | def str2value(x): 224 | try: 225 | return eval(x) 226 | except: 227 | return x 228 | 229 | 230 | def str2bool(x): 231 | x = x.lower() 232 | return x == "true" or x == "yes" 233 | 234 | 235 | def str2list(x): 236 | results = [] 237 | for x in x.split(","): 238 | x = x.strip() 239 | if x == "" or x == "null": 240 | continue 241 | try: 242 | x = str2value(x) 243 | except: 244 | pass 245 | results.append(x) 246 | return results 247 | 248 | 249 | def load_config(path, as_dict=True): 250 | with open(path, "r") as f: 251 | config = json.load(f) 252 | if not as_dict: 253 | config = namedtuple("config", config.keys())(*config.values()) 254 | return config 255 | 256 | 257 | def save_config(config, path): 258 | if isinstance(config, dict): 259 | pass 260 | elif isinstance(config, Namespace): 261 | config = vars(config) 262 | else: 263 | try: 264 | config = config._as_dict() 265 | except BaseException as e: 266 | raise e 267 | 268 | with open(path, "w") as f: 269 | json.dump(config, f) 270 | 271 | 272 | class TensorEncoder(json.JSONEncoder): 273 | def default(self, obj): 274 | if isinstance(obj, np.integer): 275 | return int(obj) 276 | elif isinstance(obj, np.floating): 277 | return float(obj) 278 | elif isinstance(obj, (np.ndarray, th.Tensor)): 279 | return obj.tolist() 280 | else: 281 | return super(TensorEncoder, self).default(obj) 282 | 283 | 284 | def load_results(path): 285 | with open(path, "w") as f: 286 | results = json.load(f) 287 | return results 288 | 289 | 290 | def save_results(results, path): 291 | with open(path, "w") as f: 292 | json.dump(results, f, cls=TensorEncoder) 293 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | 6 
| def init_logger(log_file=None, log_tag="GOOD LUCK"): 7 | logger = logging.getLogger() 8 | logger.setLevel(logging.INFO) 9 | 10 | log_format = logging.Formatter("[%(asctime)s {log_tag}] %(message)s".format(log_tag=log_tag)) 11 | console_handler = logging.StreamHandler() 12 | console_handler.setFormatter(log_format) 13 | logger.addHandler(console_handler) 14 | 15 | if log_file: 16 | os.makedirs(os.path.dirname(log_file), exist_ok=True) 17 | file_handler = logging.FileHandler(log_file) 18 | file_handler.setFormatter(log_format) 19 | logger.addHandler(file_handler) 20 | 21 | setattr(logger, "prefix_len", len(logger.handlers[0].formatter._fmt)) 22 | 23 | return logger 24 | 25 | 26 | def close_logger(logger): 27 | handlers = logger.handlers[:] 28 | for handler in handlers: 29 | handler.close() 30 | logger.removeHandler(handler) 31 | 32 | 33 | def generate_log_line(data_type, epoch=-1, total_epochs=-1, step=-1, total_steps=-1, **kw): 34 | line = ["data_type: {:<10s}".format(data_type)] 35 | if epoch != -1 and total_epochs != -1: 36 | line.append("epoch: {:0>5d}/{:0>5d}".format(epoch, total_epochs)) 37 | if step != -1 and total_steps != -1: 38 | line.append("step: {:0>5d}/{:0>5d}".format(step, total_steps)) 39 | for k, v in kw.items(): 40 | if isinstance(v, float): 41 | line.append("{}: {:8>5.3f}".format(k, v)) 42 | elif isinstance(v, int): 43 | line.append("{}: {:0>3d}".format(k, v)) 44 | else: 45 | line.append("{}: {}".format(k, v)) 46 | line = "\t".join(line) 47 | return line 48 | 49 | 50 | def generate_best_line(data_type, epoch, total_epochs, **kw): 51 | line = \ 52 | ["data_type: " + str(data_type)] + \ 53 | ["best %s: %s" % (str(k), str(v)) for k, v in kw.items()] + \ 54 | ["(epoch: %d/%d)" % (epoch, total_epochs)] 55 | line = "\t".join(line) 56 | return line 57 | 58 | 59 | def get_best_epochs(log_file): 60 | # 0: data type 61 | # 1: metric name 62 | # 2: metric value 63 | # 3: epoch 64 | regex = re.compile( 65 | r"data_type:\s+(\w+)\s+best\s+([a-zA-Z0-9\.\-\+\_]+):\s+([a-zA-Z0-9\.\-\+\_]+)\s+\(epoch:\s+(\d+)/\d+\)" 66 | ) 67 | best_epochs = dict() 68 | # get the best epoch 69 | with open(log_file, "r") as f: 70 | for line in f: 71 | matched_results = regex.findall(line) 72 | for matched_result in matched_results: 73 | if matched_result[1] not in best_epochs: 74 | best_epochs[matched_result[1]] = dict() 75 | best_epochs[matched_result[1]][matched_result[0]] = (int(matched_result[3]), float(matched_result[2])) 76 | return best_epochs 77 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/sampler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch as th 5 | 6 | from collections import OrderedDict 7 | from torch.utils.data import Sampler 8 | 9 | 10 | class BucketSampler(Sampler): 11 | def __init__( 12 | self, 13 | dataset, 14 | group_by, 15 | batch_size, 16 | shuffle=False, 17 | seed=0, 18 | drop_last=False 19 | ): 20 | super(BucketSampler, self).__init__(dataset) 21 | self.dataset = dataset 22 | if isinstance(group_by, str): 23 | group_by = [group_by] 24 | self.group_by = group_by 25 | self.cache = OrderedDict() 26 | for attr in group_by: 27 | self.cache[attr] = th.tensor([x[attr] for x in dataset], dtype=th.float32) 28 | self.batch_size = batch_size 29 | self.seed = seed 30 | self.shuffle = shuffle 31 | self.drop_last = drop_last 32 | self.epoch = 0 33 | 34 | if drop_last: 35 | self.num_samples = math.ceil((len(self.dataset) 
- self.batch_size) / self.batch_size) * self.batch_size 36 | else: 37 | self.num_samples = math.ceil(len(self.dataset) / self.batch_size) * self.batch_size 38 | self.total_size = self.num_samples 39 | 40 | def __iter__(self): 41 | rng = th.Generator() 42 | rng.manual_seed(self.seed + self.epoch) 43 | array = th.stack(list(self.cache.values()), dim=-1) 44 | 45 | if not self.drop_last: 46 | ind = th.arange(len(self.dataset)) 47 | padding_size = self.total_size - len(self.dataset) 48 | while padding_size > len(array): 49 | ind = th.cat([ind, ind], dim=0) 50 | padding_size -= len(array) 51 | if padding_size > 0: 52 | ind = th.cat([ind, th.randperm(len(self.dataset))[:padding_size]], dim=0) 53 | array = array[ind] 54 | else: 55 | ind = th.arange(self.total_size) 56 | array = array[:self.total_size] 57 | assert len(array) == self.total_size 58 | 59 | rand = th.rand((self.total_size, 1), generator=rng) 60 | array = th.cat([array, rand], dim=-1) 61 | array = array.numpy().view(list(zip(list(self.cache.keys()) + ["rand"], 62 | [np.float32] * (len(self.cache) + 1)))).flatten() 63 | indices = np.argsort(array, axis=0, order=self.group_by) 64 | batches = [indices[i:i + self.batch_size] for i in range(0, len(indices), self.batch_size)] 65 | 66 | if self.shuffle: 67 | indices = th.randperm(len(batches), generator=rng) 68 | batches = batches[indices] 69 | 70 | batch_idx = 0 71 | while batch_idx < len(batches): 72 | yield ind[batches[batch_idx]] 73 | batch_idx += 1 74 | 75 | def __len__(self): 76 | return self.num_samples // self.batch_size 77 | 78 | def set_epoch(self, epoch=-1): 79 | if epoch == -1: 80 | self.epoch += 1 81 | else: 82 | self.epoch = epoch 83 | 84 | 85 | class CircurriculumSampler(BucketSampler): 86 | def __init__( 87 | self, 88 | dataset, 89 | learning_by, 90 | used_ratio, 91 | batch_size, 92 | group_by=None, 93 | shuffle=False, 94 | seed=0, 95 | drop_last=False 96 | ): 97 | if isinstance(learning_by, str): 98 | learning_by = [learning_by] 99 | if isinstance(group_by, str): 100 | group_by = [group_by] 101 | elif group_by is None: 102 | group_by = learning_by 103 | super(CircurriculumSampler, self).__init__(dataset, group_by, batch_size, shuffle, seed, drop_last) 104 | self.learning_by = learning_by 105 | for attr in learning_by: 106 | if attr not in self.cache: 107 | self.cache[attr] = th.tensor([x[attr] for x in dataset], dtype=th.float32) 108 | 109 | self.used_ratio = used_ratio 110 | 111 | def __iter__(self): 112 | rng = th.Generator() 113 | rng.manual_seed(self.seed + self.epoch) 114 | array = th.stack(list(self.cache.values()), dim=-1) 115 | 116 | if not self.drop_last: 117 | ind = th.arange(len(self.dataset)) 118 | padding_size = self.total_size - len(self.dataset) 119 | while padding_size > len(array): 120 | ind = th.cat([ind, ind], dim=0) 121 | padding_size -= len(array) 122 | if padding_size > 0: 123 | ind = th.cat([ind, th.randperm(len(self.dataset))[:padding_size]], dim=0) 124 | array = array[ind] 125 | else: 126 | ind = th.arange(self.total_size) 127 | array = array[:self.total_size] 128 | assert len(array) == self.total_size 129 | 130 | rand = th.rand((self.total_size, 1), generator=rng) 131 | array = th.cat([array, rand], dim=-1) 132 | array = array.numpy().view(list(zip(list(self.cache.keys()) + ["rand"], 133 | [np.float32] * (len(self.cache) + 1)))).flatten() 134 | 135 | if self.learning_by == self.group_by or self.learning_by == self.group_by[:len(self.learning_by)]: 136 | group_indices = np.argsort(array, axis=0, order=self.group_by) 137 | indices = 
group_indices[:math.ceil(self.used_ratio * len(group_indices))] 138 | else: 139 | learn_indices = np.argsort(array, axis=0, order=self.learning_by) 140 | learn_indices = learn_indices[:int(self.used_ratio * len(learn_indices))] 141 | indices = np.argsort(array[learn_indices], axis=0, order=self.group_by) 142 | 143 | batches = [indices[i:i + self.batch_size] for i in range(0, len(indices), self.batch_size)] 144 | 145 | if self.shuffle: 146 | batches = [batches[i] for i in th.randperm(len(batches), generator=rng).tolist()] 147 | 148 | batch_idx = 0 149 | while batch_idx < len(batches): 150 | yield ind[batches[batch_idx]] 151 | batch_idx += 1 152 | -------------------------------------------------------------------------------- /SubgraphCountingMatching/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch.optim.lr_scheduler import LambdaLR 3 | 4 | 5 | PI = 3.141592653589793 6 | INIT_STEPS = 600 7 | SCHEDULE_STEPS = 10000 8 | NUM_CYCLES = 2 9 | MIN_PERCENT = 1e-3 10 | 11 | 12 | class ConstantScheduler(LambdaLR): 13 | def __init__(self): 14 | pass 15 | 16 | def set_optimizer(self, optimizer): 17 | super(ConstantScheduler, self).__init__(optimizer, self.lr_lambda) 18 | 19 | def lr_lambda(self, current_step): 20 | return 1.0 21 | 22 | 23 | class ConstantWarmupScheduler(LambdaLR): 24 | def __init__( 25 | self, 26 | num_warmup_steps=INIT_STEPS 27 | ): 28 | self.num_warmup_steps = num_warmup_steps 29 | 30 | def set_optimizer(self, optimizer): 31 | super(ConstantWarmupScheduler, self).__init__(optimizer, self.lr_lambda) 32 | 33 | def lr_lambda(self, current_step): 34 | if current_step < self.num_warmup_steps: 35 | return float(current_step) / max(1.0, float(self.num_warmup_steps)) 36 | return 1.0 37 | 38 | 39 | class LinearScheduler(LambdaLR): 40 | def __init__( 41 | self, 42 | num_schedule_steps=SCHEDULE_STEPS, 43 | min_percent=MIN_PERCENT 44 | ): 45 | self.num_schedule_steps = num_schedule_steps 46 | self.min_percent = min_percent 47 | 48 | def set_optimizer(self, optimizer): 49 | super(LinearScheduler, self).__init__(optimizer, self.lr_lambda) 50 | 51 | def lr_lambda(self, current_step): 52 | return max( 53 | self.min_percent, 54 | float(self.num_schedule_steps - current_step) / \ 55 | float(max(1, self.num_schedule_steps)) 56 | ) 57 | 58 | 59 | class LinearWarmupScheduler(LambdaLR): 60 | def __init__( 61 | self, 62 | num_warmup_steps=INIT_STEPS, 63 | num_schedule_steps=SCHEDULE_STEPS, 64 | min_percent=MIN_PERCENT 65 | ): 66 | self.num_warmup_steps = num_warmup_steps 67 | self.num_schedule_steps = num_schedule_steps 68 | self.min_percent = min_percent 69 | 70 | def set_optimizer(self, optimizer): 71 | super(LinearWarmupScheduler, self).__init__(optimizer, self.lr_lambda) 72 | 73 | def lr_lambda(self, current_step): 74 | if current_step < self.num_warmup_steps: 75 | return float(current_step) / float(max(1, self.num_warmup_steps)) 76 | return max( 77 | self.min_percent, 78 | float(self.num_schedule_steps - current_step) / \ 79 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 80 | ) 81 | 82 | 83 | class LinearWarmupRestartScheduler(LambdaLR): 84 | def __init__( 85 | self, 86 | num_warmup_steps=INIT_STEPS, 87 | num_schedule_steps=SCHEDULE_STEPS, 88 | num_cycles=NUM_CYCLES, 89 | min_percent=MIN_PERCENT 90 | ): 91 | self.num_warmup_steps = num_warmup_steps 92 | self.num_schedule_steps = num_schedule_steps 93 | self.num_cycles = num_cycles 94 | self.min_percent = min_percent 95 | 96 | def set_optimizer(self, 
optimizer): 97 | super(LinearWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 98 | 99 | def lr_lambda(self, current_step): 100 | if current_step < self.num_warmup_steps: 101 | return float(current_step) / float(max(1, self.num_warmup_steps)) 102 | progress = float(current_step - self.num_warmup_steps) / \ 103 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 104 | if progress >= 1.0: 105 | return self.min_percent 106 | return max(self.min_percent, 1 - (float(self.num_cycles) * progress) % 1.0) 107 | 108 | 109 | class CosineScheduler(LambdaLR): 110 | def __init__( 111 | self, 112 | num_schedule_steps=SCHEDULE_STEPS, 113 | num_cycles=NUM_CYCLES, 114 | min_percent=MIN_PERCENT 115 | ): 116 | self.num_schedule_steps = num_schedule_steps 117 | self.num_cycles = num_cycles 118 | self.min_percent = min_percent 119 | 120 | def set_optimizer(self, optimizer): 121 | super(CosineScheduler, self).__init__(optimizer, self.lr_lambda) 122 | 123 | def lr_lambda(self, current_step): 124 | progress = float(current_step) / float(max(1, self.num_schedule_steps)) 125 | return max(self.min_percent, 0.5 * (1.0 + math.cos(PI * float(self.num_cycles) * 2.0 * progress))) 126 | 127 | 128 | class CosineWarmupScheduler(LambdaLR): 129 | def __init__( 130 | self, 131 | num_warmup_steps=INIT_STEPS, 132 | num_schedule_steps=SCHEDULE_STEPS, 133 | num_cycles=NUM_CYCLES, 134 | min_percent=MIN_PERCENT 135 | ): 136 | self.num_warmup_steps = num_warmup_steps 137 | self.num_schedule_steps = num_schedule_steps 138 | self.num_cycles = num_cycles 139 | self.min_percent = min_percent 140 | 141 | def set_optimizer(self, optimizer): 142 | super(CosineWarmupScheduler, self).__init__(optimizer, self.lr_lambda) 143 | 144 | def lr_lambda(self, current_step): 145 | if current_step < self.num_warmup_steps: 146 | return float(current_step) / float(max(1, self.num_warmup_steps)) 147 | progress = float(current_step - self.num_warmup_steps) / \ 148 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 149 | return max(self.min_percent, 0.5 * (1.0 + math.cos(PI * float(self.num_cycles) * 2.0 * progress))) 150 | 151 | 152 | class CosineWarmupRestartScheduler(LambdaLR): 153 | def __init__( 154 | self, 155 | num_warmup_steps=INIT_STEPS, 156 | num_schedule_steps=SCHEDULE_STEPS, 157 | num_cycles=NUM_CYCLES, 158 | min_percent=MIN_PERCENT 159 | ): 160 | self.num_warmup_steps = num_warmup_steps 161 | self.num_schedule_steps = num_schedule_steps 162 | self.num_cycles = num_cycles 163 | self.min_percent = min_percent 164 | 165 | def set_optimizer(self, optimizer): 166 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 167 | 168 | def lr_lambda(self, current_step): 169 | if current_step < self.num_warmup_steps: 170 | return float(current_step) / float(max(1, self.num_warmup_steps)) 171 | progress = float(current_step - self.num_warmup_steps) / \ 172 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 173 | if progress >= 1.0: 174 | return self.min_percent 175 | return max(self.min_percent, 0.5 * (1.0 + math.cos(PI * ((float(self.num_cycles) * progress) % 1.0)))) 176 | 177 | 178 | supported_schedulers = { 179 | "constant": ConstantScheduler(), 180 | "constant_with_warmup": ConstantWarmupScheduler(), 181 | "linear": LinearScheduler(), 182 | "linear_with_warmup": LinearWarmupScheduler(), 183 | "linear_with_warmup_and_restart": LinearWarmupRestartScheduler(), 184 | "cosine": CosineScheduler(), 185 | "cosine_with_warmup": CosineWarmupScheduler(), 186 | 
"cosine_with_warmup_and_restart": CosineWarmupRestartScheduler(), 187 | } 188 | 189 | 190 | def map_scheduler_str_to_scheduler(scheduler, **kw): 191 | if scheduler not in supported_schedulers: 192 | raise NotImplementedError 193 | 194 | sdlr = supported_schedulers[scheduler] 195 | for k, v in kw.items(): 196 | if hasattr(sdlr, k): 197 | try: 198 | setattr(sdlr, k, v) 199 | except: 200 | pass 201 | return sdlr 202 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Data/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Carl Yang (yangji9181@gmail.com) provides 4 HIN benchmark datasets: ```DBLP```, ```Yelp```, ```Freebase```, and ```PubMed```. 4 | 5 | Users can retrieve them here and unzip the downloaded file to the current folder. To reproduce the experiments in AAAI"2022, please download ```PubMed``` and ```Yelp```. 6 | 7 | The statistics of each dataset are as follows. 8 | 9 | **Dataset** | #node types | #nodes | #link types | #links | #attributes | #attributed nodes | #label types | #labeled nodes 10 | --- | --- | --- | --- | --- | --- | --- | --- | --- 11 | **PubMed** | 4 | 63,109 | 10 | 244,986 | 200 | ALL | 8 | 454 12 | **Yelp** | 4 | 82,465 | 4 | 30,542,675 | N/A | N/A | 16 | 7,417 13 | 14 | Each dataset contains: 15 | - 3 data files (```node.dat```, ```link.dat```, ```label.dat```); 16 | - 2 evaluation files (```link.dat.test```, ```label.dat.test```); 17 | - 2 description files (```meta.dat```, ```info.dat```); 18 | - 1 recording file (```record.dat```). 19 | 20 | ### node.dat 21 | 22 | - In each line, there are 4 elements (```node_id```, ```node_name```, ```node_type```, ```node_attributes```) separated by ```\t```. 23 | - ```Node_name``` is in Line ```node_id```. 24 | - In ```node_name```, empty space (``` ```) is replaced by underscore (```_```). 25 | - In ```node_attributes```, attributes are separated by comma (```,```). 26 | - ```DBLP``` and ```PubMed``` contain attributes, while ```Freebase``` and ```Yelp``` do not contain attributes. 27 | 28 | ### link.dat 29 | 30 | - In each line, there are 4 elements (```node_id```, ```node_id```, ```link_type```, ```link_weight```) separated by ```\t```. 31 | - All links are directed. Each node is connected by at least one link. 32 | 33 | ### label.dat 34 | 35 | - In each line, there are 4 elements (```node_id```, ```node_name```, ```node_type```, ```node_label```) separated by ```\t```. 36 | - All labeled nodes are of the same ```node_type```. 37 | - For ```DBLP```, ```Freebase```, and ```PubMed```, each labeled node only has one label. For ```Yelp```, each labeled node has one or multiple labels separated by ```,```. 38 | - For unsupervised training, ```label.dat``` and ```label.dat.test``` are merged for five-fold cross validation. For semi-supervised training, ```label.dat``` is used for training and ```label.dat.test``` is used for testing. 39 | 40 | ### link.dat.test 41 | 42 | - In each line, there are 3 elements (```node_id```, ```node_id```, ```link_status```) separated by ```\t```. 43 | - For ```link_status```, ```1``` indicates a positive link and ```0``` indicates a negative link. 44 | - Positive and negative links are of the same ```link_type```. 45 | - Number of positive links = Number of negative links = One fourth of the number of real links of the same type in ```label.dat```. 
46 | 47 | ### label.dat.test 48 | 49 | - In each line, there are 4 elements (```node_id```, ```node_name```, ```node_type```, ```node_label```) separated by ```\t```. 50 | - All labeled nodes are of the same ```node_type```. 51 | - Number of labeled nodes in ```label.dat.test``` = One fourth of the number of labeled nodes in ```label.dat```. 52 | - For ```DBLP```, ```Freebase```, and ```PubMed```, each labeled node only has one label. For ```Yelp```, each labeled node has one or multiple labels separated by ```,```. 53 | - For unsupervised training, ```label.dat``` and ```label.dat.test``` are merged for five-fold cross validation. For semi-supervised training, ```label.dat``` is used for training and ```label.dat.test``` is used for testing. 54 | 55 | ### meta.dat 56 | 57 | - This file describes the number of instances of each node type, link type, and label type in the corresponding dataset. 58 | 59 | ### info.dat 60 | 61 | - This file describes the meaning of each node type, link type, and label type in the corresponding dataset. 62 | 63 | ### record.dat 64 | 65 | - In each paragraph, the first line tells the model and evaluation settings, the second line tells the set of training parameters, and the third line tells the evaluation results. 66 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluate 2 | 3 | This stage evaluates the output embeddings based on specific tasks. 4 | 5 | Users need to specify the following parameters in ```evaluate.sh```: 6 | - **dataset**: choose from ```PubMed``` and ```Yelp```; 7 | - **model**: choose from ```DMPNN```, ```CompGCN```, ```R-GIN```; 8 | - **attributed**: choose ```True``` for attributed training or ```False``` for unattributed training; 9 | - **supervised**: choose ```True``` for semi-supervised training or ```False``` for unsupervised training; 10 | - **task**: choose ```nc``` for node classification, ```lp``` for link prediction, or ```both``` for both tasks. 11 | 12 | *Note: Only Message-Passing Methods (```R-GCN```, ```R-GIN```, ```CompGCN```, ```DMPNN```, ```HAN```, ```MAGNN```, ```HGT```) support attributed or semi-supervised training.*
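For reference, ```evaluate.sh``` expands these parameters into direct calls of ```evaluate.py```. A single unattributed, unsupervised evaluation of ```DMPNN``` embeddings on ```PubMed``` would look roughly like the sketch below; when ```--emb_file``` and ```--record_file``` are omitted, ```evaluate.py``` falls back to ```../Model/DMPNN/data/PubMed/emb.dat``` and ```../Data/PubMed/record.dat```.

```bash
python evaluate.py \
    --dataset PubMed \
    --model DMPNN \
    --task both \
    --attributed False \
    --supervised False
```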
13 | *Note: Only ```DBLP``` and ```PubMed``` contain node attributes.* 14 | 15 | **Node Classification**:
16 | We train a separate linear Support Vector Machine (LinearSVC) on the learned embeddings of 80% of the labeled nodes and predict on the remaining 20%. We repeat this process for a standard five-fold cross validation and report the average **Macro-F1** (across all labels) and **Micro-F1** (across all nodes). 17 | 18 | **Link Prediction**:
19 | We use the Hadamard function to construct feature vectors for node pairs, train a two-class LinearSVC on the 80% training links and evaluate towards the 20% held out links. We repeat the process for standard five-fold cross validation and compute the average scores regarding **AUC** (area under the ROC curve) and **MRR** (mean reciprocal rank). 20 | 21 | Run ```bash evaluate.sh``` to complete *Stage 4: Evaluate*. 22 | 23 | The evaluation results are stored in ```record.dat``` of the corresponding dataset. -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from link_prediction import * 4 | from node_classification import * 5 | 6 | data_folder, model_folder = '../Data', '../Model' 7 | emb_file, record_file = 'emb.dat', 'record.dat' 8 | link_test_file, label_test_file, label_file = 'link.dat.test', 'label.dat.test', 'label.dat' 9 | 10 | 11 | def parse_args(): 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | '--dataset', required=True, type=str, choices=['DBLP', 'Freebase', 'PubMed', 'Yelp'], 16 | help='Targeting dataset.' 17 | ) 18 | parser.add_argument( 19 | '--model', 20 | required=True, 21 | type=str, 22 | help='Targeting model.', 23 | choices=[ 24 | 'metapath2vec-ESim', 'PTE', 'HIN2Vec', 'AspEm', 'HEER', 'R-GCN', 'R-GIN', 'CompGCN', 'DMPNN', 'HAN', 25 | 'MAGNN', 'HGT', 'TransE', 'DistMult', 'ComplEx', 'ConvE' 26 | ] 27 | ) 28 | parser.add_argument('--task', required=True, type=str, help='Targeting task.', choices=['nc', 'lp', 'both']) 29 | parser.add_argument( 30 | '--attributed', 31 | required=True, 32 | type=str, 33 | help='Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support attributed training.', 34 | choices=['True', 'False'] 35 | ) 36 | parser.add_argument( 37 | '--supervised', 38 | required=True, 39 | type=str, 40 | help='Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support semi-supervised training.', 41 | choices=['True', 'False'] 42 | ) 43 | parser.add_argument('--emb_file', type=str, default='') 44 | parser.add_argument('--record_file', type=str, default='') 45 | parser.add_argument('--link_test_file', type=str, default='') 46 | parser.add_argument('--label_test_file', type=str, default='') 47 | parser.add_argument('--label_file', type=str, default='') 48 | 49 | return parser.parse_args() 50 | 51 | 52 | def load(emb_file_path): 53 | 54 | emb_dict = {} 55 | with open(emb_file_path, 'r') as emb_file: 56 | for i, line in enumerate(emb_file): 57 | if i == 0: 58 | train_para = line[:-1] 59 | else: 60 | index, emb = line[:-1].split('\t') 61 | emb_dict[index] = np.array(emb.split()).astype(np.float32) 62 | 63 | return train_para, emb_dict 64 | 65 | 66 | def record(args, all_tasks, train_para, all_scores): 67 | 68 | if args.record_file != '': 69 | filename = args.record_file 70 | else: 71 | filename = f'{data_folder}/{args.dataset}/{record_file}' 72 | with open(filename, 'a') as file: 73 | for task, score in zip(all_tasks, all_scores): 74 | file.write(f'model={args.model}, task={task}, attributed={args.attributed}, supervised={args.supervised}\n') 75 | file.write(f'{train_para}\n') 76 | if task == 'nc': 77 | file.write(f'Macro-F1={score[0]:.4f}, Micro-F1={score[1]:.4f}\n') 78 | elif task == 'lp': 79 | file.write(f'AUC={score[0]:.4f}, MRR={score[1]:.4f}\n') 80 | file.write('\n') 81 | 82 | return 83 | 84 | 85 | def check(args): 
86 | 87 | if args.attributed == 'True': 88 | if args.model not in ['R-GCN', 'R-GIN', 'CompGCN', 'DMPNN', 'HAN', 'MAGNN', 'HGT']: 89 | print(f'{args.model} does not support attributed training!') 90 | print('Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support attributed training!') 91 | return False 92 | if args.dataset not in ['DBLP', 'PubMed']: 93 | print(f'{args.dataset} does not support attributed training!') 94 | print('Only DBLP and PubMed support attributed training!') 95 | return False 96 | 97 | if args.supervised == 'True': 98 | if args.model not in ['R-GCN', 'R-GIN', 'CompGCN', 'DMPNN', 'HAN', 'MAGNN', 'HGT']: 99 | print(f'{args.model} does not support semi-supervised training!') 100 | print('Only R-GCN, R-GIN, CompGCN, DMPNN, HAN, MAGNN, and HGT support semi-supervised training!') 101 | return False 102 | 103 | return True 104 | 105 | 106 | def main(): 107 | 108 | args = parse_args() 109 | 110 | if not check(args): 111 | return 112 | 113 | print('Load Embeddings!') 114 | print(args) 115 | if args.emb_file != '': 116 | emb_file_path = args.emb_file 117 | else: 118 | emb_file_path = f'{model_folder}/{args.model}/data/{args.dataset}/{emb_file}' 119 | train_para, emb_dict = load(emb_file_path) 120 | 121 | if args.label_file != '': 122 | label_file_path = args.label_file 123 | else: 124 | label_file_path = f'{data_folder}/{args.dataset}/{label_file}' 125 | 126 | if args.label_test_file != '': 127 | label_test_file_path = args.label_test_file 128 | else: 129 | label_test_file_path = f'{data_folder}/{args.dataset}/{label_test_file}' 130 | 131 | if args.link_test_file != '': 132 | link_test_file_path = args.link_test_file 133 | else: 134 | link_test_file_path = f'{data_folder}/{args.dataset}/{link_test_file}' 135 | 136 | print('Start Evaluation!') 137 | all_tasks, all_scores = [], [] 138 | if args.task == 'nc' or args.task == 'both': 139 | print(f'Evaluate Node Classification Performance for Model {args.model} on Dataset {args.dataset}!') 140 | scores = nc_evaluate(args.dataset, args.supervised, label_file_path, label_test_file_path, emb_dict) 141 | all_tasks.append('nc') 142 | all_scores.append(scores) 143 | print(f'Macro-F1={scores[0]:.4f}, Micro-F1={scores[1]:.4f}') 144 | if args.task == 'lp' or args.task == 'both': 145 | print(f'Evaluate Link Prediction Performance for Model {args.model} on Dataset {args.dataset}!') 146 | scores = lp_evaluate(link_test_file_path, emb_dict) 147 | all_tasks.append('lp') 148 | all_scores.append(scores) 149 | print(f'AUC={scores[0]:.4f}, MRR={scores[1]:.4f}') 150 | 151 | print('Record Results!') 152 | record(args, all_tasks, train_para, all_scores) 153 | 154 | return 155 | 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: Only "R-GCN", "R-GIN", "CompGCN", "ConjGCN", "HAN", "MAGNN", and "HGT" support attributed="True" or supervised="True" 4 | # Note: Only "DBLP" and "PubMed" support attributed="True" 5 | 6 | attributed="False" 7 | supervised="False" 8 | negative_sample=5 9 | dropout=0.2 10 | n_hidden=50 11 | n_epochs=50 # the epoch here is different with the epoch in original HNE 12 | grad_norm=1.0 13 | sampler=randomwalk 14 | 15 | for dataset in "PubMed" "Yelp" 16 | do 17 | for model in "DMPNN" "CompGCN" "R-GIN" "R-GCN" 18 | do 19 | folder="../Model/${model}/data/${dataset}/" 20 | 
node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | # record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | record_file="${folder}record_noattr_unsup_hidden${n_hidden}.dat" 34 | OMP_NUM_THREADS=4 python evaluate.py \ 35 | --dataset ${dataset} \ 36 | --model ${model} \ 37 | --task nc \ 38 | --attributed ${attributed} \ 39 | --supervised ${supervised} \ 40 | --emb_file ${emb_file} \ 41 | --record_file ${record_file} 42 | done 43 | done 44 | done 45 | done 46 | done 47 | exit 1 48 | done 49 | 50 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/link_prediction.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from collections import defaultdict 4 | from sklearn.svm import LinearSVC 5 | from sklearn.model_selection import KFold 6 | from sklearn.metrics import roc_auc_score 7 | from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning 8 | from utils import SingleLabelBinarySeachCV, MultiLabelBinarySeachCV 9 | 10 | seed = 1 11 | max_iter = 300 12 | np.random.seed(seed) 13 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 14 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 15 | 16 | 17 | def cross_validation(edge_embs, edge_labels): 18 | 19 | auc, mrr = [], [] 20 | seed_nodes, num_nodes = np.array(list(edge_embs.keys())), len(edge_embs) 21 | 22 | skf = KFold(n_splits=5, shuffle=True, random_state=seed) 23 | for fold, (train_idx, test_idx) in enumerate(skf.split(np.zeros((num_nodes, 1)), np.zeros(num_nodes))): 24 | 25 | print(f'Start Evaluation Fold {fold}!') 26 | train_nodes = seed_nodes[train_idx] 27 | train_edge_embs = np.concatenate([edge_embs[n] for n in train_nodes]) 28 | train_edge_labels = np.concatenate([edge_labels[n] for n in train_nodes]) 29 | test_nodes = seed_nodes[test_idx] 30 | test_edge_embs = np.concatenate([edge_embs[n] for n in test_nodes]) 31 | test_edge_labels = np.concatenate([edge_labels[n] for n in test_nodes]) 32 | 33 | c = SingleLabelBinarySeachCV(train_edge_embs, train_edge_labels, multi_class="ovr") 34 | clf = LinearSVC(random_state=seed, max_iter=max_iter, multi_class="ovr", C=c) 35 | clf.fit(train_edge_embs, train_edge_labels) 36 | preds = clf.predict(test_edge_embs) 37 | auc.append(roc_auc_score(test_edge_labels, preds)) 38 | 39 | confidence = clf.decision_function(test_edge_embs) 40 | curr_mrr, conf_num = [], 0 41 | for each in test_idx: 42 | test_edge_conf = np.argsort(-confidence[conf_num:conf_num + len(edge_labels[seed_nodes[each]])]) 43 | rank = np.empty_like(test_edge_conf) 44 | rank[test_edge_conf] = np.arange(len(test_edge_conf)) 45 | curr_mrr.append(1 / (1 + np.min(rank[np.argwhere(edge_labels[seed_nodes[each]] == 1).flatten()]))) 46 | conf_num += len(rank) 47 | mrr.append(np.mean(curr_mrr)) 48 | assert conf_num == len(confidence) 49 | 50 | return np.mean(auc), np.mean(mrr) 51 | 52 | 53 | def lp_evaluate(test_file_path, emb_dict): 54 | 55 | posi, nega = defaultdict(set), defaultdict(set) 56 | with open(test_file_path, 'r') as 
test_file: 57 | for line in test_file: 58 | left, right, label = line[:-1].split('\t') 59 | if label == '1': 60 | posi[left].add(right) 61 | elif label == '0': 62 | nega[left].add(right) 63 | 64 | edge_embs, edge_labels = defaultdict(list), defaultdict(list) 65 | for left, rights in posi.items(): 66 | for right in rights: 67 | key = left + ',' + right 68 | if key in emb_dict: 69 | edge_embs[left].append(emb_dict[key]) 70 | else: 71 | edge_embs[left].append(emb_dict[left] * emb_dict[right]) 72 | edge_labels[left].append(1) 73 | for left, rights in nega.items(): 74 | for right in rights: 75 | key = left + ',' + right 76 | if key in emb_dict: 77 | edge_embs[left].append(emb_dict[key]) 78 | else: 79 | edge_embs[left].append(emb_dict[left] * emb_dict[right]) 80 | edge_labels[left].append(0) 81 | 82 | for node in edge_embs: 83 | edge_embs[node] = np.array(edge_embs[node]) 84 | edge_labels[node] = np.array(edge_labels[node]) 85 | 86 | auc, mrr = cross_validation(edge_embs, edge_labels) 87 | 88 | return auc, mrr 89 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/node_classification.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from collections import defaultdict 4 | from sklearn.svm import LinearSVC 5 | from sklearn.metrics import f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning 8 | from utils import SingleLabelBinarySeachCV, MultiLabelBinarySeachCV 9 | 10 | 11 | seed = 1 12 | max_iter = 300 13 | np.random.seed(seed) 14 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 15 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 16 | 17 | 18 | def nc_evaluate(dataset, supervised, label_file_path, label_test_path, emb_dict): 19 | 20 | if supervised == 'True': 21 | if dataset == 'Yelp': 22 | return semisupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict) 23 | elif dataset == 'DBLP' or dataset == 'Freebase' or dataset == 'PubMed': 24 | return semisupervised_single_class_single_label(label_file_path, label_test_path, emb_dict) 25 | elif supervised == 'False': 26 | if dataset == 'Yelp': 27 | return unsupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict) 28 | elif dataset == 'DBLP' or dataset == 'Freebase' or dataset == 'PubMed': 29 | return unsupervised_single_class_single_label(label_file_path, label_test_path, emb_dict) 30 | 31 | 32 | def semisupervised_single_class_single_label(label_file_path, label_test_path, emb_dict): 33 | 34 | train_labels, train_embeddings = [], [] 35 | with open(label_file_path, 'r') as label_file: 36 | for line in label_file: 37 | index, _, _, label = line.strip().split('\t') 38 | train_labels.append(label) 39 | train_embeddings.append(emb_dict[index]) 40 | train_labels, train_embeddings = np.array(train_labels).astype(int), np.array(train_embeddings) 41 | 42 | test_labels, test_embeddings = [], [] 43 | with open(label_test_path, 'r') as label_file: 44 | for line in label_file: 45 | index, _, _, label = line.strip().split('\t') 46 | test_labels.append(label) 47 | test_embeddings.append(emb_dict[index]) 48 | test_labels, test_embeddings = np.array(test_labels).astype(int), np.array(test_embeddings) 49 | 50 | c = SingleLabelBinarySeachCV(train_embeddings, train_labels, multi_class="ovr") 51 | clf = LinearSVC(random_state=seed, max_iter=max_iter, 
multi_class="ovr", C=c) 52 | clf.fit(train_embeddings, train_labels) 53 | preds = clf.predict(test_embeddings) 54 | 55 | macro = f1_score(test_labels, preds, average='macro') 56 | micro = f1_score(test_labels, preds, average='micro') 57 | 58 | return macro, micro 59 | 60 | 61 | def unsupervised_single_class_single_label(label_file_path, label_test_path, emb_dict): 62 | 63 | labels, embeddings = [], [] 64 | for file_path in [label_file_path, label_test_path]: 65 | with open(file_path, 'r') as label_file: 66 | for line in label_file: 67 | index, _, _, label = line.strip().split('\t') 68 | labels.append(label) 69 | embeddings.append(emb_dict[index]) 70 | labels, embeddings = np.array(labels).astype(int), np.array(embeddings) 71 | 72 | macro, micro = [], [] 73 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 74 | for train_idx, test_idx in skf.split(embeddings, labels): 75 | c = SingleLabelBinarySeachCV(embeddings[train_idx], labels[train_idx], multi_class="ovr") 76 | clf = LinearSVC(random_state=seed, max_iter=max_iter, multi_class="ovr", C=c) 77 | clf.fit(embeddings[train_idx], labels[train_idx]) 78 | preds = clf.predict(embeddings[test_idx]) 79 | 80 | macro.append(f1_score(labels[test_idx], preds, average='macro')) 81 | micro.append(f1_score(labels[test_idx], preds, average='micro')) 82 | print(macro) 83 | print(micro) 84 | return np.mean(macro), np.mean(micro) 85 | 86 | 87 | def semisupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict): 88 | 89 | train_embs = [] 90 | with open(label_file_path, 'r') as label_file: 91 | for line in label_file: 92 | index, _, nclass, label = line.strip().split('\t') 93 | train_embs.append(emb_dict[index]) 94 | train_embs = np.array(train_embs) 95 | 96 | test_embs = [] 97 | with open(label_test_path, 'r') as label_file: 98 | for line in label_file: 99 | index, _, nclass, label = line.strip().split('\t') 100 | test_embs.append(emb_dict[index]) 101 | test_embs = np.array(test_embs) 102 | 103 | label_count, labled_node_index = 0, 0 104 | binary_labels, label_dict, = [], {} 105 | with open(label_file_path, 'r') as label_file: 106 | for line in label_file: 107 | index, _, nclass, label = line.strip().split('\t') 108 | for each in label.split(','): 109 | if (nclass, each) not in label_dict: 110 | label_dict[(nclass, each)] = label_count 111 | label_count += 1 112 | binary_labels.append(np.zeros(len(train_embs), dtype=np.bool8)) 113 | binary_labels[label_dict[(nclass, each)]][labled_node_index] = 1 114 | labled_node_index += 1 115 | train_labels = np.vstack(binary_labels) 116 | 117 | label_count, labled_node_index = 0, 0 118 | binary_labels, label_dict, = [], {} 119 | with open(label_test_path, 'r') as label_file: 120 | for line in label_file: 121 | index, _, nclass, label = line.strip().split('\t') 122 | for each in label.split(','): 123 | if (nclass, each) not in label_dict: 124 | label_dict[(nclass, each)] = label_count 125 | label_count += 1 126 | binary_labels.append(np.zeros(len(test_embs), dtype=np.bool8)) 127 | binary_labels[label_dict[(nclass, each)]][labled_node_index] = 1 128 | labled_node_index += 1 129 | test_labels = np.vstack(binary_labels) 130 | 131 | weights, total_scores = [], [] 132 | for ntype, (train_label, test_label) in enumerate(zip(train_labels, test_labels)): 133 | c = MultiLabelBinarySeachCV(train_embs, train_label, multi_class="crammer_singer") 134 | clf = LinearSVC(random_state=seed, max_iter=max_iter, C=c, multi_class="crammer_singer") 135 | clf.fit(train_embs, train_label) 136 | preds = 
clf.predict(test_embs) 137 | scores = f1_score(test_label, preds, average='binary') 138 | weights.append(sum(test_label)) 139 | total_scores.append(scores) 140 | 141 | macro = sum(total_scores) / len(total_scores) 142 | micro = sum([score * weight for score, weight in zip(total_scores, weights)]) / sum(weights) 143 | 144 | return macro, micro 145 | 146 | 147 | def unsupervised_single_class_multi_label(label_file_path, label_test_path, emb_dict): 148 | embs = [] 149 | for file_path in [label_file_path, label_test_path]: 150 | with open(file_path, 'r') as label_file: 151 | for line in label_file: 152 | index, _, nclass, label = line.strip().split('\t') 153 | embs.append(emb_dict[index]) 154 | embs = np.array(embs) 155 | 156 | label_count, labled_node_index = 0, 0 157 | binary_labels, label_dict, = [], {} 158 | for file_path in [label_file_path, label_test_path]: 159 | with open(file_path, 'r') as label_file: 160 | for line in label_file: 161 | index, _, nclass, label = line.strip().split('\t') 162 | for each in label.split(','): 163 | if (nclass, each) not in label_dict: 164 | label_dict[(nclass, each)] = label_count 165 | label_count += 1 166 | binary_labels.append(np.zeros(len(embs), dtype=np.int32)) 167 | binary_labels[label_dict[(nclass, each)]][labled_node_index] = 1 168 | labled_node_index += 1 169 | binary_labels = np.vstack(binary_labels) 170 | 171 | cs = [] 172 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 173 | for train_idx, test_idx in skf.split(embs, binary_labels[binary_labels.sum(axis=1).argmax()]): 174 | c = MultiLabelBinarySeachCV(embs[train_idx], binary_labels[:, train_idx], multi_class="crammer_singer") 175 | cs.append(c) 176 | weights, total_scores = [], [] 177 | for ntype, binary_label in enumerate(binary_labels): 178 | scores = [] 179 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 180 | for nsplit, (train_idx, test_idx) in enumerate(skf.split(embs, binary_label)): 181 | clf = LinearSVC(random_state=seed, max_iter=max_iter, C=cs[nsplit], multi_class="crammer_singer") 182 | clf.fit(embs[train_idx], binary_label[train_idx]) 183 | preds = clf.predict(embs[test_idx]) 184 | scores.append(f1_score(binary_label[test_idx], preds, average='binary')) 185 | 186 | weights.append(sum(binary_label)) 187 | total_scores.append(sum(scores) / 5) 188 | 189 | macro = sum(total_scores) / len(total_scores) 190 | micro = sum([score * weight for score, weight in zip(total_scores, weights)]) / sum(weights) 191 | 192 | return macro, micro 193 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Evaluate/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from collections import defaultdict 4 | from sklearn.svm import LinearSVC 5 | from sklearn.metrics import f1_score 6 | from sklearn.model_selection import StratifiedKFold 7 | from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning 8 | 9 | seed = 1 10 | max_iter = 100 11 | np.random.seed(seed) 12 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 13 | warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 14 | 15 | 16 | def SingleLabelBinarySeachCV(data, labels, multi_class="ovr"): 17 | best_c = 1.0 18 | c0 = np.power(10.0, -(labels.max() - labels.min() + 1)) 19 | c1 = 1 / c0 20 | cnt = 0 21 | max_cnt = 2 * (labels.max() - labels.min() + 1) - 1 22 | while cnt < max_cnt and np.abs(c0 - c1) > 1e-10: 23 |
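        # Note on this search: rather than a conventional grid search over C,
        # the loop brackets C between a very small value c0 (10^-K for K
        # classes, e.g. 1e-4 when labels run 0..3) and a very large value c1
        # (1/c0). Each round fits LinearSVC at both endpoints on a growing
        # random subsample with a growing iteration budget, scores both fits on
        # that same subsample by Macro-F1 + Micro-F1, keeps the better endpoint
        # as best_c, and moves the losing endpoint one decade toward the winner
        # (c1 /= 10 or c0 *= 10). The search stops after max_cnt rounds or once
        # the endpoints numerically meet; the callers in link_prediction.py and
        # node_classification.py then reuse the returned C for their final
        # LinearSVC fits.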
np.random.seed(cnt) 24 | index = np.random.choice(len(data), size=(int(len(data) * (cnt+1) / max_cnt), ), replace=False) 25 | cur_data, cur_labels = data[index], labels[index] 26 | clf0 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class=multi_class, C=c0) 27 | clf0.fit(cur_data, cur_labels) 28 | preds0 = clf0.predict(cur_data) 29 | macro0 = f1_score(cur_labels, preds0, average='macro') 30 | micro0 = f1_score(cur_labels, preds0, average='micro') 31 | 32 | clf1 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class=multi_class, C=c1) 33 | clf1.fit(cur_data, cur_labels) 34 | preds1 = clf1.predict(cur_data) 35 | macro1 = f1_score(cur_labels, preds1, average='macro') 36 | micro1 = f1_score(cur_labels, preds1, average='micro') 37 | 38 | if macro0 + micro0 > macro1 + micro1: 39 | best_c = c0 40 | c1 /= 10 41 | else: 42 | best_c = c1 43 | c0 *= 10 44 | cnt += 1 45 | return best_c 46 | 47 | 48 | def MultiLabelBinarySeachCV(data, labels, multi_class="crammer_singer"): 49 | best_c = 1.0 50 | c0 = np.power(10.0, -len(labels)) 51 | c1 = 1 / c0 52 | cnt = 0 53 | max_cnt = 2 * len(labels) - 1 54 | while cnt < max_cnt and np.abs(c0 - c1) > 1e-10: 55 | np.random.seed(cnt) 56 | index = np.random.choice(len(data), size=(int(len(data) * (cnt+1) / max_cnt), ), replace=False) 57 | cur_data, cur_labels = data[index], labels[:, index] 58 | weights0 = np.zeros((len(cur_data)), dtype=np.float32) 59 | scores0 = np.zeros((len(cur_data)), dtype=np.float32) 60 | for ntype, nlabels in enumerate(cur_labels): 61 | clf0 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class="crammer_singer", C=c0) 62 | clf0.fit(cur_data, nlabels) 63 | preds0 = clf0.predict(cur_data) 64 | scores0[ntype] = f1_score(nlabels, preds0, average='binary') 65 | weights0[ntype] = sum(nlabels) 66 | macro0 = scores0.sum() / scores0.shape[0] 67 | micro0 = (scores0 * weights0 / weights0.sum()).sum() 68 | 69 | weights1 = np.zeros((len(cur_data)), dtype=np.float32) 70 | scores1 = np.zeros((len(cur_data)), dtype=np.float32) 71 | for ntype, nlabels in enumerate(cur_labels): 72 | clf1 = LinearSVC(random_state=seed, max_iter=int(max_iter * (cnt+1) / max_cnt), multi_class="crammer_singer", C=c1) 73 | clf1.fit(cur_data, nlabels) 74 | preds1 = clf1.predict(cur_data) 75 | scores1[ntype] = f1_score(nlabels, preds1, average='binary') 76 | weights1[ntype] = sum(nlabels) 77 | macro1 = scores1.sum() / scores1.shape[0] 78 | micro1 = (scores1 * weights1 / weights1.sum()).sum() 79 | 80 | if macro0 + micro0 > macro1 + micro1: 81 | best_c = c0 82 | c1 /= 10 83 | else: 84 | best_c = c1 85 | c0 *= 10 86 | cnt += 1 87 | return best_c 88 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/CompGCN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=2 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for 
graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/CompGCN/src/main.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import dgl 4 | import math 5 | import numpy as np 6 | import os 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from tqdm import tqdm 12 | 13 | import utils 14 | from model import * 15 | 16 | 17 | np.random.seed(12345) 18 | torch.manual_seed(12345) 19 | torch.cuda.manual_seed(12345) 20 | 21 | 22 | class CosineWarmupRestartScheduler(LambdaLR): 23 | def __init__( 24 | self, 25 | num_warmup_steps=600, 26 | num_schedule_steps=10000, 27 | num_cycles=2, 28 | min_percent=1e-3 29 | ): 30 | self.num_warmup_steps = num_warmup_steps 31 | self.num_schedule_steps = num_schedule_steps 32 | self.num_cycles = num_cycles 33 | self.min_percent = min_percent 34 | 35 | def set_optimizer(self, optimizer): 36 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 37 | 38 | def lr_lambda(self, current_step): 39 | if current_step < self.num_warmup_steps: 40 | return float(current_step) / float(max(1, self.num_warmup_steps)) 41 | progress = float(current_step - self.num_warmup_steps) / \ 42 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 43 | if progress >= 1.0: 44 | return self.min_percent 45 | return max(self.min_percent, 0.5 * (1.0 + math.cos(math.pi * ((float(self.num_cycles) * progress) % 1.0)))) 46 | 47 | 48 | def main(args): 49 | 50 | # load graph data 51 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start loading...", flush=True) 52 | if args.supervised == "True": 53 | train_pool, train_labels, nlabels, multi = utils.load_label(args.label) 54 | train_data, num_nodes, num_rels, train_indices, ntrain, node_attri = utils.load_supervised( 55 | args, args.link, args.node, train_pool 56 | ) 57 | elif args.supervised == "False": 58 | train_data, num_nodes, num_rels, node_attri = utils.load_unsupervised(args, args.link, args.node) 59 | nlabels = 0 60 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "finish loading...", flush=True) 61 | 62 | # check cuda 63 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 64 | if use_cuda: 65 | torch.cuda.set_device(args.gpu) 66 | print("check 1", flush=True) 67 | # create 
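    # Note: CosineWarmupRestartScheduler above stores its hyper-parameters first
    # and finishes construction in set_optimizer(), because LambdaLR.__init__
    # requires an optimizer instance that does not exist yet at that point.
    # This script uses CosineAnnealingLR further below instead; a sketch of how
    # the warmup/restart variant would be wired (step counts are assumptions) is:
    #
    #   scheduler = CosineWarmupRestartScheduler(
    #       num_warmup_steps=600,
    #       num_schedule_steps=args.n_epochs * len(dataloader),
    #       num_cycles=2,
    #   )
    #   scheduler.set_optimizer(optimizer)
    #   ...
    #   optimizer.step(); scheduler.step()   # once per batch, as in the loop below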
model 68 | model = TrainModel( 69 | node_attri, 70 | num_nodes, 71 | args.n_hidden, 72 | num_rels, 73 | nlabels, 74 | num_hidden_layers=args.n_layers, 75 | dropout=args.dropout, 76 | use_cuda=use_cuda, 77 | reg_param=args.regularization 78 | ) 79 | print("check 2", flush=True) 80 | if use_cuda: 81 | model.to("cuda:%d" % (args.gpu)) 82 | print("check 3", flush=True) 83 | """ 84 | # build adj list and calculate degrees for sampling 85 | degrees = utils.get_adj_and_degrees(num_nodes, train_data) 86 | """ 87 | # build graph 88 | graph = utils.build_graph_from_triplets(num_nodes, num_rels, train_data) 89 | graph.ndata[dgl.NID] = torch.arange(num_nodes, dtype=torch.long) 90 | graph.edata[dgl.EID] = torch.arange(len(train_data) * 2, dtype=torch.long) 91 | seed_nodes = list() 92 | if os.path.exists(args.node.replace("node.dat", "seed_node.dat")): 93 | with open(args.node.replace("node.dat", "seed_node.dat"), "r") as f: 94 | for line in f: 95 | seed_nodes.append(int(line)) 96 | seed_nodes = set(seed_nodes) 97 | if len(seed_nodes) > 0: 98 | dataloader = torch.utils.data.DataLoader( 99 | np.array([x for x in train_data if x[0] in seed_nodes or x[2] in seed_nodes]), 100 | batch_size=args.graph_batch_size, shuffle=True 101 | ) 102 | else: 103 | dataloader = torch.utils.data.DataLoader( 104 | train_data, 105 | batch_size=args.graph_batch_size, shuffle=True 106 | ) 107 | print("check 4", flush=True) 108 | # optimizer 109 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 110 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs * len(dataloader), eta_min=3e-6) 111 | optimizer.zero_grad() 112 | scheduler.step(0) 113 | 114 | # training loop 115 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start training...", flush=True) 116 | model.train() 117 | prev_loss = np.float32("inf") 118 | for epoch in range(args.n_epochs): 119 | losses = [] 120 | for batch in tqdm(dataloader): 121 | # perform edge neighborhood sampling to generate training graph and data 122 | if args.supervised == "True": 123 | subg, samples, matched_labels, matched_index = \ 124 | utils.generate_sampled_graph_and_labels_supervised( 125 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 126 | args.graph_split_size, 127 | train_indices, train_labels, multi, nlabels, ntrain, 128 | if_train=True, label_batch_size=args.label_batch_size 129 | ) 130 | if multi: 131 | matched_labels = torch.from_numpy(matched_labels).float() 132 | else: 133 | matched_labels = torch.from_numpy(matched_labels).long() 134 | if use_cuda: 135 | matched_labels = matched_labels.to("cuda:%d" % (args.gpu)) 136 | elif args.supervised == "False": 137 | subg, samples, labels = \ 138 | utils.generate_sampled_graph_and_labels_unsupervised( 139 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 140 | args.graph_split_size, args.negative_sample 141 | ) 142 | samples = torch.from_numpy(samples) 143 | labels = torch.from_numpy(labels) 144 | if use_cuda: 145 | samples = samples.to("cuda:%d" % (args.gpu)) 146 | labels = labels.to("cuda:%d" % (args.gpu)) 147 | else: 148 | raise ValueError 149 | 150 | # calculate norms and eigenvalues of the subgraph 151 | edge_norm = utils.compute_edgenorm(subg) 152 | if use_cuda: 153 | subg = subg.to("cuda:%d" % (args.gpu)) 154 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 155 | edge_type = subg.edata["type"] 156 | 157 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 158 | 159 | if args.supervised == 
"True": 160 | loss = model.get_supervised_loss(subg, embed, edge_type, pred, matched_labels, matched_index, multi) 161 | elif args.supervised == "False": 162 | loss = model.get_unsupervised_loss(subg, embed, edge_type, samples, labels) 163 | loss.backward() 164 | losses.append(loss.item()) 165 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients 166 | optimizer.step() 167 | optimizer.zero_grad() 168 | scheduler.step() 169 | loss = sum(losses) / len(losses) 170 | 171 | print( 172 | time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + 173 | "Epoch {:05d} | Loss {:.4f}".format(epoch, loss), 174 | flush=True 175 | ) 176 | if loss > prev_loss: 177 | break 178 | prev_loss = loss 179 | 180 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "training done", flush=True) 181 | 182 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start output...", flush=True) 183 | dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.graph_batch_size * 4, shuffle=False) 184 | model.eval() 185 | with torch.no_grad(): 186 | node_emb, node_sampled = model.model.node_emb.weight.detach().cpu().clone(), set() 187 | for batch in tqdm(dataloader): 188 | subg, samples, labels = \ 189 | utils.generate_sampled_graph_and_labels_unsupervised( 190 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 191 | args.graph_split_size, args.negative_sample 192 | ) 193 | 194 | # calculate norms and eigenvalues of the subgraph 195 | edge_norm = utils.compute_edgenorm(subg) 196 | nid = subg.ndata[dgl.NID] 197 | coef = (subg.ndata["in_deg"].float() + 1) / (graph.ndata["in_deg"][nid].float() + 1) 198 | coef = coef.view(-1, 1) 199 | if use_cuda: 200 | subg = subg.to("cuda:%d" % (args.gpu)) 201 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 202 | edge_type = subg.edata["type"] 203 | 204 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 205 | 206 | node_emb[nid] = node_emb[nid] * (1 - coef) + embed[0].detach().cpu() * coef 207 | # node_emb[nid].data.copy_(embed[0].detach().cpu()) 208 | node_sampled.update(nid.numpy()) 209 | 210 | print("{:5}% node embeddings are saved.".format(len(node_sampled) * 100 / num_nodes)) 211 | if len(seed_nodes) > 0: 212 | seed_nodes = np.array(sorted(seed_nodes)) 213 | utils.save(args, node_emb[seed_nodes].numpy(), index=seed_nodes) 214 | else: 215 | utils.save(args, node_emb.numpy()) 216 | 217 | return 218 | 219 | 220 | if __name__ == "__main__": 221 | parser = argparse.ArgumentParser(description="CompGCN") 222 | parser.add_argument( 223 | "--link", type=str, required=True, 224 | help="dataset to use" 225 | ) 226 | parser.add_argument( 227 | "--node", type=str, required=True, 228 | help="dataset to use" 229 | ) 230 | parser.add_argument( 231 | "--label", type=str, required=True, 232 | help="dataset to use" 233 | ) 234 | parser.add_argument( 235 | "--output", required=True, type=str, 236 | help="Output embedding file" 237 | ) 238 | parser.add_argument( 239 | "--dropout", type=float, default=0.2, 240 | help="dropout probability" 241 | ) 242 | parser.add_argument( 243 | "--n-hidden", type=int, default=50, 244 | help="number of hidden units" 245 | ) 246 | parser.add_argument( 247 | "--gpu", type=int, default=-1, 248 | help="gpu" 249 | ) 250 | parser.add_argument( 251 | "--lr", type=float, default=1e-2, 252 | help="learning rate" 253 | ) 254 | parser.add_argument( 255 | "--n-layers", type=int, default=2, 256 | help="number of propagation rounds" 257 
| ) 258 | parser.add_argument( 259 | "--n-epochs", type=int, default=2000, 260 | help="number of minimum training epochs" 261 | ) 262 | parser.add_argument( 263 | "--regularization", type=float, default=0.01, 264 | help="regularization weight" 265 | ) 266 | parser.add_argument( 267 | "--grad-norm", type=float, default=1.0, 268 | help="norm to clip gradient to" 269 | ) 270 | parser.add_argument( 271 | "--label-batch-size", type=int, default=512 272 | ) 273 | parser.add_argument( 274 | "--graph-batch-size", type=int, default=20000, 275 | help="number of edges to sample in each iteration" 276 | ) 277 | parser.add_argument( 278 | "--graph-split-size", type=float, default=0.5, 279 | help="portion of edges used as positive sample" 280 | ) 281 | parser.add_argument( 282 | "--negative-sample", type=int, default=5, 283 | help="number of negative samples per positive sample" 284 | ) 285 | parser.add_argument( 286 | "--sampler", type=str, default="neighbor", 287 | help="type of subgraph sampler: neighbor or randomwalk" 288 | ) 289 | parser.add_argument( 290 | "--sample-depth", type=int, default=6 291 | ) 292 | parser.add_argument( 293 | "--sample-width", type=int, default=128 294 | ) 295 | parser.add_argument( 296 | "--attributed", type=str, default="False" 297 | ) 298 | parser.add_argument( 299 | "--supervised", type=str, default="False" 300 | ) 301 | 302 | args = parser.parse_args() 303 | print(args, flush=True) 304 | main(args) 305 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/DMPNN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=1 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/DMPNN/src/main.py: 
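# ------------------------------------------------------------------------------
# Note on the embedding write-out in the CompGCN main.py above (the DMPNN and
# R-GIN main.py scripts below share the same logic): embeddings computed on each
# sampled subgraph are blended into the stored table rather than overwriting it,
#
#     coef          = (in_deg_in_subgraph + 1) / (in_deg_in_full_graph + 1)
#     node_emb[nid] = (1 - coef) * node_emb[nid] + coef * fresh_embedding
#
# so a node whose incoming edges are mostly covered by the sampled subgraph
# trusts the fresh embedding, while a poorly covered node keeps most of its old
# value. Minimal numeric sketch (toy numbers, assumed):
import numpy as np
old = np.full(4, 1.0)                  # stored embedding of one node
new = np.zeros(4)                      # embedding from the sampled subgraph
coef = (1 + 1) / (9 + 1)               # 1 of 9 in-edges sampled -> coef = 0.2
print((1 - coef) * old + coef * new)   # -> [0.8 0.8 0.8 0.8]
# ------------------------------------------------------------------------------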
-------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import dgl 4 | import math 5 | import numpy as np 6 | import os 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from tqdm import tqdm 12 | 13 | import utils 14 | from model import * 15 | 16 | 17 | np.random.seed(12345) 18 | torch.manual_seed(12345) 19 | torch.cuda.manual_seed(12345) 20 | 21 | 22 | class CosineWarmupRestartScheduler(LambdaLR): 23 | def __init__( 24 | self, 25 | num_warmup_steps=600, 26 | num_schedule_steps=10000, 27 | num_cycles=2, 28 | min_percent=1e-3 29 | ): 30 | self.num_warmup_steps = num_warmup_steps 31 | self.num_schedule_steps = num_schedule_steps 32 | self.num_cycles = num_cycles 33 | self.min_percent = min_percent 34 | 35 | def set_optimizer(self, optimizer): 36 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 37 | 38 | def lr_lambda(self, current_step): 39 | if current_step < self.num_warmup_steps: 40 | return float(current_step) / float(max(1, self.num_warmup_steps)) 41 | progress = float(current_step - self.num_warmup_steps) / \ 42 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 43 | if progress >= 1.0: 44 | return self.min_percent 45 | return max(self.min_percent, 0.5 * (1.0 + math.cos(math.pi * ((float(self.num_cycles) * progress) % 1.0)))) 46 | 47 | 48 | def main(args): 49 | 50 | # load graph data 51 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start loading...", flush=True) 52 | if args.supervised == "True": 53 | train_pool, train_labels, nlabels, multi = utils.load_label(args.label) 54 | train_data, num_nodes, num_rels, train_indices, ntrain, node_attri = utils.load_supervised( 55 | args, args.link, args.node, train_pool 56 | ) 57 | elif args.supervised == "False": 58 | train_data, num_nodes, num_rels, node_attri = utils.load_unsupervised(args, args.link, args.node) 59 | nlabels = 0 60 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "finish loading...", flush=True) 61 | 62 | # check cuda 63 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 64 | if use_cuda: 65 | torch.cuda.set_device(args.gpu) 66 | print("check 1", flush=True) 67 | # create model 68 | model = TrainModel( 69 | node_attri, 70 | num_nodes, 71 | args.n_hidden, 72 | num_rels, 73 | nlabels, 74 | num_hidden_layers=args.n_layers, 75 | dropout=args.dropout, 76 | use_cuda=use_cuda, 77 | reg_param=args.regularization 78 | ) 79 | print("check 2", flush=True) 80 | if use_cuda: 81 | model.to("cuda:%d" % (args.gpu)) 82 | print("check 3", flush=True) 83 | """ 84 | # build adj list and calculate degrees for sampling 85 | degrees = utils.get_adj_and_degrees(num_nodes, train_data) 86 | """ 87 | # build graph 88 | graph = utils.build_graph_from_triplets(num_nodes, num_rels, train_data) 89 | graph.ndata[dgl.NID] = torch.arange(num_nodes, dtype=torch.long) 90 | graph.edata[dgl.EID] = torch.arange(len(train_data) * 2, dtype=torch.long) 91 | seed_nodes = list() 92 | if os.path.exists(args.node.replace("node.dat", "seed_node.dat")): 93 | with open(args.node.replace("node.dat", "seed_node.dat"), "r") as f: 94 | for line in f: 95 | seed_nodes.append(int(line)) 96 | seed_nodes = set(seed_nodes) 97 | if len(seed_nodes) > 0: 98 | dataloader = torch.utils.data.DataLoader( 99 | np.array([x for x in train_data if x[0] in seed_nodes or x[2] in seed_nodes]), 100 | batch_size=args.graph_batch_size, shuffle=True 101 | ) 102 | else: 
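    # Note on the seed-node branch above: if a file named seed_node.dat sits next
    # to node.dat (one integer node id per line, e.g. "0", "17", "42" -- values
    # here are illustrative), training triplets are restricted to those whose
    # head or tail is a seed node, and at the end of main() only the seed nodes'
    # embeddings are written out via utils.save(..., index=seed_nodes). Without
    # that file, all triplets are used and the full embedding table is saved.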
103 | dataloader = torch.utils.data.DataLoader( 104 | train_data, 105 | batch_size=args.graph_batch_size, shuffle=True 106 | ) 107 | args.n_epochs = math.ceil(args.n_epochs * len(dataloader) * args.graph_batch_size / num_nodes) 108 | print("check 4", flush=True) 109 | # optimizer 110 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 111 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs * len(dataloader), eta_min=3e-6) 112 | optimizer.zero_grad() 113 | scheduler.step(0) 114 | 115 | # training loop 116 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start training...", flush=True) 117 | model.train() 118 | prev_loss = np.float32("inf") 119 | for epoch in range(args.n_epochs): 120 | losses = [] 121 | for batch in tqdm(dataloader): 122 | # perform edge neighborhood sampling to generate training graph and data 123 | if args.supervised == "True": 124 | subg, samples, matched_labels, matched_index = \ 125 | utils.generate_sampled_graph_and_labels_supervised( 126 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 127 | args.graph_split_size, 128 | train_indices, train_labels, multi, nlabels, ntrain, 129 | if_train=True, label_batch_size=args.label_batch_size 130 | ) 131 | if multi: 132 | matched_labels = torch.from_numpy(matched_labels).float() 133 | else: 134 | matched_labels = torch.from_numpy(matched_labels).long() 135 | if use_cuda: 136 | matched_labels = matched_labels.to("cuda:%d" % (args.gpu)) 137 | elif args.supervised == "False": 138 | subg, samples, labels = \ 139 | utils.generate_sampled_graph_and_labels_unsupervised( 140 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 141 | args.graph_split_size, args.negative_sample 142 | ) 143 | samples = torch.from_numpy(samples) 144 | labels = torch.from_numpy(labels) 145 | if use_cuda: 146 | samples = samples.to("cuda:%d" % (args.gpu)) 147 | labels = labels.to("cuda:%d" % (args.gpu)) 148 | else: 149 | raise ValueError 150 | 151 | # calculate norms and eigenvalues of the subgraph 152 | edge_norm = utils.compute_edgenorm(subg) 153 | if use_cuda: 154 | subg = subg.to("cuda:%d" % (args.gpu)) 155 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 156 | edge_type = subg.edata["type"] 157 | 158 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 159 | 160 | if args.supervised == "True": 161 | loss = model.get_supervised_loss(subg, embed, edge_type, pred, matched_labels, matched_index, multi) 162 | elif args.supervised == "False": 163 | loss = model.get_unsupervised_loss(subg, embed, edge_type, samples, labels) 164 | loss.backward() 165 | losses.append(loss.item()) 166 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients 167 | optimizer.step() 168 | optimizer.zero_grad() 169 | scheduler.step() 170 | loss = sum(losses) / len(losses) 171 | 172 | print( 173 | time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + 174 | "Epoch {:05d} | Loss {:.4f}".format(epoch, loss), 175 | flush=True 176 | ) 177 | if loss > prev_loss: 178 | break 179 | prev_loss = loss 180 | 181 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "training done", flush=True) 182 | 183 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start output...", flush=True) 184 | dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.graph_batch_size * 4, shuffle=False) 185 | model.eval() 186 | with torch.no_grad(): 187 | node_emb, node_sampled = 
model.model.node_emb.weight.detach().cpu().clone(), set() 188 | for batch in tqdm(dataloader): 189 | subg, samples, labels = \ 190 | utils.generate_sampled_graph_and_labels_unsupervised( 191 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 192 | args.graph_split_size, args.negative_sample 193 | ) 194 | 195 | # calculate norms and eigenvalues of the subgraph 196 | edge_norm = utils.compute_edgenorm(subg) 197 | nid = subg.ndata[dgl.NID] 198 | coef = (subg.ndata["in_deg"].float() + 1) / (graph.ndata["in_deg"][nid].float() + 1) 199 | coef = coef.view(-1, 1) 200 | if use_cuda: 201 | subg = subg.to("cuda:%d" % (args.gpu)) 202 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 203 | edge_type = subg.edata["type"] 204 | 205 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 206 | 207 | node_emb[nid] = node_emb[nid] * (1 - coef) + embed[0].detach().cpu() * coef 208 | # node_emb[nid].data.copy_(embed[0].detach().cpu()) 209 | node_sampled.update(nid.numpy()) 210 | 211 | print("{:5}% node embeddings are saved.".format(len(node_sampled) * 100 / num_nodes)) 212 | if len(seed_nodes) > 0: 213 | seed_nodes = np.array(sorted(seed_nodes)) 214 | utils.save(args, node_emb[seed_nodes].numpy(), index=seed_nodes) 215 | else: 216 | utils.save(args, node_emb.numpy()) 217 | 218 | return 219 | 220 | 221 | if __name__ == "__main__": 222 | parser = argparse.ArgumentParser(description="DMPNN") 223 | parser.add_argument( 224 | "--link", type=str, required=True, 225 | help="dataset to use" 226 | ) 227 | parser.add_argument( 228 | "--node", type=str, required=True, 229 | help="dataset to use" 230 | ) 231 | parser.add_argument( 232 | "--label", type=str, required=True, 233 | help="dataset to use" 234 | ) 235 | parser.add_argument( 236 | "--output", required=True, type=str, 237 | help="Output embedding file" 238 | ) 239 | parser.add_argument( 240 | "--dropout", type=float, default=0.2, 241 | help="dropout probability" 242 | ) 243 | parser.add_argument( 244 | "--n-hidden", type=int, default=50, 245 | help="number of hidden units" 246 | ) 247 | parser.add_argument( 248 | "--gpu", type=int, default=-1, 249 | help="gpu" 250 | ) 251 | parser.add_argument( 252 | "--lr", type=float, default=1e-2, 253 | help="learning rate" 254 | ) 255 | parser.add_argument( 256 | "--n-layers", type=int, default=2, 257 | help="number of propagation rounds" 258 | ) 259 | parser.add_argument( 260 | "--n-epochs", type=int, default=2000, 261 | help="number of minimum training epochs" 262 | ) 263 | parser.add_argument( 264 | "--regularization", type=float, default=0.01, 265 | help="regularization weight" 266 | ) 267 | parser.add_argument( 268 | "--grad-norm", type=float, default=1.0, 269 | help="norm to clip gradient to" 270 | ) 271 | parser.add_argument( 272 | "--label-batch-size", type=int, default=512 273 | ) 274 | parser.add_argument( 275 | "--graph-batch-size", type=int, default=20000, 276 | help="number of edges to sample in each iteration" 277 | ) 278 | parser.add_argument( 279 | "--graph-split-size", type=float, default=0.5, 280 | help="portion of edges used as positive sample" 281 | ) 282 | parser.add_argument( 283 | "--negative-sample", type=int, default=5, 284 | help="number of negative samples per positive sample" 285 | ) 286 | parser.add_argument( 287 | "--sampler", type=str, default="neighbor", 288 | help="type of subgraph sampler: neighbor or randomwalk" 289 | ) 290 | parser.add_argument( 291 | "--sample-depth", type=int, default=6 292 | ) 293 | parser.add_argument( 294 | 
"--sample-width", type=int, default=128 295 | ) 296 | parser.add_argument( 297 | "--attributed", type=str, default="False" 298 | ) 299 | parser.add_argument( 300 | "--supervised", type=str, default="False" 301 | ) 302 | 303 | args = parser.parse_args() 304 | print(args, flush=True) 305 | main(args) 306 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GCN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=3 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GCN/src/model.py: -------------------------------------------------------------------------------- 1 | import dgl 2 | import dgl.function as fn 3 | import math 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from dgl.nn.pytorch import RelGraphConv 9 | from utils import * 10 | 11 | 12 | class MultiHotEmbeddingLayer(nn.Module): 13 | def __init__(self, num_emb, emb_dim, base=2): 14 | super(MultiHotEmbeddingLayer, self).__init__() 15 | self.num_emb = num_emb 16 | enc_len = get_enc_len(num_emb - 1, base) 17 | self.encoding = nn.Embedding(num_emb, enc_len * base) 18 | self.embedding = nn.Parameter(torch.Tensor(enc_len * base, emb_dim)) 19 | 20 | with torch.no_grad(): 21 | self.encoding.weight.data.copy_( 22 | torch.from_numpy(int2multihot(np.arange(0, num_emb), enc_len, base)).float() 23 | ) 24 | 25 | scale = 1 / (emb_dim * enc_len)**0.5 26 | nn.init.uniform_(self.embedding, -scale, scale) 27 | self.encoding.weight.requires_grad = False 28 | 29 | def forward(self, g, x): 30 | enc = self.encoding(x.squeeze()) 31 | emb = torch.matmul(enc.view(-1, self.embedding.size(0)), self.embedding) 32 | return 
emb 33 | 34 | @property 35 | def weight(self): 36 | return torch.matmul(self.encoding.weight, self.embedding) 37 | 38 | 39 | class EmbeddingLayer(nn.Module): 40 | def __init__(self, num_emb, emb_dim): 41 | super(EmbeddingLayer, self).__init__() 42 | self.embedding = nn.Embedding(num_emb, emb_dim) 43 | scale = 1 / (emb_dim)**0.5 44 | nn.init.uniform_(self.embedding.weight, -scale, scale) 45 | 46 | def forward(self, g, x): 47 | return self.embedding(x.squeeze()) 48 | 49 | @property 50 | def weight(self): 51 | return self.embedding.weight 52 | 53 | 54 | class EmbeddingLayerAttri(nn.Module): 55 | def __init__(self, attri): 56 | super(EmbeddingLayerAttri, self).__init__() 57 | self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(attri)) 58 | 59 | def forward(self, g, x): 60 | return self.embedding(x.squeeze()) 61 | 62 | @property 63 | def weight(self): 64 | return self.embedding.weight 65 | 66 | 67 | class BaseModel(nn.Module): 68 | def __init__( 69 | self, 70 | node_attri, 71 | rel_attri, 72 | num_nodes, 73 | h_dim, 74 | out_dim, 75 | num_rels, 76 | num_hidden_layers=1, 77 | dropout=0, 78 | use_cuda=False 79 | ): 80 | super(BaseModel, self).__init__() 81 | self.num_nodes = num_nodes 82 | self.h_dim = h_dim 83 | self.out_dim = out_dim 84 | self.num_rels = num_rels 85 | self.num_hidden_layers = num_hidden_layers 86 | self.dropout = dropout 87 | self.use_cuda = use_cuda 88 | 89 | # create conjgcn layers 90 | self.build_model(node_attri, rel_attri) 91 | 92 | def build_model(self, node_attri, rel_attri): 93 | self.node_emb, self.rel_emb = self.build_input_layer(node_attri, rel_attri) 94 | self.layers = nn.ModuleList() 95 | # h2h 96 | for idx in range(self.num_hidden_layers): 97 | h2h = self.build_hidden_layer(idx) 98 | self.layers.append(h2h) 99 | # h2o 100 | h2o = self.build_output_layer() 101 | if h2o is not None: 102 | self.layers.append(h2o) 103 | 104 | def build_input_layer(self, node_attri, rel_attri): 105 | return None, None 106 | 107 | def build_hidden_layer(self, idx): 108 | raise NotImplementedError 109 | 110 | def build_output_layer(self): 111 | return None 112 | 113 | def forward(self, g, h, r, norm): 114 | raise NotImplementedError 115 | 116 | 117 | class RGCN(BaseModel): 118 | def build_input_layer(self, node_attri, rel_attri): 119 | if node_attri is not None: 120 | return EmbeddingLayerAttri(node_attri), None 121 | return EmbeddingLayer(self.num_nodes, self.h_dim), None 122 | 123 | def build_hidden_layer(self, idx): 124 | if idx == 0: 125 | in_dim = self.h_dim 126 | else: 127 | in_dim = self.out_dim 128 | if idx < self.num_hidden_layers - 1: 129 | act = nn.Tanh() 130 | else: 131 | act = None 132 | return RelGraphConv( 133 | in_dim, 134 | self.out_dim, 135 | self.num_rels, 136 | regularizer="basis", 137 | num_bases=-1, 138 | bias=True, 139 | activation=act, 140 | self_loop=True, 141 | dropout=self.dropout 142 | ) 143 | 144 | def forward(self, g, h, r, norm): 145 | h = self.node_emb(g, h) 146 | for layer in self.layers: 147 | h = layer(g, h, r, norm) 148 | return h 149 | 150 | 151 | class TrainModel(nn.Module): 152 | def __init__( 153 | self, 154 | node_attri, 155 | num_nodes, 156 | o_dim, 157 | num_rels, 158 | nlabel, 159 | num_hidden_layers=1, 160 | dropout=0, 161 | use_cuda=False, 162 | reg_param=0 163 | ): 164 | super(TrainModel, self).__init__() 165 | 166 | i_dim = o_dim if node_attri is None else node_attri.shape[1] 167 | self.model = RGCN( 168 | node_attri, None, num_nodes, i_dim, o_dim, num_rels * 2, num_hidden_layers, dropout, use_cuda 169 | ) 170 | self.reg_param = 
reg_param 171 | 172 | if nlabel == 0: 173 | self.supervised = False 174 | self.w_relation = nn.Parameter(torch.Tensor(num_rels, o_dim)) 175 | nn.init.xavier_uniform_(self.w_relation, gain=nn.init.calculate_gain('relu')) 176 | else: 177 | self.supervised = True 178 | self.node_fc = nn.Linear(o_dim, nlabel) 179 | nn.init.xavier_normal_(self.node_fc.weight, gain=nn.init.calculate_gain("sigmoid")) 180 | nn.init.zeros_(self.node_fc.bias) 181 | 182 | # self.edge_fc = nn.Linear(o_dim, num_rels * 2) 183 | self.edge_fc = nn.Linear(o_dim, o_dim) 184 | nn.init.xavier_normal_(self.edge_fc.weight, gain=nn.init.calculate_gain('sigmoid')) 185 | nn.init.zeros_(self.edge_fc.bias) 186 | 187 | def calc_score(self, embedding, triplets): 188 | if isinstance(embedding, (tuple, list)): 189 | node_emb = embedding[0] 190 | else: 191 | node_emb = embedding 192 | s = node_emb[triplets[:, 0]] 193 | r = self.w_relation[triplets[:, 1]] 194 | o = node_emb[triplets[:, 2]] 195 | score = torch.sum(s * r * o, dim=1) 196 | return score 197 | 198 | def forward(self, g, h, edge_type, edge_norm): 199 | output = self.model.forward(g, h, edge_type, edge_norm) 200 | if self.supervised: 201 | if isinstance(output, (tuple, list)): 202 | pred = self.node_fc(output[0]) 203 | else: 204 | pred = self.node_fc(output) 205 | else: 206 | pred = None 207 | 208 | return output, pred 209 | 210 | def unsupervised_regularization_loss(self, embedding, edge_type=None): 211 | reg = torch.mean(self.w_relation.pow(2)) 212 | if isinstance(embedding, (tuple, list)): 213 | for emb in embedding: 214 | reg = reg + torch.mean(emb.pow(2)) 215 | elif isinstance(embedding, torch.Tensor): 216 | reg = reg + torch.mean(embedding.pow(2)) 217 | else: 218 | raise ValueError 219 | if edge_type is not None: 220 | if isinstance(embedding, (tuple, list)): 221 | for emb in embedding: 222 | if emb.size(0) == edge_type.size(0): 223 | mask = edge_type < self.w_relation.size(0) 224 | # reg = reg + F.cross_entropy(self.edge_fc(emb[mask]), edge_type[mask]) 225 | emb_diff = self.edge_fc(emb[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 226 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 227 | elif isinstance(embedding, torch.Tensor): 228 | if embedding.size(0) == edge_type.size(0): 229 | mask = edge_type < self.w_relation.size(0) 230 | # reg = reg + F.cross_entropy(self.edge_fc(embedding[mask]), edge_type[mask]) 231 | emb_diff = self.edge_fc(embedding[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 232 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 233 | 234 | return reg 235 | 236 | def get_unsupervised_loss(self, g, embedding, edge_type, triplets, labels): 237 | # triplets is a list of data samples (positive and negative) 238 | # each row in the triplets is a 3-tuple of (source, relation, destination) 239 | score = self.calc_score(embedding, triplets) 240 | predict_loss = F.binary_cross_entropy_with_logits(score, labels) 241 | reg_loss = self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 242 | return predict_loss + self.reg_param * reg_loss 243 | 244 | def supervised_regularization_loss(self, embedding, edge_type=None): 245 | return self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 246 | 247 | def get_supervised_loss(self, g, embedding, edge_type, pred, matched_labels, matched_index, multi): 248 | # triplets is a list of data samples (positive and negative) 249 | # each row in the triplets is a 3-tuple of (source, relation, destination) 250 | if multi: 251 | predict_loss = 
F.binary_cross_entropy(torch.sigmoid(pred[matched_index]), matched_labels) 252 | else: 253 | predict_loss = F.nll_loss(F.log_softmax(pred[matched_index]), matched_labels) 254 | reg_loss = self.supervised_regularization_loss(embedding, edge_type=edge_type) 255 | return predict_loss + self.reg_param * reg_loss 256 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GIN/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu=2 4 | attributed="False" 5 | supervised="False" 6 | negative_sample=5 7 | dropout=0.2 8 | n_hidden=50 9 | n_epochs=50 # the epoch here is different with the epoch in original HNE 10 | graph_batch_size=10000 11 | sample_depth=3 12 | sample_width=10 13 | label_batch_size=64 14 | grad_norm=1.0 15 | sampler=randomwalk 16 | 17 | for dataset in "PubMed" "Yelp" 18 | do 19 | folder="data/${dataset}/" 20 | node_file="${folder}node.dat" 21 | label_file="${folder}label.dat" 22 | link_file="${folder}link.dat" 23 | for lr in 1e-2 1e-3 24 | do 25 | for reg in 1e-2 1e-3 26 | do 27 | for n_layers in 1 2 28 | do 29 | for graph_split_size in 0.5 0.7 0.9 30 | do 31 | emb_file="${folder}emb_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 32 | record_file="${folder}record_noattr_unsup_${sampler}_lr${lr}_reg${reg}_nlayer${n_layers}_gsplit${graph_split_size}_hidden${n_hidden}.dat" 33 | OMP_NUM_THREADS=4 python src/main.py \ 34 | --link ${link_file} \ 35 | --node ${node_file} \ 36 | --label ${label_file} \ 37 | --output ${emb_file} \ 38 | --n-hidden ${n_hidden} \ 39 | --negative-sample ${negative_sample} \ 40 | --lr ${lr} \ 41 | --dropout ${dropout} \ 42 | --gpu ${gpu} \ 43 | --n-layers ${n_layers} \ 44 | --n-epochs ${n_epochs} \ 45 | --regularization ${reg} \ 46 | --grad-norm ${grad_norm} \ 47 | --graph-batch-size ${graph_batch_size} \ 48 | --graph-split-size ${graph_split_size} \ 49 | --label-batch-size ${label_batch_size} \ 50 | --sampler ${sampler} \ 51 | --sample-depth ${sample_depth} \ 52 | --sample-width ${sample_width} \ 53 | --attributed ${attributed} \ 54 | --supervised ${supervised} 55 | done 56 | done 57 | done 58 | done 59 | done 60 | 61 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GIN/src/main.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import dgl 4 | import math 5 | import numpy as np 6 | import os 7 | import time 8 | import torch 9 | import torch.nn.functional as F 10 | from torch.optim.lr_scheduler import LambdaLR 11 | from tqdm import tqdm 12 | 13 | import utils 14 | from model import * 15 | 16 | 17 | np.random.seed(12345) 18 | torch.manual_seed(12345) 19 | torch.cuda.manual_seed(12345) 20 | 21 | 22 | class CosineWarmupRestartScheduler(LambdaLR): 23 | def __init__( 24 | self, 25 | num_warmup_steps=600, 26 | num_schedule_steps=10000, 27 | num_cycles=2, 28 | min_percent=1e-3 29 | ): 30 | self.num_warmup_steps = num_warmup_steps 31 | self.num_schedule_steps = num_schedule_steps 32 | self.num_cycles = num_cycles 33 | self.min_percent = min_percent 34 | 35 | def set_optimizer(self, optimizer): 36 | super(CosineWarmupRestartScheduler, self).__init__(optimizer, self.lr_lambda) 37 | 38 | def lr_lambda(self, current_step): 39 | if current_step < self.num_warmup_steps: 40 | return float(current_step) / float(max(1, self.num_warmup_steps)) 41 | progress = 
float(current_step - self.num_warmup_steps) / \ 42 | float(max(1, self.num_schedule_steps - self.num_warmup_steps)) 43 | if progress >= 1.0: 44 | return self.min_percent 45 | return max(self.min_percent, 0.5 * (1.0 + math.cos(math.pi * ((float(self.num_cycles) * progress) % 1.0)))) 46 | 47 | 48 | def main(args): 49 | 50 | # load graph data 51 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start loading...", flush=True) 52 | if args.supervised == "True": 53 | train_pool, train_labels, nlabels, multi = utils.load_label(args.label) 54 | train_data, num_nodes, num_rels, train_indices, ntrain, node_attri = utils.load_supervised( 55 | args, args.link, args.node, train_pool 56 | ) 57 | elif args.supervised == "False": 58 | train_data, num_nodes, num_rels, node_attri = utils.load_unsupervised(args, args.link, args.node) 59 | nlabels = 0 60 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "finish loading...", flush=True) 61 | 62 | # check cuda 63 | use_cuda = args.gpu >= 0 and torch.cuda.is_available() 64 | if use_cuda: 65 | torch.cuda.set_device(args.gpu) 66 | print("check 1", flush=True) 67 | # create model 68 | model = TrainModel( 69 | node_attri, 70 | num_nodes, 71 | args.n_hidden, 72 | num_rels, 73 | nlabels, 74 | num_hidden_layers=args.n_layers, 75 | dropout=args.dropout, 76 | use_cuda=use_cuda, 77 | reg_param=args.regularization 78 | ) 79 | print("check 2", flush=True) 80 | if use_cuda: 81 | model.to("cuda:%d" % (args.gpu)) 82 | print("check 3", flush=True) 83 | """ 84 | # build adj list and calculate degrees for sampling 85 | degrees = utils.get_adj_and_degrees(num_nodes, train_data) 86 | """ 87 | # build graph 88 | graph = utils.build_graph_from_triplets(num_nodes, num_rels, train_data) 89 | graph.ndata[dgl.NID] = torch.arange(num_nodes, dtype=torch.long) 90 | graph.edata[dgl.EID] = torch.arange(len(train_data) * 2, dtype=torch.long) 91 | seed_nodes = list() 92 | if os.path.exists(args.node.replace("node.dat", "seed_node.dat")): 93 | with open(args.node.replace("node.dat", "seed_node.dat"), "r") as f: 94 | for line in f: 95 | seed_nodes.append(int(line)) 96 | seed_nodes = set(seed_nodes) 97 | if len(seed_nodes) > 0: 98 | dataloader = torch.utils.data.DataLoader( 99 | np.array([x for x in train_data if x[0] in seed_nodes or x[2] in seed_nodes]), 100 | batch_size=args.graph_batch_size, shuffle=True 101 | ) 102 | else: 103 | dataloader = torch.utils.data.DataLoader( 104 | train_data, 105 | batch_size=args.graph_batch_size, shuffle=True 106 | ) 107 | print("check 4", flush=True) 108 | # optimizer 109 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 110 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs * len(dataloader), eta_min=3e-6) 111 | optimizer.zero_grad() 112 | scheduler.step(0) 113 | 114 | # training loop 115 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start training...", flush=True) 116 | model.train() 117 | prev_loss = np.float32("inf") 118 | for epoch in range(args.n_epochs): 119 | losses = [] 120 | for batch in tqdm(dataloader): 121 | # perform edge neighborhood sampling to generate training graph and data 122 | if args.supervised == "True": 123 | subg, samples, matched_labels, matched_index = \ 124 | utils.generate_sampled_graph_and_labels_supervised( 125 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 126 | args.graph_split_size, 127 | train_indices, train_labels, multi, nlabels, ntrain, 128 | if_train=True, 
label_batch_size=args.label_batch_size 129 | ) 130 | if multi: 131 | matched_labels = torch.from_numpy(matched_labels).float() 132 | else: 133 | matched_labels = torch.from_numpy(matched_labels).long() 134 | if use_cuda: 135 | matched_labels = matched_labels.to("cuda:%d" % (args.gpu)) 136 | elif args.supervised == "False": 137 | subg, samples, labels = \ 138 | utils.generate_sampled_graph_and_labels_unsupervised( 139 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 140 | args.graph_split_size, args.negative_sample 141 | ) 142 | samples = torch.from_numpy(samples) 143 | labels = torch.from_numpy(labels) 144 | if use_cuda: 145 | samples = samples.to("cuda:%d" % (args.gpu)) 146 | labels = labels.to("cuda:%d" % (args.gpu)) 147 | else: 148 | raise ValueError 149 | 150 | # calculate norms and eigenvalues of the subgraph 151 | edge_norm = utils.compute_edgenorm(subg) 152 | if use_cuda: 153 | subg = subg.to("cuda:%d" % (args.gpu)) 154 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 155 | edge_type = subg.edata["type"] 156 | 157 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 158 | 159 | if args.supervised == "True": 160 | loss = model.get_supervised_loss(subg, embed, edge_type, pred, matched_labels, matched_index, multi) 161 | elif args.supervised == "False": 162 | loss = model.get_unsupervised_loss(subg, embed, edge_type, samples, labels) 163 | loss.backward() 164 | losses.append(loss.item()) 165 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm) # clip gradients 166 | optimizer.step() 167 | optimizer.zero_grad() 168 | scheduler.step() 169 | loss = sum(losses) / len(losses) 170 | 171 | print( 172 | time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + 173 | "Epoch {:05d} | Loss {:.4f}".format(epoch, loss), 174 | flush=True 175 | ) 176 | if loss > prev_loss: 177 | break 178 | prev_loss = loss 179 | 180 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "training done", flush=True) 181 | 182 | print(time.strftime("%a, %d %b %Y %H:%M:%S +0000: ", time.localtime()) + "start output...", flush=True) 183 | dataloader = torch.utils.data.DataLoader(train_data, batch_size=args.graph_batch_size * 4, shuffle=False) 184 | model.eval() 185 | with torch.no_grad(): 186 | node_emb, node_sampled = model.model.node_emb.weight.detach().cpu().clone(), set() 187 | for batch in tqdm(dataloader): 188 | subg, samples, labels = \ 189 | utils.generate_sampled_graph_and_labels_unsupervised( 190 | graph, batch, args.sampler, args.sample_depth, args.sample_width, 191 | args.graph_split_size, args.negative_sample 192 | ) 193 | 194 | # calculate norms and eigenvalues of the subgraph 195 | edge_norm = utils.compute_edgenorm(subg) 196 | nid = subg.ndata[dgl.NID] 197 | coef = (subg.ndata["in_deg"].float() + 1) / (graph.ndata["in_deg"][nid].float() + 1) 198 | coef = coef.view(-1, 1) 199 | if use_cuda: 200 | subg = subg.to("cuda:%d" % (args.gpu)) 201 | edge_norm = edge_norm.to("cuda:%d" % (args.gpu)) 202 | edge_type = subg.edata["type"] 203 | 204 | embed, pred = model(subg, h=subg.ndata[dgl.NID], edge_type=edge_type, edge_norm=edge_norm) 205 | 206 | node_emb[nid] = node_emb[nid] * (1 - coef) + embed[0].detach().cpu() * coef 207 | # node_emb[nid].data.copy_(embed[0].detach().cpu()) 208 | node_sampled.update(nid.numpy()) 209 | 210 | print("{:5}% node embeddings are saved.".format(len(node_sampled) * 100 / num_nodes)) 211 | if len(seed_nodes) > 0: 212 | seed_nodes = np.array(sorted(seed_nodes)) 213 | 
utils.save(args, node_emb[seed_nodes].numpy(), index=seed_nodes) 214 | else: 215 | utils.save(args, node_emb.numpy()) 216 | 217 | return 218 | 219 | 220 | if __name__ == "__main__": 221 | parser = argparse.ArgumentParser(description="R-GIN") 222 | parser.add_argument( 223 | "--link", type=str, required=True, 224 | help="dataset to use" 225 | ) 226 | parser.add_argument( 227 | "--node", type=str, required=True, 228 | help="dataset to use" 229 | ) 230 | parser.add_argument( 231 | "--label", type=str, required=True, 232 | help="dataset to use" 233 | ) 234 | parser.add_argument( 235 | "--output", required=True, type=str, 236 | help="Output embedding file" 237 | ) 238 | parser.add_argument( 239 | "--dropout", type=float, default=0.2, 240 | help="dropout probability" 241 | ) 242 | parser.add_argument( 243 | "--n-hidden", type=int, default=50, 244 | help="number of hidden units" 245 | ) 246 | parser.add_argument( 247 | "--gpu", type=int, default=-1, 248 | help="gpu" 249 | ) 250 | parser.add_argument( 251 | "--lr", type=float, default=1e-2, 252 | help="learning rate" 253 | ) 254 | parser.add_argument( 255 | "--n-layers", type=int, default=2, 256 | help="number of propagation rounds" 257 | ) 258 | parser.add_argument( 259 | "--n-epochs", type=int, default=2000, 260 | help="number of minimum training epochs" 261 | ) 262 | parser.add_argument( 263 | "--regularization", type=float, default=0.01, 264 | help="regularization weight" 265 | ) 266 | parser.add_argument( 267 | "--grad-norm", type=float, default=1.0, 268 | help="norm to clip gradient to" 269 | ) 270 | parser.add_argument( 271 | "--label-batch-size", type=int, default=512 272 | ) 273 | parser.add_argument( 274 | "--graph-batch-size", type=int, default=20000, 275 | help="number of edges to sample in each iteration" 276 | ) 277 | parser.add_argument( 278 | "--graph-split-size", type=float, default=0.5, 279 | help="portion of edges used as positive sample" 280 | ) 281 | parser.add_argument( 282 | "--negative-sample", type=int, default=5, 283 | help="number of negative samples per positive sample" 284 | ) 285 | parser.add_argument( 286 | "--sampler", type=str, default="neighbor", 287 | help="type of subgraph sampler: neighbor or randomwalk" 288 | ) 289 | parser.add_argument( 290 | "--sample-depth", type=int, default=6 291 | ) 292 | parser.add_argument( 293 | "--sample-width", type=int, default=128 294 | ) 295 | parser.add_argument( 296 | "--attributed", type=str, default="False" 297 | ) 298 | parser.add_argument( 299 | "--supervised", type=str, default="False" 300 | ) 301 | 302 | args = parser.parse_args() 303 | print(args, flush=True) 304 | main(args) 305 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/R-GIN/src/model.py: -------------------------------------------------------------------------------- 1 | import dgl 2 | import dgl.function as fn 3 | import math 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from dgl.nn.pytorch import RelGraphConv 9 | from utils import * 10 | 11 | 12 | class MultiHotEmbeddingLayer(nn.Module): 13 | def __init__(self, num_emb, emb_dim, base=2): 14 | super(MultiHotEmbeddingLayer, self).__init__() 15 | self.num_emb = num_emb 16 | enc_len = get_enc_len(num_emb - 1, base) 17 | self.encoding = nn.Embedding(num_emb, enc_len * base) 18 | self.embedding = nn.Parameter(torch.Tensor(enc_len * base, emb_dim)) 19 | 20 | with torch.no_grad(): 21 | self.encoding.weight.data.copy_( 22 | 
torch.from_numpy(int2multihot(np.arange(0, num_emb), enc_len, base)).float() 23 | ) 24 | 25 | scale = 1 / (emb_dim * enc_len)**0.5 26 | nn.init.uniform_(self.embedding, -scale, scale) 27 | self.encoding.weight.requires_grad = False 28 | 29 | def forward(self, g, x): 30 | enc = self.encoding(x.squeeze()) 31 | emb = torch.matmul(enc.view(-1, self.embedding.size(0)), self.embedding) 32 | return emb 33 | 34 | @property 35 | def weight(self): 36 | return torch.matmul(self.encoding.weight, self.embedding) 37 | 38 | 39 | class EmbeddingLayer(nn.Module): 40 | def __init__(self, num_emb, emb_dim): 41 | super(EmbeddingLayer, self).__init__() 42 | self.embedding = nn.Embedding(num_emb, emb_dim) 43 | scale = 1 / (emb_dim)**0.5 44 | nn.init.uniform_(self.embedding.weight, -scale, scale) 45 | 46 | def forward(self, g, x): 47 | return self.embedding(x.squeeze()) 48 | 49 | @property 50 | def weight(self): 51 | return self.embedding.weight 52 | 53 | 54 | class EmbeddingLayerAttri(nn.Module): 55 | def __init__(self, attri): 56 | super(EmbeddingLayerAttri, self).__init__() 57 | self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(attri)) 58 | 59 | def forward(self, g, x): 60 | return self.embedding(x.squeeze()) 61 | 62 | @property 63 | def weight(self): 64 | return self.embedding.weight 65 | 66 | 67 | class BaseModel(nn.Module): 68 | def __init__( 69 | self, 70 | node_attri, 71 | rel_attri, 72 | num_nodes, 73 | h_dim, 74 | out_dim, 75 | num_rels, 76 | num_hidden_layers=1, 77 | dropout=0, 78 | use_cuda=False 79 | ): 80 | super(BaseModel, self).__init__() 81 | self.num_nodes = num_nodes 82 | self.h_dim = h_dim 83 | self.out_dim = out_dim 84 | self.num_rels = num_rels 85 | self.num_hidden_layers = num_hidden_layers 86 | self.dropout = dropout 87 | self.use_cuda = use_cuda 88 | 89 | # create conjgcn layers 90 | self.build_model(node_attri, rel_attri) 91 | 92 | def build_model(self, node_attri, rel_attri): 93 | self.node_emb, self.rel_emb = self.build_input_layer(node_attri, rel_attri) 94 | self.layers = nn.ModuleList() 95 | # h2h 96 | for idx in range(self.num_hidden_layers): 97 | h2h = self.build_hidden_layer(idx) 98 | self.layers.append(h2h) 99 | # h2o 100 | h2o = self.build_output_layer() 101 | if h2o is not None: 102 | self.layers.append(h2o) 103 | 104 | def build_input_layer(self, node_attri, rel_attri): 105 | return None, None 106 | 107 | def build_hidden_layer(self, idx): 108 | raise NotImplementedError 109 | 110 | def build_output_layer(self): 111 | return None 112 | 113 | def forward(self, g, h, r, norm): 114 | raise NotImplementedError 115 | 116 | 117 | class RelGraphIso(RelGraphConv): 118 | def __init__( 119 | self, 120 | in_feat, 121 | out_feat, 122 | num_rels, 123 | regularizer="basis", 124 | num_bases=None, 125 | bias=True, 126 | activation=None, 127 | self_loop=True, 128 | low_mem=False, 129 | dropout=0.0, 130 | layer_norm=False 131 | ): 132 | super().__init__( 133 | in_feat, 134 | out_feat, 135 | num_rels, 136 | regularizer=regularizer, 137 | num_bases=num_bases, 138 | bias=bias, 139 | activation=activation, 140 | self_loop=self_loop, 141 | low_mem=low_mem, 142 | dropout=dropout, 143 | layer_norm=layer_norm 144 | ) 145 | self.out_layer = nn.Linear(out_feat, out_feat) 146 | 147 | nn.init.xavier_uniform_(self.out_layer.weight, gain=nn.init.calculate_gain('relu')) 148 | nn.init.zeros_(self.out_layer.bias) 149 | 150 | def forward(self, g, feat, etypes, norm=None): 151 | o = super().forward(g, feat, etypes, norm=None) 152 | o = self.out_layer(o) 153 | if self.activation: 154 | o = 
self.activation(o) 155 | o = self.dropout(o) 156 | 157 | return o 158 | 159 | 160 | class RGIN(BaseModel): 161 | def build_input_layer(self, node_attri, rel_attri): 162 | if node_attri is not None: 163 | return EmbeddingLayerAttri(node_attri), None 164 | return EmbeddingLayer(self.num_nodes, self.h_dim), None 165 | 166 | def build_hidden_layer(self, idx): 167 | if idx == 0: 168 | in_dim = self.h_dim 169 | else: 170 | in_dim = self.out_dim 171 | if idx < self.num_hidden_layers - 1: 172 | act = nn.Tanh() 173 | else: 174 | act = None 175 | return RelGraphIso( 176 | in_dim, 177 | self.out_dim, 178 | self.num_rels, 179 | "basis", 180 | self.num_rels, 181 | activation=act, 182 | self_loop=True, 183 | dropout=self.dropout 184 | ) 185 | 186 | def forward(self, g, h, r, norm): 187 | h = self.node_emb(g, h) 188 | has_norm = False 189 | if "norm" in g.edata: 190 | has_norm = True 191 | norm = g.edata.pop("norm") 192 | for layer in self.layers: 193 | h = layer(g, h, r, None) 194 | if has_norm: 195 | g.edata["norm"] = norm 196 | return h 197 | 198 | 199 | class TrainModel(nn.Module): 200 | def __init__( 201 | self, 202 | node_attri, 203 | num_nodes, 204 | o_dim, 205 | num_rels, 206 | nlabel, 207 | num_hidden_layers=1, 208 | dropout=0, 209 | use_cuda=False, 210 | reg_param=0 211 | ): 212 | super(TrainModel, self).__init__() 213 | 214 | i_dim = o_dim if node_attri is None else node_attri.shape[1] 215 | self.model = RGIN( 216 | node_attri, None, num_nodes, i_dim, o_dim, num_rels * 2, num_hidden_layers, dropout, use_cuda 217 | ) 218 | self.reg_param = reg_param 219 | 220 | if nlabel == 0: 221 | self.supervised = False 222 | self.w_relation = nn.Parameter(torch.Tensor(num_rels, o_dim)) 223 | nn.init.xavier_uniform_(self.w_relation, gain=nn.init.calculate_gain('relu')) 224 | else: 225 | self.supervised = True 226 | self.node_fc = nn.Linear(o_dim, nlabel) 227 | nn.init.xavier_normal_(self.node_fc.weight, gain=nn.init.calculate_gain("sigmoid")) 228 | nn.init.zeros_(self.node_fc.bias) 229 | 230 | # self.edge_fc = nn.Linear(o_dim, num_rels * 2) 231 | self.edge_fc = nn.Linear(o_dim, o_dim) 232 | nn.init.xavier_normal_(self.edge_fc.weight, gain=nn.init.calculate_gain('sigmoid')) 233 | nn.init.zeros_(self.edge_fc.bias) 234 | 235 | def calc_score(self, embedding, triplets): 236 | if isinstance(embedding, (tuple, list)): 237 | node_emb = embedding[0] 238 | else: 239 | node_emb = embedding 240 | s = node_emb[triplets[:, 0]] 241 | r = self.w_relation[triplets[:, 1]] 242 | o = node_emb[triplets[:, 2]] 243 | score = torch.sum(s * r * o, dim=1) 244 | return score 245 | 246 | def forward(self, g, h, edge_type, edge_norm): 247 | output = self.model.forward(g, h, edge_type, edge_norm) 248 | if self.supervised: 249 | if isinstance(output, (tuple, list)): 250 | pred = self.node_fc(output[0]) 251 | else: 252 | pred = self.node_fc(output) 253 | else: 254 | pred = None 255 | 256 | return output, pred 257 | 258 | def unsupervised_regularization_loss(self, embedding, edge_type=None): 259 | reg = torch.mean(self.w_relation.pow(2)) 260 | if isinstance(embedding, (tuple, list)): 261 | for emb in embedding: 262 | reg = reg + torch.mean(emb.pow(2)) 263 | elif isinstance(embedding, torch.Tensor): 264 | reg = reg + torch.mean(embedding.pow(2)) 265 | else: 266 | raise ValueError 267 | if edge_type is not None: 268 | if isinstance(embedding, (tuple, list)): 269 | for emb in embedding: 270 | if emb.size(0) == edge_type.size(0): 271 | mask = edge_type < self.w_relation.size(0) 272 | # reg = reg + F.cross_entropy(self.edge_fc(emb[mask]), 
edge_type[mask]) 273 | emb_diff = self.edge_fc(emb[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 274 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 275 | elif isinstance(embedding, torch.Tensor): 276 | if embedding.size(0) == edge_type.size(0): 277 | mask = edge_type < self.w_relation.size(0) 278 | # reg = reg + F.cross_entropy(self.edge_fc(embedding[mask]), edge_type[mask]) 279 | emb_diff = self.edge_fc(embedding[mask]) - torch.index_select(self.w_relation, 0, edge_type[mask]) 280 | reg = reg + torch.mean(torch.pow(emb_diff, 2)) 281 | 282 | return reg 283 | 284 | def get_unsupervised_loss(self, g, embedding, edge_type, triplets, labels): 285 | # triplets is a list of data samples (positive and negative) 286 | # each row in the triplets is a 3-tuple of (source, relation, destination) 287 | score = self.calc_score(embedding, triplets) 288 | predict_loss = F.binary_cross_entropy_with_logits(score, labels) 289 | reg_loss = self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 290 | return predict_loss + self.reg_param * reg_loss 291 | 292 | def supervised_regularization_loss(self, embedding, edge_type=None): 293 | return self.unsupervised_regularization_loss(embedding, edge_type=edge_type) 294 | 295 | def get_supervised_loss(self, g, embedding, edge_type, pred, matched_labels, matched_index, multi): 296 | # triplets is a list of data samples (positive and negative) 297 | # each row in the triplets is a 3-tuple of (source, relation, destination) 298 | if multi: 299 | predict_loss = F.binary_cross_entropy(torch.sigmoid(pred[matched_index]), matched_labels) 300 | else: 301 | predict_loss = F.nll_loss(F.log_softmax(pred[matched_index]), matched_labels) 302 | reg_loss = self.supervised_regularization_loss(embedding, edge_type=edge_type) 303 | return predict_loss + self.reg_param * reg_loss 304 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Model/README.md: -------------------------------------------------------------------------------- 1 | ## Models: Message-Passing 2 | 3 | **DMPNN: Graph Convolutional Networks with Dual Message Passing for Subgraph Isomorphism Counting and Matching** 4 | ``` 5 | @inproceedings{liu2022graph, 6 | title={Graph convolutional networks with dual message passing for subgraph isomorphism counting and matching}, 7 | author={Liu, Xin and Song, Yangqiu}, 8 | booktitle={AAAI}, 9 | year={2022} 10 | } 11 | ``` 12 | 13 | *Source: https://github.com/HKUST-KnowComp/DualMessagePassing/blob/master/src/rgin.py* 14 | 15 | **R-GIN: Neural subgraph isomorphism counting** 16 | ``` 17 | @inproceedings{liu2020neural, 18 | title={Neural subgraph isomorphism counting}, 19 | author={Liu, Xin and Pan, Haojie and He, Mutian and Song, Yangqiu and Jiang, Xin and Shang, Lifeng}, 20 | booktitle={SIGKDD}, 21 | pages={1959--1969}, 22 | year={2020} 23 | } 24 | ``` 25 | 26 | *Source: https://github.com/HKUST-KnowComp/NeuralSubgraphCounting/blob/master/src/rgin.py* 27 | 28 | **CompGCN: Neural subgraph isomorphism counting** 29 | ``` 30 | @inproceedings{vashishth2019composition, 31 | title={Composition-based multi-relational graph convolutional networks}, 32 | author={Vashishth, Shikhar and Sanyal, Soumya and Nitin, Vikram and Talukdar, Partha}, 33 | booktitle={ICLR}, 34 | year={2020} 35 | } 36 | ``` 37 | 38 | *Source: https://github.com/dmlc/dgl/blob/master/examples/pytorch/compGCN/models.py* 39 | 40 | **R-GCN: Modeling Relational Data with Graph Convolutional Networks** 41 | ``` 42 | 
@inproceedings{schlichtkrull2018modeling, 43 | title={Modeling relational data with graph convolutional networks}, 44 | author={Schlichtkrull, Michael and Kipf, Thomas N and Bloem, Peter and Van Den Berg, Rianne and Titov, Ivan and Welling, Max}, 45 | booktitle={ESWC}, 46 | pages={593--607}, 47 | year={2018}, 48 | organization={Springer} 49 | } 50 | ``` 51 | 52 | *Source: https://github.com/dmlc/dgl/blob/master/examples/pytorch/rgcn/model.py* 53 | 54 | ### Deployment 55 | 56 | This implementation relies on 2 external packages: 57 | - [PyTorch] 58 | - [DGL] 59 | 60 | ### Input 61 | 62 | *Stage 2: Transform* prepares 3 input files stored in ```data/{dataset}```: 63 | - ```node.dat```: This file is only needed for attributed training; each line is formatted as ```{node_id}\t{node_attributes}``` where entries in ```{node_attributes}``` are separated by ```,```. 64 | - ```link.dat```: The first line specifies ```{number_of_nodes} {number_of_link_types}```. Each following line is formatted as ```{head_node_id} {link_type} {tail_node_id}```. 65 | - ```label.dat```: This file is only needed for semi-supervised training. Each line is formatted as ```{node_id}\t{node_label}```. 66 | 67 | ### Run 68 | 69 | Users need to specify the target dataset and the set of training parameters in ```run.sh```.
70 | Run ```bash run.sh``` to start training. 71 | 72 | ### Output 73 | 74 | This implementation generates 1 output file stored in ```data/{dataset}```: 75 | - ```emb.dat```: The first line specifies the parameters used in training. Each following line describes the id and the embeddings of a node. The id and the embeddings are separated by ```\t```. Entries in the embeddings are separated by ``` ```. -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/README.md: -------------------------------------------------------------------------------- 1 | # Unsupervised Node Classification 2 | 3 | This part is modified from [HNE](https://github.com/yangji9181/HNE). 4 | 5 | ## Reproduction 6 | 7 | ### Stage 1: Data 8 | 9 | We conduct experiments on 2 HIN benchmark datasets: ```PubMed``` and ```Yelp```. 10 | Please refer to the ```Data``` folder for more details. 11 | 12 | ### Stage 2: Transform 13 | 14 | This stage transforms a dataset from its original format to the training input format. 15 | 16 | Users need to specify the target dataset, the target model, and the training settings. 17 | 18 | Please refer to the ```Transform``` folder for more details. 19 | 20 | ### Stage 3: Model 21 | 22 | We add ```DMPNN``` and 2 more heterogeneous Message-Passing baseline implementations (```CompGCN``` and ```R-GIN```). 23 | 24 | Please refer to the ```Model``` folder for more details. 25 | 26 | ### Stage 4: Evaluate 27 | 28 | This stage evaluates the output embeddings based on specific tasks. 29 | 30 | Users need to specify the target dataset, the target model, and the evaluation tasks. 31 | 32 | Please refer to the ```Evaluate``` folder for more details. 33 | -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Transform/README.md: -------------------------------------------------------------------------------- 1 | ## Transform 2 | 3 | This stage transforms a dataset from its original format to the training input format. 4 | 5 | Users need to specify the following parameters in ```transform.sh```: 6 | - **dataset**: choose from ```PubMed``` and ```Yelp```; 7 | - **model**: choose from ```DMPNN```, ```CompGCN```, and ```R-GIN``` (more baselines can be found [here](https://github.com/yangji9181/HNE/tree/master/Model)); 8 | - **attributed**: choose ```False``` for unattributed training; 9 | - **supervised**: choose ```False``` for unsupervised training. 10 | 11 | Run ```bash transform.sh``` to complete *Stage 2: Transform*. 12 | 13 | We also generate a seed-node file (```seed_node.dat```, stored in the same folder as ```node.dat```) that lists the ids of all labeled nodes and of the nodes appearing in the links to be predicted, as sketched below.
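For illustration only, a minimal sketch of the generated files (all node ids and link types below are made up): a toy ```link.dat``` for a graph with 6 nodes and 2 link types starts with the ```{number_of_nodes} {number_of_link_types}``` header and then lists one ```{head_node_id} {link_type} {tail_node_id}``` triple per line:

```
6 2
0 0 1
1 1 3
2 0 5
```

The corresponding ```seed_node.dat``` is read by the training scripts when present and simply lists one node id per line:

```
1
3
5
```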
-------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Transform/transform.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transform_model import * 3 | 4 | 5 | def parse_args(): 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-dataset', required=True, type=str, help='Targeting dataset.', 9 | choices=['DBLP','Freebase','PubMed','Yelp']) 10 | parser.add_argument('-model', required=True, type=str, help='Targeting model.', 11 | choices=['metapath2vec-ESim','PTE','HIN2Vec','AspEm','HEER','R-GCN','HAN','MAGNN','HGT','TransE','DistMult','ComplEx','ConvE','R-GIN','CompGCN','DMPNN']) 12 | parser.add_argument('-attributed', required=True, type=str, help='Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support attributed training.', 13 | choices=['True','False']) 14 | parser.add_argument('-supervised', required=True, type=str, help='Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support semi-supervised training.', 15 | choices=['True','False']) 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def check(args): 21 | 22 | if args.attributed=='True': 23 | if args.model not in ['DMPNN', 'CompGCN', 'R-GIN', 'R-GCN','HAN', 'MAGNN', 'HGT']: 24 | print(f'{args.model} does not support attributed training!') 25 | print('Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support attributed training!') 26 | return False 27 | if args.dataset not in ['DBLP', 'PubMed']: 28 | print(f'{args.dataset} does not support attributed training!') 29 | print('Only DBLP and PubMed support attributed training!') 30 | return False 31 | 32 | if args.supervised=='True': 33 | if args.model not in ['DMPNN', 'CompGCN', 'R-GIN', 'R-GCN','HAN', 'MAGNN', 'HGT']: 34 | print(f'{args.model} does not support semi-supervised training!') 35 | print('Only DMPNN, CompGCN, R-GIN, R-GCN, HAN, MAGNN, and HGT support semi-supervised training!') 36 | return False 37 | 38 | return True 39 | 40 | 41 | def main(): 42 | 43 | args = parse_args() 44 | 45 | if not check(args): 46 | return 47 | 48 | print('Transforming {} to {} input format for {}, {} training!' 
49 | .format(args.dataset, args.model, 50 | 'attributed' if args.attributed=='True' else 'unattributed', 51 | 'semi-supervised' if args.supervised=='True' else 'unsupervised')) 52 | 53 | if args.model=='metapath2vec-ESim': metapath2vec_esim_convert(args.dataset) 54 | elif args.model=='PTE': pte_convert(args.dataset) 55 | elif args.model=='HIN2Vec': hin2vec_convert(args.dataset) 56 | elif args.model=='AspEm': aspem_convert(args.dataset) 57 | elif args.model=='HEER': heer_convert(args.dataset) 58 | elif args.model=='R-GCN': rgcn_convert(args.dataset, args.attributed, args.supervised) 59 | elif args.model=='HAN': han_convert(args.dataset, args.attributed, args.supervised) 60 | elif args.model=='MAGNN': magnn_convert(args.dataset, args.attributed, args.supervised) 61 | elif args.model=='HGT': hgt_convert(args.dataset, args.attributed, args.supervised) 62 | elif args.model=='TransE': transe_convert(args.dataset) 63 | elif args.model=='DistMult': distmult_convert(args.dataset) 64 | elif args.model=='ComplEx': complex_convert(args.dataset) 65 | elif args.model=='ConvE': conve_convert(args.dataset) 66 | elif args.model=='R-GIN': rgin_convert(args.dataset, args.attributed, args.supervised) 67 | elif args.model=='CompGCN': compgcn_convert(args.dataset, args.attributed, args.supervised) 68 | elif args.model=='DMPNN': dmpnn_convert(args.dataset, args.attributed, args.supervised) 69 | 70 | print('Data transformation finished!') 71 | 72 | return 73 | 74 | 75 | if __name__=='__main__': 76 | main() -------------------------------------------------------------------------------- /UnsupervisedNodeClassification/Transform/transform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: Only 'DMPNN', 'CompGCN', 'R-GIN', 'R-GCN', 'HAN', 'MAGNN', and 'HGT' support attributed='True' or supervised='True' 4 | # Note: Only 'DBLP' and 'PubMed' contain node attributes. 5 | 6 | dataset='PubMed' # choose from 'DBLP', 'Yelp', 'Freebase', and 'PubMed' 7 | model='R-GCN' # choose from 'metapath2vec-ESim', 'PTE', 'HIN2Vec', 'AspEm', 'HEER', 'R-GCN', 'HAN', 'MAGNN', 'HGT', 'TransE', 'DistMult', 'ComplEx', 'ConvE', 'R-GIN', 'CompGCN', and 'DMPNN' 8 | attributed='False' # choose 'True' or 'False' 9 | supervised='False' # choose 'True' or 'False' 10 | 11 | for dataset in 'PubMed' 'Yelp' 12 | do 13 | for model in 'R-GCN' 'DMPNN' 'CompGCN' 'R-GIN' 14 | do 15 | mkdir -p ../Model/${model}/data 16 | mkdir -p ../Model/${model}/data/${dataset} 17 | python transform.py -dataset ${dataset} -model ${model} -attributed ${attributed} -supervised ${supervised} 18 | done 19 | done 20 | 21 | 22 | --------------------------------------------------------------------------------