├── xydata └── raw │ └── .gitignore ├── model_files └── .gitignore ├── submit └── .gitignore ├── utils ├── __init__.py ├── utils.py ├── evaluator.py ├── xygraph.py └── xygraph_no_valid.py ├── models ├── __init__.py ├── sage.py ├── gcn.py ├── mlp.py ├── gat.py ├── sage_neighsampler.py └── gat_neighsampler.py ├── README.md ├── requirements.txt ├── .gitignore ├── train_mini_batch.py └── inference_mini_batch.py /xydata/raw/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /model_files/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /submit/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !submit_demo.npy 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .xygraph import XYGraphP1 2 | from .xygraph_no_valid import XYGraphP1_no_valid 3 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import MLP, MLPLinear 2 | from .gcn import GCN 3 | from .sage import SAGE 4 | from .sage_neighsampler import SAGE_NeighSampler 5 | from .gat import GAT, GATv2 6 | from .gat_neighsampler import GAT_NeighSampler, GATv2_NeighSampler -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | from datetime import datetime 4 | import shutil 5 | 6 | 7 | def prepare_folder(name, model_name): 8 | model_dir = f'./model_files/{name}/{model_name}/' 9 | 10 | if os.path.exists(model_dir): 11 | shutil.rmtree(model_dir) 12 | os.makedirs(model_dir) 13 | return model_dir 14 | 15 | def prepare_tune_folder(name, model_name): 16 | str_time = datetime.strftime(datetime.now(), '%Y%m%d_%H%M%S') 17 | tune_model_dir = f'./tune_results/{name}/{model_name}/{str_time}/' 18 | 19 | if os.path.exists(tune_model_dir): 20 | print(f'rm tune_model_dir {tune_model_dir}') 21 | shutil.rmtree(tune_model_dir) 22 | os.makedirs(tune_model_dir) 23 | print(f'make tune_model_dir {tune_model_dir}') 24 | return tune_model_dir 25 | 26 | def save_preds_and_params(parameters, preds, model, file): 27 | save_dict = {'parameters':parameters, 'preds': preds, 'params': model.state_dict() 28 | , 'nparams': sum(p.numel() for p in model.parameters())} 29 | torch.save(save_dict, file) 30 | return 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /models/sage.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from torch import Tensor 4 | from torch_sparse import SparseTensor 5 | import torch 6 | import torch.nn.functional as F 7 | from torch_geometric.nn import SAGEConv 8 | 9 | class SAGE(torch.nn.Module): 10 | def __init__(self 11 | , in_channels 12 | , hidden_channels 13 | , out_channels 14 | , num_layers 15 | , dropout 16 | , batchnorm=True): 17 | super(SAGE, self).__init__() 18 | 19 | self.convs = torch.nn.ModuleList() 20 | self.convs.append(SAGEConv(in_channels, hidden_channels)) 21 | self.bns = torch.nn.ModuleList() 22 | self.batchnorm = batchnorm 23 | if self.batchnorm: 24 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 25 | for _ in range(num_layers - 2): 26 | self.convs.append(SAGEConv(hidden_channels, hidden_channels)) 27 | if self.batchnorm: 28 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 29 | self.convs.append(SAGEConv(hidden_channels, out_channels)) 30 | 31 | self.dropout = dropout 32 | 33 | def reset_parameters(self): 34 | for conv in self.convs: 35 | conv.reset_parameters() 36 | if self.batchnorm: 37 | for bn in self.bns: 38 | bn.reset_parameters() 39 | 40 | def forward(self, x, edge_index: Union[Tensor, SparseTensor]): 41 | for i, conv in enumerate(self.convs[:-1]): 42 | x = conv(x, edge_index) 43 | if self.batchnorm: 44 | x = self.bns[i](x) 45 | x = F.relu(x) 46 | x = F.dropout(x, p=self.dropout, training=self.training) 47 | x = self.convs[-1](x, edge_index) 48 | return x.log_softmax(dim=-1) 49 | -------------------------------------------------------------------------------- /models/gcn.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from torch import Tensor 4 | from torch_sparse import SparseTensor 5 | import torch 6 | import torch.nn.functional as F 7 | from torch_geometric.nn import GCNConv 8 | 9 | ''' 10 | 此模型邻居矩阵采用sparse tensor的形式,可以大大减少计算量, 11 | 如果不使用sparse tensor形式传递,将adj_t替换成edge_index 12 | ''' 13 | class GCN(torch.nn.Module): 14 | def __init__(self 15 | , in_channels 16 | , hidden_channels 17 | , out_channels 18 | , num_layers 19 | , dropout 20 | , batchnorm=True): 21 | super(GCN, self).__init__() 22 | 23 | self.convs = torch.nn.ModuleList() 24 | self.convs.append(GCNConv(in_channels, hidden_channels, cached=True)) 25 | self.batchnorm = batchnorm 26 | if self.batchnorm: 27 | self.bns = torch.nn.ModuleList() 28 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 29 | for _ in range(num_layers - 2): 30 | self.convs.append( 31 | GCNConv(hidden_channels, hidden_channels, cached=True)) 32 | if self.batchnorm: 33 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 34 | self.convs.append(GCNConv(hidden_channels, out_channels, cached=True)) 35 | 36 | self.dropout = dropout 37 | 38 | def reset_parameters(self): 39 | for conv in self.convs: 40 | conv.reset_parameters() 41 | if self.batchnorm: 42 | for bn in self.bns: 43 | bn.reset_parameters() 44 | 45 | def forward(self, x, edge_index: Union[Tensor, SparseTensor]): 46 | for i, conv in enumerate(self.convs[:-1]): 47 | x = conv(x, edge_index) 48 | if self.batchnorm: 49 | x = self.bns[i](x) 50 | x = F.relu(x) 51 | x = F.dropout(x, p=self.dropout, training=self.training) 52 | x = self.convs[-1](x, edge_index) 53 | return x.log_softmax(dim=-1) -------------------------------------------------------------------------------- /models/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | class MLP(torch.nn.Module): 5 | def __init__(self 6 | , in_channels 7 | , hidden_channels 8 | , out_channels 9 | , num_layers 10 | , dropout 11 | , batchnorm=True): 12 | super(MLP, self).__init__() 13 | self.lins = torch.nn.ModuleList() 14 | self.lins.append(torch.nn.Linear(in_channels, hidden_channels)) 15 | self.batchnorm = batchnorm 16 | if self.batchnorm: 17 | self.bns = torch.nn.ModuleList() 18 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 19 | for _ in range(num_layers - 2): 20 | self.lins.append(torch.nn.Linear(hidden_channels, hidden_channels)) 21 | if self.batchnorm: 22 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 23 | self.lins.append(torch.nn.Linear(hidden_channels, out_channels)) 24 | 25 | self.dropout = dropout 26 | 27 | def reset_parameters(self): 28 | for lin in self.lins: 29 | lin.reset_parameters() 30 | if self.batchnorm: 31 | for bn in self.bns: 32 | bn.reset_parameters() 33 | 34 | def forward(self, x): 35 | for i, lin in enumerate(self.lins[:-1]): 36 | x = lin(x) 37 | if self.batchnorm: 38 | x = self.bns[i](x) 39 | x = F.relu(x) 40 | x = F.dropout(x, p=self.dropout, training=self.training) 41 | x = self.lins[-1](x) 42 | return F.log_softmax(x, dim=-1) 43 | 44 | 45 | 46 | class MLPLinear(torch.nn.Module): 47 | def __init__(self, in_channels, out_channels): 48 | super(MLPLinear, self).__init__() 49 | self.lin = torch.nn.Linear(in_channels, out_channels) 50 | 51 | def reset_parameters(self): 52 | self.lin.reset_parameters() 53 | 54 | def forward(self, x): 55 | return F.log_softmax(self.lin(x), dim=-1) 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 第七届信也科技杯 RobertAckley 初赛代码 2 | 这是第七届信也科技杯-欺诈用户风险识别RobertAckley的初赛代码。 测评AUC最终为0.83631,排名33。本代码主要参照比赛baseline代码 https://github.com/DGraphXinye/2022_finvcup_baseline 3 | 请在比赛网站上下载"初赛数据集.zip"文件,将zip文件中的"phase1_gdata.npz"放到路径'./xydata/raw'中。 4 | 5 | 6 | ## Environments 7 | Implementing environment: 8 | - python = 3.7.6 9 | - numpy = 1.21.2 10 | - pytorch = 1.6.0 11 | - torch_geometric = 1.7.2 12 | - torch_scatter = 2.0.8 13 | - torch_sparse = 0.6.9 14 | - networkx = 2.6.3 15 | - pandas = 1.3.5 16 | - scikit-learn = 1.0.2 17 | - lightgbm = 3.3.2 18 | - torchvision = 0.7.0 19 | - tqdm = 4.64.0 20 | 21 | 详细见requirements.txt 22 | 23 | - GPU: RTX A4000 24 | 25 | 26 | ## Training 27 | 28 | - **GraphSAGE (NeighborSampler)** 29 | ```bash 30 | python train_mini_batch.py --model sage_neighsampler --epochs 200 --device 0 31 | python inference_mini_batch.py --model sage_neighsampler --device 0 32 | ``` 33 | 34 | ## Results: 35 | 36 | | Methods | Valid AUC | Test AUC | 37 | | :---- | ---- | ---- | 38 | | GraphSAGE (NeighborSampler) embedding + features + LightGBM | 0.8518 | **0.8363** | 39 | 40 | ## 解决方案及算法方案 41 | 42 | 本次比赛团队的解决方案主要包括三步:基于GraphSAGE的节点Embedding(与baseline一致),手工加入时序等特征,通过LightGBM分类 43 | 44 | 1. 基于GraphSAGE的节点Embedding(与baseline一致) 45 | 46 | ​ 基于baseline代码中GraphSAGE(NeighborSampler)的AUC最高,团队使用该网络对数据集中的节点进行embedding。网络训练与baseline中一致,修改的点为将17-128-2的GraphSAGE原模型修改为12-128-64-2的新模型,最后一层为64-2的线性层作为分类器。网络训练完成后,只取GraphSAGE的前两层,如此便将所有节点转换成了64维的向量。 47 | 48 | ​ 值得注意的是,得到embedding时不再采用验证集(能使测试AUC上升0.003左右),因此inference_mini_batch.py中使用的是XYGraphP1_no_valid作为数据集类。 49 | 50 | 2. 手工加入时序等特征 51 | 52 | 最终每个节点为201维的特征向量(由以下特征向量直接拼接得到),其中: 53 | 54 | - 点的embedding特征向量,64维 55 | - 节点本身的特征向量,17维 56 | - 节点的相邻节点的类别的数量,有4类节点,测试节点为未知节点,算作0类和1类各一个,共4维。考虑到数据集为单向边,因此将节点出边入边分别分开计算,因此总共4x2为8维。 57 | - 节点的边的类别,共有11类边,统计节点的边的不同类别的数量。同理出边入边分别分开统计,共11x2为22维。 58 | - 节点的随时间的边的数量。将边的时序特征均分为45份,统计改节点45个时间段边的数量。同理出边入边分别分开统计,共45x2为90维。 59 | 60 | 3. LightGBM分类 61 | 62 | ​ 最终通过LightGBM分类器分类,参数设置具体见代码,1200个epoch的设置是由之前包含验证集的相同设置的实验推测而来。 63 | 64 | ## 复现流程 65 | 66 | 1. 首先通过python train_mini_batch.py --model sage_neighsampler --epochs 200 --device 0命名训练网络。 67 | 2. 网络Embedding,手工特征,LightGBM分类只需要运行python inference_mini_batch.py --model sage_neighsampler --device 0命令即可。 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /utils/evaluator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.metrics import roc_auc_score 5 | try: 6 | import torch 7 | except ImportError: 8 | torch = None 9 | 10 | ### Evaluator for node property prediction 11 | class Evaluator: 12 | def __init__(self, eval_metric): 13 | if eval_metric not in ['acc', 'auc']: 14 | raise ValueError('eval_metric should be acc or auc') 15 | 16 | self.eval_metric = eval_metric 17 | 18 | def _check_input(self, y_true, y_pred): 19 | ''' 20 | y_true: numpy ndarray or torch tensor of shape (num_node) 21 | y_pred: numpy ndarray or torch tensor of shape (num_node, num_tasks) 22 | ''' 23 | 24 | # converting to torch.Tensor to numpy on cpu 25 | if torch is not None and isinstance(y_true, torch.Tensor): 26 | y_true = y_true.detach().cpu().numpy() 27 | 28 | if torch is not None and isinstance(y_pred, torch.Tensor): 29 | y_pred = y_pred.detach().cpu().numpy() 30 | 31 | ## check type 32 | if not (isinstance(y_true, np.ndarray) and isinstance(y_true, np.ndarray)): 33 | raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor') 34 | 35 | if not y_pred.ndim == 2: 36 | raise RuntimeError('y_pred must to 2-dim arrray, {}-dim array given'.format(y_true.ndim)) 37 | 38 | return y_true, y_pred 39 | 40 | def eval(self, y_true, y_pred): 41 | if self.eval_metric == 'auc': 42 | y_true, y_pred = self._check_input(y_true, y_pred) 43 | return self._eval_rocauc(y_true, y_pred) 44 | if self.eval_metric == 'acc': 45 | y_true, y_pred = self._check_input(y_true, y_pred) 46 | return self._eval_acc(y_true, y_pred) 47 | 48 | 49 | def _eval_rocauc(self, y_true, y_pred): 50 | ''' 51 | compute ROC-AUC and AP score averaged across tasks 52 | ''' 53 | 54 | if y_pred.shape[1] ==2: 55 | auc = roc_auc_score(y_true, y_pred[:, 1]) 56 | else: 57 | onehot_code = np.eye(y_pred.shape[1]) 58 | y_true_onehot = onehot_code[y_true] 59 | auc = roc_auc_score(y_true_onehot, y_pred) 60 | 61 | return {'auc': auc} 62 | 63 | def _eval_acc(self, y_true, y_pred): 64 | y_pred = y_pred.argmax(axis=-1) 65 | 66 | correct = y_true == y_pred 67 | acc = float(np.sum(correct))/len(correct) 68 | 69 | return {'acc': acc} 70 | 71 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work 2 | certifi @ file:///opt/conda/conda-bld/certifi_1655968806487/work/certifi 3 | charset-normalizer==2.1.0 4 | cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work 5 | debugpy @ file:///tmp/build/80754af9/debugpy_1637091426235/work 6 | decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work 7 | entrypoints @ file:///tmp/build/80754af9/entrypoints_1649908585034/work 8 | fonttools==4.25.0 9 | googledrivedownloader==0.4 10 | idna==3.3 11 | importlib-metadata==4.12.0 12 | ipykernel @ file:///tmp/build/80754af9/ipykernel_1646983213839/work/dist/ipykernel-6.9.1-py3-none-any.whl 13 | ipython @ file:///tmp/build/80754af9/ipython_1643818144432/work 14 | isodate==0.6.1 15 | jedi @ file:///tmp/build/80754af9/jedi_1644299024593/work 16 | Jinja2==3.1.2 17 | joblib==1.1.0 18 | jupyter-client @ file:///opt/conda/conda-bld/jupyter_client_1650622202839/work 19 | jupyter-core @ file:///opt/conda/conda-bld/jupyter_core_1651671229925/work 20 | kiwisolver @ file:///opt/conda/conda-bld/kiwisolver_1653292039266/work 21 | lightgbm==3.3.2 22 | MarkupSafe==2.1.1 23 | matplotlib @ file:///tmp/build/80754af9/matplotlib-suite_1647441664166/work 24 | matplotlib-inline @ file:///tmp/build/80754af9/matplotlib-inline_1628242447089/work 25 | mkl-fft==1.3.1 26 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626179032232/work 27 | mkl-service==2.4.0 28 | munkres==1.1.4 29 | nest-asyncio @ file:///tmp/build/80754af9/nest-asyncio_1649847911453/work 30 | networkx==2.6.3 31 | numpy @ file:///opt/conda/conda-bld/numpy_and_numpy_base_1653915516269/work 32 | packaging @ file:///tmp/build/80754af9/packaging_1637314298585/work 33 | pandas==1.3.5 34 | parso @ file:///opt/conda/conda-bld/parso_1641458642106/work 35 | pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work 36 | pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work 37 | Pillow==9.0.1 38 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1633440160888/work 39 | ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl 40 | Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work 41 | pyparsing==3.0.9 42 | python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work 43 | python-louvain==0.16 44 | pytz==2022.1 45 | pyzmq @ file:///tmp/build/80754af9/pyzmq_1638434924971/work 46 | rdflib==6.1.1 47 | requests==2.28.1 48 | scikit-learn==1.0.2 49 | scipy==1.7.3 50 | six @ file:///tmp/build/80754af9/six_1644875935023/work 51 | threadpoolctl==3.1.0 52 | torch==1.6.0 53 | torch-geometric==1.7.2 54 | torch-scatter==2.0.8 55 | torch-sparse==0.6.9 56 | torch-spline-conv==1.2.1 57 | torchvision==0.7.0 58 | tornado @ file:///tmp/build/80754af9/tornado_1606942283357/work 59 | tqdm==4.64.0 60 | traitlets @ file:///tmp/build/80754af9/traitlets_1636710298902/work 61 | typing_extensions==4.3.0 62 | urllib3==1.26.9 63 | wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work 64 | zipp==3.8.0 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | xydata/processed 2 | tmp.ipynb 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | 126 | # Spyder project settings 127 | .spyderproject 128 | .spyproject 129 | 130 | # Rope project settings 131 | .ropeproject 132 | 133 | # mkdocs documentation 134 | /site 135 | 136 | # mypy 137 | .mypy_cache/ 138 | .dmypy.json 139 | dmypy.json 140 | 141 | # Pyre type checker 142 | .pyre/ 143 | 144 | # pytype static type analyzer 145 | .pytype/ 146 | 147 | # Cython debug symbols 148 | cython_debug/ 149 | 150 | # PyCharm 151 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 152 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 153 | # and can be added to the global gitignore or merged into this file. For a more nuclear 154 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 155 | #.idea/ 156 | -------------------------------------------------------------------------------- /utils/xygraph.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, List 2 | import os.path as osp 3 | 4 | import numpy as np 5 | import torch 6 | from torch_geometric.data import InMemoryDataset 7 | from torch_geometric.data import Data 8 | 9 | 10 | def read_xygraphp1(folder): 11 | print('read_xygraphp1') 12 | names = ['phase1_gdata.npz'] 13 | items = [np.load(folder+'/'+name) for name in names] 14 | 15 | x = items[0]['x'] 16 | y = items[0]['y'].reshape(-1, 1) 17 | edge_index = items[0]['edge_index'] 18 | edge_type = items[0]['edge_type'] 19 | np.random.seed(42) 20 | train_mask_t = items[0]['train_mask'] 21 | np.random.shuffle(train_mask_t) 22 | train_mask = train_mask_t[:int(len(train_mask_t)/10*6)] 23 | valid_mask = train_mask_t[int(len(train_mask_t)/10*6):] 24 | test_mask = items[0]['test_mask'] 25 | 26 | x = torch.tensor(x, dtype=torch.float).contiguous() 27 | y = torch.tensor(y, dtype=torch.int64) 28 | edge_index = torch.tensor(edge_index.transpose(), 29 | dtype=torch.int64).contiguous() 30 | edge_type = torch.tensor(edge_type, dtype=torch.float) 31 | train_mask = torch.tensor(train_mask, dtype=torch.int64) 32 | valid_mask = torch.tensor(valid_mask, dtype=torch.int64) 33 | test_mask = torch.tensor(test_mask, dtype=torch.int64) 34 | 35 | data = Data(x=x, edge_index=edge_index, edge_attr=edge_type, y=y) 36 | data.train_mask = train_mask 37 | data.valid_mask = valid_mask 38 | data.test_mask = test_mask 39 | 40 | return data 41 | 42 | 43 | class XYGraphP1(InMemoryDataset): 44 | r""" 45 | Args: 46 | root (string): Root directory where the dataset should be saved. 47 | name (string): The name of the dataset (:obj:`"xygraphp1"`). 48 | transform (callable, optional): A function/transform that takes in an 49 | :obj:`torch_geometric.data.Data` object and returns a transformed 50 | version. The data object will be transformed before every access. 51 | (default: :obj:`None`) 52 | pre_transform (callable, optional): A function/transform that takes in 53 | an :obj:`torch_geometric.data.Data` object and returns a 54 | transformed version. The data object will be transformed before 55 | being saved to disk. (default: :obj:`None`) 56 | """ 57 | 58 | url = '' 59 | 60 | def __init__(self, root: str, name: str, 61 | transform: Optional[Callable] = None, 62 | pre_transform: Optional[Callable] = None): 63 | 64 | self.name = name 65 | super().__init__(root, transform, pre_transform) 66 | self.data, self.slices = torch.load(self.processed_paths[0]) 67 | 68 | @property 69 | def raw_dir(self) -> str: 70 | return osp.join(self.root, self.name, 'raw') 71 | 72 | @property 73 | def processed_dir(self) -> str: 74 | return osp.join(self.root, self.name, 'processed') 75 | 76 | @property 77 | def raw_file_names(self) -> List[str]: 78 | names = ['phase1_gdata.npz'] 79 | return names 80 | 81 | @property 82 | def processed_file_names(self) -> str: 83 | return 'data.pt' 84 | 85 | def download(self): 86 | pass 87 | # for name in self.raw_file_names: 88 | # download_url('{}/{}'.format(self.url, name), self.raw_dir) 89 | 90 | def process(self): 91 | data = read_xygraphp1(self.raw_dir) 92 | data = data if self.pre_transform is None else self.pre_transform(data) 93 | torch.save(self.collate([data]), self.processed_paths[0]) 94 | 95 | def __repr__(self) -> str: 96 | return f'{self.name}()' 97 | -------------------------------------------------------------------------------- /utils/xygraph_no_valid.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Callable, List 2 | import os.path as osp 3 | 4 | import numpy as np 5 | import torch 6 | from torch_geometric.data import InMemoryDataset 7 | from torch_geometric.data import Data 8 | 9 | 10 | def read_xygraphp1(folder): 11 | print('read_xygraphp_no_valid') 12 | names = ['phase1_gdata.npz'] 13 | items = [np.load(folder+'/'+name) for name in names] 14 | 15 | x = items[0]['x'] 16 | y = items[0]['y'].reshape(-1, 1) 17 | edge_index = items[0]['edge_index'] 18 | edge_type = items[0]['edge_type'] 19 | np.random.seed(42) 20 | train_mask_t = items[0]['train_mask'] 21 | np.random.shuffle(train_mask_t) 22 | # train_mask = train_mask_t[:int(len(train_mask_t)/10*6)] 23 | # valid_mask = train_mask_t[int(len(train_mask_t)/10*6):] 24 | test_mask = items[0]['test_mask'] 25 | 26 | x = torch.tensor(x, dtype=torch.float).contiguous() 27 | y = torch.tensor(y, dtype=torch.int64) 28 | edge_index = torch.tensor(edge_index.transpose(), 29 | dtype=torch.int64).contiguous() 30 | edge_type = torch.tensor(edge_type, dtype=torch.float) 31 | train_mask_t = torch.tensor(train_mask_t, dtype=torch.float) 32 | # train_mask = torch.tensor(train_mask, dtype=torch.int64) 33 | # valid_mask = torch.tensor(valid_mask, dtype=torch.int64) 34 | test_mask = torch.tensor(test_mask, dtype=torch.int64) 35 | 36 | data = Data(x=x, edge_index=edge_index, edge_attr=edge_type, y=y) 37 | data.train_mask = train_mask_t 38 | # data.train_mask = train_mask 39 | # data.valid_mask = valid_mask 40 | data.test_mask = test_mask 41 | 42 | return data 43 | 44 | 45 | class XYGraphP1_no_valid(InMemoryDataset): 46 | r""" 47 | Args: 48 | root (string): Root directory where the dataset should be saved. 49 | name (string): The name of the dataset (:obj:`"xygraphp1"`). 50 | transform (callable, optional): A function/transform that takes in an 51 | :obj:`torch_geometric.data.Data` object and returns a transformed 52 | version. The data object will be transformed before every access. 53 | (default: :obj:`None`) 54 | pre_transform (callable, optional): A function/transform that takes in 55 | an :obj:`torch_geometric.data.Data` object and returns a 56 | transformed version. The data object will be transformed before 57 | being saved to disk. (default: :obj:`None`) 58 | """ 59 | 60 | url = '' 61 | 62 | def __init__(self, root: str, name: str, 63 | transform: Optional[Callable] = None, 64 | pre_transform: Optional[Callable] = None): 65 | 66 | self.name = name 67 | super().__init__(root, transform, pre_transform) 68 | self.data, self.slices = torch.load(self.processed_paths[0]) 69 | 70 | @property 71 | def raw_dir(self) -> str: 72 | return osp.join(self.root, self.name, 'raw_no_valid') 73 | 74 | @property 75 | def processed_dir(self) -> str: 76 | return osp.join(self.root, self.name, 'processed_no_valid') 77 | 78 | @property 79 | def raw_file_names(self) -> List[str]: 80 | names = ['phase1_gdata.npz'] 81 | return names 82 | 83 | @property 84 | def processed_file_names(self) -> str: 85 | return 'data.pt' 86 | 87 | def download(self): 88 | pass 89 | # for name in self.raw_file_names: 90 | # download_url('{}/{}'.format(self.url, name), self.raw_dir) 91 | 92 | def process(self): 93 | data = read_xygraphp1(self.raw_dir) 94 | data = data if self.pre_transform is None else self.pre_transform(data) 95 | torch.save(self.collate([data]), self.processed_paths[0]) 96 | 97 | def __repr__(self) -> str: 98 | return f'{self.name}()' 99 | -------------------------------------------------------------------------------- /models/gat.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from torch import Tensor 4 | from torch_sparse import SparseTensor 5 | import torch 6 | import torch.nn.functional as F 7 | from torch_geometric.nn import GATConv, GATv2Conv 8 | 9 | class GAT(torch.nn.Module): 10 | def __init__(self 11 | , in_channels 12 | , hidden_channels 13 | , out_channels 14 | , num_layers 15 | , dropout 16 | , layer_heads = [] 17 | , batchnorm=True): 18 | super(GAT, self).__init__() 19 | 20 | self.convs = torch.nn.ModuleList() 21 | self.convs.append(GATConv(in_channels, hidden_channels, heads=layer_heads[0], concat=True)) 22 | self.bns = torch.nn.ModuleList() 23 | self.batchnorm = batchnorm 24 | if self.batchnorm: 25 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0])) 26 | for _ in range(num_layers - 2): 27 | self.convs.append(GATConv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True)) 28 | if self.batchnorm: 29 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1])) 30 | self.convs.append(GATConv(hidden_channels*layer_heads[num_layers-2] 31 | , out_channels 32 | , heads=layer_heads[num_layers-1] 33 | , concat=False)) 34 | 35 | self.dropout = dropout 36 | 37 | def reset_parameters(self): 38 | for conv in self.convs: 39 | conv.reset_parameters() 40 | if self.batchnorm: 41 | for bn in self.bns: 42 | bn.reset_parameters() 43 | 44 | def forward(self, x, edge_index: Union[Tensor, SparseTensor]): 45 | for i, conv in enumerate(self.convs[:-1]): 46 | x = conv(x, edge_index) 47 | if self.batchnorm: 48 | x = self.bns[i](x) 49 | x = F.relu(x) 50 | x = F.dropout(x, p=self.dropout, training=self.training) 51 | x = self.convs[-1](x, edge_index) 52 | return x.log_softmax(dim=-1) 53 | 54 | 55 | 56 | 57 | class GATv2(torch.nn.Module): 58 | def __init__(self 59 | , in_channels 60 | , hidden_channels 61 | , out_channels 62 | , num_layers 63 | , dropout 64 | , layer_heads = [] 65 | , batchnorm=True): 66 | super(GATv2, self).__init__() 67 | 68 | self.convs = torch.nn.ModuleList() 69 | self.convs.append(GATv2Conv(in_channels, hidden_channels, heads=layer_heads[0], concat=True)) 70 | self.bns = torch.nn.ModuleList() 71 | self.batchnorm = batchnorm 72 | if self.batchnorm: 73 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0])) 74 | for _ in range(num_layers - 2): 75 | self.convs.append(GATv2Conv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True)) 76 | if self.batchnorm: 77 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1])) 78 | self.convs.append(GATv2Conv(hidden_channels*layer_heads[num_layers-2] 79 | , out_channels 80 | , heads=layer_heads[num_layers-1] 81 | , concat=False)) 82 | 83 | self.dropout = dropout 84 | 85 | def reset_parameters(self): 86 | for conv in self.convs: 87 | conv.reset_parameters() 88 | if self.batchnorm: 89 | for bn in self.bns: 90 | bn.reset_parameters() 91 | 92 | def forward(self, x, edge_index: Union[Tensor, SparseTensor]): 93 | for i, conv in enumerate(self.convs[:-1]): 94 | x = conv(x, edge_index) 95 | if self.batchnorm: 96 | x = self.bns[i](x) 97 | x = F.relu(x) 98 | x = F.dropout(x, p=self.dropout, training=self.training) 99 | x = self.convs[-1](x, edge_index) 100 | return x.log_softmax(dim=-1) -------------------------------------------------------------------------------- /models/sage_neighsampler.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from torch import Tensor 4 | from torch_sparse import SparseTensor 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch_geometric.nn import SAGEConv 9 | from tqdm import tqdm 10 | 11 | 12 | class SAGE_NeighSampler(torch.nn.Module): 13 | def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout, batchnorm=True): 14 | super(SAGE_NeighSampler, self).__init__() 15 | 16 | self.convs = torch.nn.ModuleList() 17 | self.convs.append(SAGEConv(in_channels, hidden_channels)) 18 | self.bns = torch.nn.ModuleList() 19 | self.batchnorm = batchnorm 20 | self.num_layers = num_layers 21 | if self.batchnorm: 22 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 23 | for i in range(num_layers - 2): 24 | self.convs.append(SAGEConv(hidden_channels, hidden_channels)) 25 | if self.batchnorm: 26 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels)) 27 | self.convs.append(SAGEConv(hidden_channels, int(hidden_channels/2))) 28 | 29 | self.dropout = dropout 30 | 31 | self.fc = nn.Linear(int(hidden_channels/2), out_channels) 32 | 33 | def reset_parameters(self): 34 | for conv in self.convs: 35 | conv.reset_parameters() 36 | if self.batchnorm: 37 | for bn in self.bns: 38 | bn.reset_parameters() 39 | 40 | def forward(self, x, adjs): 41 | for i, (edge_index, _, size) in enumerate(adjs): 42 | x_target = x[:size[1]] 43 | x = self.convs[i]((x, x_target), edge_index) 44 | if i != self.num_layers-1: 45 | if self.batchnorm: 46 | x = self.bns[i](x) 47 | x = F.relu(x) 48 | x = F.dropout(x, p=0.5, training=self.training) 49 | 50 | x = self.fc(x) 51 | 52 | return x.log_softmax(dim=-1) 53 | 54 | ''' 55 | subgraph_loader: size = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1], 56 | batch_size=**, shuffle=False, 57 | num_workers=12) 58 | You can also sample the complete k-hop neighborhood, but this is rather expensive (especially for Reddit). 59 | We apply here trick here to compute the node embeddings efficiently: 60 | Instead of sampling multiple layers for a mini-batch, we instead compute the node embeddings layer-wise. 61 | Doing this exactly k times mimics a k-layer GNN. 62 | ''' 63 | 64 | def inference_all(self, data): 65 | x, adj_t = data.x, data.adj_t 66 | for i, conv in enumerate(self.convs[:-1]): 67 | x = conv(x, adj_t) 68 | if self.batchnorm: 69 | x = self.bns[i](x) 70 | x = F.relu(x) 71 | x = F.dropout(x, p=self.dropout, training=self.training) 72 | x = self.convs[-1](x, adj_t) 73 | 74 | x = self.fc(x) 75 | 76 | return x.log_softmax(dim=-1) 77 | 78 | def inference(self, x_all, layer_loader, device): 79 | pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80) 80 | pbar.set_description('Evaluating') 81 | 82 | # Compute representations of nodes layer by layer, using *all* 83 | # available edges. This leads to faster computation in contrast to 84 | # immediately computing the final representations of each batch. 85 | for i in range(self.num_layers): 86 | xs = [] 87 | for batch_size, n_id, adj in layer_loader: 88 | edge_index, _, size = adj.to(device) 89 | x = x_all[n_id].to(device) 90 | x_target = x[:size[1]] 91 | x = self.convs[i]((x, x_target), edge_index) 92 | if i != self.num_layers - 1: 93 | x = F.relu(x) 94 | if self.batchnorm: 95 | x = self.bns[i](x) 96 | xs.append(x) 97 | 98 | pbar.update(batch_size) 99 | 100 | x_all = torch.cat(xs, dim=0) 101 | 102 | x_all = self.fc(x_all) 103 | 104 | pbar.close() 105 | 106 | return x_all.log_softmax(dim=-1) 107 | 108 | def to_embedding(self, x_all, layer_loader, device): 109 | pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80) 110 | pbar.set_description('Evaluating') 111 | 112 | for i in range(self.num_layers): 113 | xs = [] 114 | for batch_size, n_id, adj in layer_loader: 115 | edge_index, _, size = adj.to(device) 116 | x = x_all[n_id].to(device) 117 | x_target = x[:size[1]] 118 | x = self.convs[i]((x, x_target), edge_index) 119 | if i != self.num_layers - 1: 120 | x = F.relu(x) 121 | if self.batchnorm: 122 | x = self.bns[i](x) 123 | xs.append(x) 124 | 125 | pbar.update(batch_size) 126 | 127 | x_all = torch.cat(xs, dim=0) 128 | 129 | pbar.close() 130 | 131 | return x_all 132 | -------------------------------------------------------------------------------- /train_mini_batch.py: -------------------------------------------------------------------------------- 1 | # dataset name: XYGraphP1 2 | 3 | from utils import XYGraphP1 4 | from utils.utils import prepare_folder 5 | from utils.evaluator import Evaluator 6 | from torch_geometric.data import NeighborSampler 7 | from models import SAGE_NeighSampler, GAT_NeighSampler, GATv2_NeighSampler 8 | from tqdm import tqdm 9 | 10 | import argparse 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | import torch.nn as nn 15 | 16 | import torch_geometric.transforms as T 17 | from torch_sparse import SparseTensor 18 | from torch_geometric.utils import to_undirected 19 | import pandas as pd 20 | 21 | eval_metric = 'auc' 22 | 23 | sage_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7 24 | } 25 | 26 | gat_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7, 'layer_heads': [4, 1] 27 | } 28 | 29 | gatv2_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-6, 'layer_heads': [4, 1] 30 | } 31 | 32 | 33 | def train(epoch, train_loader, model, data, train_idx, optimizer, device, no_conv=False): 34 | model.train() 35 | 36 | pbar = tqdm(total=train_idx.size(0), ncols=80) 37 | pbar.set_description(f'Epoch {epoch:02d}') 38 | 39 | total_loss = total_correct = 0 40 | for batch_size, n_id, adjs in train_loader: 41 | # `adjs` holds a list of `(edge_index, e_id, size)` tuples. 42 | adjs = [adj.to(device) for adj in adjs] 43 | 44 | optimizer.zero_grad() 45 | out = model(data.x[n_id], adjs) 46 | loss = F.nll_loss(out, data.y[n_id[:batch_size]]) 47 | loss.backward() 48 | optimizer.step() 49 | 50 | total_loss += float(loss) 51 | pbar.update(batch_size) 52 | 53 | pbar.close() 54 | loss = total_loss / len(train_loader) 55 | 56 | return loss 57 | 58 | 59 | @torch.no_grad() 60 | def test(layer_loader, model, data, split_idx, device, no_conv=False): 61 | # data.y is labels of shape (N, ) 62 | model.eval() 63 | 64 | out = model.inference(data.x, layer_loader, device) 65 | # out = model.inference_all(data) 66 | y_pred = out.exp() # (N,num_classes) 67 | 68 | losses = dict() 69 | for key in ['train', 'valid', 'test']: 70 | node_id = split_idx[key] 71 | node_id = node_id.to(device) 72 | losses[key] = F.nll_loss(out[node_id], data.y[node_id]).item() 73 | 74 | return losses, y_pred 75 | 76 | 77 | def main(): 78 | parser = argparse.ArgumentParser(description='minibatch_gnn_models') 79 | parser.add_argument('--device', type=int, default=0) 80 | parser.add_argument('--dataset', type=str, default='XYGraphP1') 81 | parser.add_argument('--log_steps', type=int, default=10) 82 | parser.add_argument('--model', type=str, default='mlp') 83 | parser.add_argument('--epochs', type=int, default=100) 84 | 85 | args = parser.parse_args() 86 | print(args) 87 | 88 | no_conv = False 89 | if args.model in ['mlp']: 90 | no_conv = True 91 | 92 | device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' 93 | device = torch.device(device) 94 | 95 | dataset = XYGraphP1(root='./', name='xydata', transform=T.ToSparseTensor()) 96 | 97 | nlabels = dataset.num_classes 98 | if args.dataset == 'XYGraphP1': 99 | nlabels = 2 100 | 101 | data = dataset[0] 102 | data.adj_t = data.adj_t.to_symmetric() 103 | 104 | if args.dataset in ['XYGraphP1']: 105 | x = data.x 106 | x = (x-x.mean(0))/x.std(0) 107 | data.x = x 108 | if data.y.dim() == 2: 109 | data.y = data.y.squeeze(1) 110 | 111 | split_idx = {'train': data.train_mask, 112 | 'valid': data.valid_mask, 'test': data.test_mask} 113 | 114 | data = data.to(device) 115 | train_idx = split_idx['train'].to(device) 116 | 117 | model_dir = prepare_folder(args.dataset, args.model) 118 | print('model_dir:', model_dir) 119 | 120 | train_loader = NeighborSampler(data.adj_t, node_idx=train_idx, sizes=[ 121 | 10, 5], batch_size=1024, shuffle=True, num_workers=12) 122 | layer_loader = NeighborSampler( 123 | data.adj_t, node_idx=None, sizes=[-1], batch_size=4096, shuffle=False, num_workers=12) 124 | 125 | if args.model == 'sage_neighsampler': 126 | para_dict = sage_neighsampler_parameters 127 | model_para = sage_neighsampler_parameters.copy() 128 | model_para.pop('lr') 129 | model_para.pop('l2') 130 | model = SAGE_NeighSampler( 131 | in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device) 132 | if args.model == 'gat_neighsampler': 133 | para_dict = gat_neighsampler_parameters 134 | model_para = gat_neighsampler_parameters.copy() 135 | model_para.pop('lr') 136 | model_para.pop('l2') 137 | model = GAT_NeighSampler( 138 | in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device) 139 | if args.model == 'gatv2_neighsampler': 140 | para_dict = gatv2_neighsampler_parameters 141 | model_para = gatv2_neighsampler_parameters.copy() 142 | model_para.pop('lr') 143 | model_para.pop('l2') 144 | model = GATv2_NeighSampler( 145 | in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device) 146 | 147 | print(f'Model {args.model} initialized') 148 | 149 | model.reset_parameters() 150 | optimizer = torch.optim.Adam( 151 | model.parameters(), lr=para_dict['lr'], weight_decay=para_dict['l2']) 152 | min_valid_loss = 1e8 153 | 154 | for epoch in range(1, args.epochs+1): 155 | loss = train(epoch, train_loader, model, data, 156 | train_idx, optimizer, device, no_conv) 157 | losses, out = test(layer_loader, model, data, 158 | split_idx, device, no_conv) 159 | train_loss, valid_loss, test_loss = losses['train'], losses['valid'], losses['test'] 160 | 161 | if valid_loss < min_valid_loss: 162 | min_valid_loss = valid_loss 163 | torch.save(model.state_dict(), model_dir+'model.pt') 164 | 165 | if epoch % args.log_steps == 0: 166 | print(f'Epoch: {epoch:02d}, ' 167 | f'Loss: {loss:.4f}, ' 168 | f'Train: {100 * train_loss:.3f}%, ' 169 | f'Valid: {100 * valid_loss:.3f}% ' 170 | f'Test: {100 * test_loss:.3f}%') 171 | 172 | 173 | if __name__ == "__main__": 174 | main() 175 | -------------------------------------------------------------------------------- /inference_mini_batch.py: -------------------------------------------------------------------------------- 1 | # dataset name: XYGraphP1_no_valid 2 | 3 | import pickle 4 | from utils import XYGraphP1_no_valid 5 | from utils.utils import prepare_folder 6 | from utils.evaluator import Evaluator 7 | from torch_geometric.data import NeighborSampler 8 | from models import SAGE_NeighSampler, GAT_NeighSampler, GATv2_NeighSampler 9 | from tqdm import tqdm 10 | 11 | import argparse 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | import torch.nn as nn 16 | 17 | import torch_geometric.transforms as T 18 | from torch_sparse import SparseTensor 19 | from torch_geometric.utils import to_undirected 20 | import pandas as pd 21 | import numpy as np 22 | 23 | 24 | from lightgbm import LGBMClassifier 25 | from sklearn.metrics import accuracy_score 26 | from sklearn.model_selection import GridSearchCV 27 | import joblib 28 | 29 | 30 | sage_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7 31 | } 32 | 33 | gat_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7, 'layer_heads': [4, 1] 34 | } 35 | 36 | gatv2_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-6, 'layer_heads': [4, 1] 37 | } 38 | 39 | 40 | @torch.no_grad() 41 | def to_embedding(layer_loader, model, data, device, no_conv=False): 42 | # data.y is labels of shape (N, ) 43 | model.eval() 44 | 45 | out = model.to_embedding(data.x, layer_loader, device) 46 | print("Model embedding data : ", out.shape) 47 | 48 | return out 49 | 50 | 51 | def main(): 52 | parser = argparse.ArgumentParser(description='minibatch_gnn_models') 53 | parser.add_argument('--device', type=int, default=0) 54 | parser.add_argument('--dataset', type=str, default='XYGraphP1') 55 | parser.add_argument('--log_steps', type=int, default=10) 56 | parser.add_argument('--model', type=str, default='mlp') 57 | parser.add_argument('--epochs', type=int, default=100) 58 | 59 | args = parser.parse_args() 60 | print(args) 61 | 62 | no_conv = False 63 | if args.model in ['mlp']: 64 | no_conv = True 65 | 66 | device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' 67 | device = torch.device(device) 68 | 69 | dataset = XYGraphP1_no_valid( 70 | root='./', name='xydata', transform=T.ToSparseTensor()) 71 | 72 | nlabels = dataset.num_classes 73 | if args.dataset == 'XYGraphP1': 74 | nlabels = 2 75 | 76 | data = dataset[0] 77 | data.adj_t = data.adj_t.to_symmetric() 78 | 79 | if args.dataset in ['XYGraphP1']: 80 | x = data.x 81 | x = (x-x.mean(0))/x.std(0) 82 | data.x = x 83 | if data.y.dim() == 2: 84 | data.y = data.y.squeeze(1) 85 | 86 | data = data.to(device) 87 | 88 | layer_loader = NeighborSampler( 89 | data.adj_t, node_idx=None, sizes=[-1], batch_size=4096, shuffle=False, num_workers=12) 90 | 91 | if args.model == 'sage_neighsampler': 92 | para_dict = sage_neighsampler_parameters 93 | model_para = sage_neighsampler_parameters.copy() 94 | model_para.pop('lr') 95 | model_para.pop('l2') 96 | model = SAGE_NeighSampler( 97 | in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device) 98 | if args.model == 'gat_neighsampler': 99 | para_dict = gat_neighsampler_parameters 100 | model_para = gat_neighsampler_parameters.copy() 101 | model_para.pop('lr') 102 | model_para.pop('l2') 103 | model = GAT_NeighSampler( 104 | in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device) 105 | if args.model == 'gatv2_neighsampler': 106 | para_dict = gatv2_neighsampler_parameters 107 | model_para = gatv2_neighsampler_parameters.copy() 108 | model_para.pop('lr') 109 | model_para.pop('l2') 110 | model = GATv2_NeighSampler( 111 | in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device) 112 | 113 | print(f'Model {args.model} initialized') 114 | 115 | model_file = './model_files/{}/{}/model.pt'.format( 116 | args.dataset, args.model) 117 | print('model_file:', model_file) 118 | model.load_state_dict(torch.load(model_file)) 119 | 120 | out = to_embedding(layer_loader, model, data, device, no_conv) 121 | 122 | embedding_train, embedding_test = out[data.train_mask], out[data.test_mask] 123 | y_train = data.y[data.train_mask] 124 | 125 | print(embedding_train.shape) 126 | print(y_train.shape) 127 | print(embedding_test.shape) 128 | 129 | with open("xydata/embedding/64_train_embedding_new.pickle", 'wb') as file: 130 | pickle.dump(embedding_train, file) 131 | file.close() 132 | 133 | with open("xydata/embedding/64_embedding_test_new.pickle", 'wb') as file: 134 | pickle.dump(embedding_test, file) 135 | file.close() 136 | 137 | with open("xydata/embedding/64_y_train_new.pickle", 'wb') as file: 138 | pickle.dump(y_train, file) 139 | file.close() 140 | 141 | with open('xydata/embedding/64_embedding_test_new.pickle', 'rb') as file: 142 | x_test_all = pickle.load(file) 143 | file.close() 144 | with open('xydata/embedding/64_train_embedding_new.pickle', 'rb') as file: 145 | x_train_all = pickle.load(file) 146 | file.close() 147 | with open('xydata/embedding/64_y_train_new.pickle', 'rb') as file: 148 | y_train_all = pickle.load(file) 149 | file.close() 150 | 151 | items = np.load('xydata/raw/phase1_gdata.npz') 152 | x = items['x'] 153 | y = items['y'].reshape(-1, 1) 154 | edge_index = items['edge_index'] 155 | edge_type = items['edge_type'] 156 | np.random.seed(42) 157 | train_mask_t = items['train_mask'] 158 | np.random.shuffle(train_mask_t) 159 | test_mask = items['test_mask'] 160 | 161 | x_train_add = torch.tensor(x[train_mask_t], dtype=torch.float).contiguous() 162 | x_test_add = torch.tensor(x[test_mask], dtype=torch.float).contiguous() 163 | 164 | set_train_mask_t = set(list(train_mask_t)) 165 | set_test_mask_t = set(list(test_mask)) 166 | set_used = set_train_mask_t | set_test_mask_t 167 | 168 | x_edge_add = np.zeros([x.shape[0], 22]) 169 | 170 | edge_type = edge_type-1 171 | 172 | x_time_add = np.zeros([x.shape[0], 90]) 173 | edge_timestamp = items['edge_timestamp'] 174 | edge_timestamp = (edge_timestamp.astype(np.int32) / 13).astype(np.int32) 175 | 176 | x_pointLabel_add = np.zeros([x.shape[0], 8]) 177 | 178 | for i in tqdm(range(len(edge_index))): 179 | if edge_index[i][0] in set_used or edge_index[i][1] in set_used: 180 | x_edge_add[edge_index[i][0]][edge_type[i]] += 1 181 | x_edge_add[edge_index[i][1]][edge_type[i]+10] += 1 182 | 183 | x_time_add[edge_index[i][0]][:edge_timestamp[i]+1] += 1 184 | x_time_add[edge_index[i][1]][45:edge_timestamp[i]+1+45] += 1 185 | 186 | if y[edge_index[i][1]] != -100: 187 | x_pointLabel_add[edge_index[i][0]][y[edge_index[i][1]]] += 1 188 | else: 189 | x_pointLabel_add[edge_index[i][0]][:2] += 1 190 | 191 | if y[edge_index[i][0]] != -100: 192 | x_pointLabel_add[edge_index[i][1]][y[edge_index[i][0]]+4] += 1 193 | else: 194 | x_pointLabel_add[edge_index[i][1]][4:6] += 1 195 | 196 | train_x_edge_add = torch.from_numpy(x_edge_add[train_mask_t]) 197 | test_x_edge_add = torch.from_numpy(x_edge_add[test_mask]) 198 | 199 | train_x_time_add = torch.from_numpy(x_time_add[train_mask_t]) 200 | test_x_time_add = torch.from_numpy(x_time_add[test_mask]) 201 | 202 | train_x_pointLabel_add = torch.from_numpy(x_pointLabel_add[train_mask_t]) 203 | test_x_pointLabel_add = torch.from_numpy(x_pointLabel_add[test_mask]) 204 | 205 | x_train_all = torch.cat((x_train_add, x_train_all, train_x_edge_add, 206 | train_x_time_add, train_x_pointLabel_add), 1) 207 | x_test_all = torch.cat( 208 | (x_test_add, x_test_all, test_x_edge_add, test_x_time_add, test_x_pointLabel_add), 1) 209 | 210 | X_train, y_train = x_train_all, y_train_all 211 | 212 | gbm = LGBMClassifier(objective='binary', 213 | # subsample=0.8, 214 | # colsample_bytree=0.8, 215 | verbosity=2, metric='auc', 216 | learning_rate=0.01, 217 | n_estimators=1200, 218 | min_child_samples=125, 219 | max_depth=7, 220 | num_leaves=128, 221 | reg_alpha=0.1, 222 | reg_lambda=0.1, 223 | # scale_pos_weight=83.7 224 | ) 225 | 226 | gbm.fit(X_train, y_train) 227 | 228 | joblib.dump(gbm, 'model_files/LGBM_model.pkl') 229 | gbm = joblib.load('model_files/LGBM_model.pkl') 230 | 231 | y_pred = gbm.predict_proba(x_test_all, num_iteration=gbm.best_iteration_) 232 | print(y_pred.shape) 233 | np.save("submit/output.npy", y_pred) 234 | 235 | 236 | if __name__ == "__main__": 237 | main() 238 | -------------------------------------------------------------------------------- /models/gat_neighsampler.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | from torch import Tensor 4 | from torch_sparse import SparseTensor 5 | import torch 6 | import torch.nn.functional as F 7 | from torch_geometric.nn import GATConv, GATv2Conv 8 | from tqdm import tqdm 9 | 10 | class GAT_NeighSampler(torch.nn.Module): 11 | def __init__(self 12 | , in_channels 13 | , hidden_channels 14 | , out_channels 15 | , num_layers 16 | , dropout 17 | , layer_heads = [] 18 | , batchnorm=True): 19 | super(GAT_NeighSampler, self).__init__() 20 | 21 | self.convs = torch.nn.ModuleList() 22 | self.batchnorm = batchnorm 23 | self.num_layers = num_layers 24 | 25 | if len(layer_heads)>1: 26 | self.convs.append(GATConv(in_channels, hidden_channels, heads=layer_heads[0], concat=True)) 27 | if self.batchnorm: 28 | self.bns = torch.nn.ModuleList() 29 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0])) 30 | for i in range(num_layers - 2): 31 | self.convs.append(GATConv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True)) 32 | if self.batchnorm: 33 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1])) 34 | self.convs.append(GATConv(hidden_channels*layer_heads[num_layers-2] 35 | , out_channels 36 | , heads=layer_heads[num_layers-1] 37 | , concat=False)) 38 | else: 39 | self.convs.append(GATConv(in_channels, out_channels, heads=layer_heads[0], concat=False)) 40 | 41 | self.dropout = dropout 42 | 43 | def reset_parameters(self): 44 | for conv in self.convs: 45 | conv.reset_parameters() 46 | if self.batchnorm: 47 | for bn in self.bns: 48 | bn.reset_parameters() 49 | 50 | 51 | def forward(self, x, adjs): 52 | for i, (edge_index, _, size) in enumerate(adjs): 53 | x_target = x[:size[1]] 54 | x = self.convs[i]((x, x_target), edge_index) 55 | if i != self.num_layers-1: 56 | if self.batchnorm: 57 | x = self.bns[i](x) 58 | x = F.relu(x) 59 | x = F.dropout(x, p=0.5, training=self.training) 60 | 61 | return x.log_softmax(dim=-1) 62 | 63 | ''' 64 | subgraph_loader: size = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1], 65 | batch_size=**, shuffle=False, 66 | num_workers=12) 67 | You can also sample the complete k-hop neighborhood, but this is rather expensive (especially for Reddit). 68 | We apply here trick here to compute the node embeddings efficiently: 69 | Instead of sampling multiple layers for a mini-batch, we instead compute the node embeddings layer-wise. 70 | Doing this exactly k times mimics a k-layer GNN. 71 | ''' 72 | 73 | def inference_all(self, data): 74 | x, adj_t = data.x, data.adj_t 75 | for i, conv in enumerate(self.convs[:-1]): 76 | x = conv(x, adj_t) 77 | if self.batchnorm: 78 | x = self.bns[i](x) 79 | x = F.relu(x) 80 | x = F.dropout(x, p=self.dropout, training=self.training) 81 | x = self.convs[-1](x, adj_t) 82 | return x.log_softmax(dim=-1) 83 | 84 | def inference(self, x_all, layer_loader, device): 85 | pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80) 86 | pbar.set_description('Evaluating') 87 | 88 | # Compute representations of nodes layer by layer, using *all* 89 | # available edges. This leads to faster computation in contrast to 90 | # immediately computing the final representations of each batch. 91 | for i in range(self.num_layers): 92 | xs = [] 93 | for batch_size, n_id, adj in layer_loader: 94 | edge_index, _, size = adj.to(device) 95 | x = x_all[n_id].to(device) 96 | x_target = x[:size[1]] 97 | x = self.convs[i]((x, x_target), edge_index) 98 | if i != self.num_layers - 1: 99 | x = F.relu(x) 100 | if self.batchnorm: 101 | x = self.bns[i](x) 102 | xs.append(x) 103 | 104 | pbar.update(batch_size) 105 | 106 | x_all = torch.cat(xs, dim=0) 107 | 108 | pbar.close() 109 | 110 | return x_all.log_softmax(dim=-1) 111 | 112 | 113 | 114 | class GATv2_NeighSampler(torch.nn.Module): 115 | def __init__(self 116 | , in_channels 117 | , hidden_channels 118 | , out_channels 119 | , num_layers 120 | , dropout 121 | , layer_heads = [] 122 | , batchnorm=True): 123 | super(GATv2_NeighSampler, self).__init__() 124 | 125 | self.convs = torch.nn.ModuleList() 126 | self.batchnorm = batchnorm 127 | self.num_layers = num_layers 128 | 129 | if len(layer_heads)>1: 130 | self.convs.append(GATv2Conv(in_channels, hidden_channels, heads=layer_heads[0], concat=True)) 131 | if self.batchnorm: 132 | self.bns = torch.nn.ModuleList() 133 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0])) 134 | for i in range(num_layers - 2): 135 | self.convs.append(GATv2Conv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True)) 136 | if self.batchnorm: 137 | self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1])) 138 | self.convs.append(GATv2Conv(hidden_channels*layer_heads[num_layers-2] 139 | , out_channels 140 | , heads=layer_heads[num_layers-1] 141 | , concat=False)) 142 | else: 143 | self.convs.append(GATv2Conv(in_channels, out_channels, heads=layer_heads[0], concat=False)) 144 | 145 | self.dropout = dropout 146 | 147 | def reset_parameters(self): 148 | for conv in self.convs: 149 | conv.reset_parameters() 150 | if self.batchnorm: 151 | for bn in self.bns: 152 | bn.reset_parameters() 153 | 154 | 155 | def forward(self, x, adjs): 156 | for i, (edge_index, _, size) in enumerate(adjs): 157 | x_target = x[:size[1]] 158 | x = self.convs[i]((x, x_target), edge_index) 159 | if i != self.num_layers-1: 160 | if self.batchnorm: 161 | x = self.bns[i](x) 162 | x = F.relu(x) 163 | x = F.dropout(x, p=0.5, training=self.training) 164 | 165 | return x.log_softmax(dim=-1) 166 | 167 | ''' 168 | subgraph_loader: size = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1], 169 | batch_size=**, shuffle=False, 170 | num_workers=12) 171 | You can also sample the complete k-hop neighborhood, but this is rather expensive (especially for Reddit). 172 | We apply here trick here to compute the node embeddings efficiently: 173 | Instead of sampling multiple layers for a mini-batch, we instead compute the node embeddings layer-wise. 174 | Doing this exactly k times mimics a k-layer GNN. 175 | ''' 176 | 177 | def inference_all(self, data): 178 | x, adj_t = data.x, data.adj_t 179 | for i, conv in enumerate(self.convs[:-1]): 180 | x = conv(x, adj_t) 181 | if self.batchnorm: 182 | x = self.bns[i](x) 183 | x = F.relu(x) 184 | x = F.dropout(x, p=self.dropout, training=self.training) 185 | x = self.convs[-1](x, adj_t) 186 | return x.log_softmax(dim=-1) 187 | 188 | def inference(self, x_all, layer_loader, device): 189 | pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80) 190 | pbar.set_description('Evaluating') 191 | 192 | # Compute representations of nodes layer by layer, using *all* 193 | # available edges. This leads to faster computation in contrast to 194 | # immediately computing the final representations of each batch. 195 | for i in range(self.num_layers): 196 | xs = [] 197 | for batch_size, n_id, adj in layer_loader: 198 | edge_index, _, size = adj.to(device) 199 | x = x_all[n_id].to(device) 200 | x_target = x[:size[1]] 201 | x = self.convs[i]((x, x_target), edge_index) 202 | if i != self.num_layers - 1: 203 | x = F.relu(x) 204 | if self.batchnorm: 205 | x = self.bns[i](x) 206 | xs.append(x) 207 | 208 | pbar.update(batch_size) 209 | 210 | x_all = torch.cat(xs, dim=0) 211 | 212 | pbar.close() 213 | 214 | return x_all.log_softmax(dim=-1) 215 | 216 | --------------------------------------------------------------------------------