├── xydata
    └── raw
    │   └── .gitignore
├── model_files
    └── .gitignore
├── submit
    └── .gitignore
├── utils
    ├── __init__.py
    ├── utils.py
    ├── evaluator.py
    ├── xygraph.py
    └── xygraph_no_valid.py
├── models
    ├── __init__.py
    ├── sage.py
    ├── gcn.py
    ├── mlp.py
    ├── gat.py
    ├── sage_neighsampler.py
    └── gat_neighsampler.py
├── README.md
├── requirements.txt
├── .gitignore
├── train_mini_batch.py
└── inference_mini_batch.py


/xydata/raw/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/model_files/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/submit/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !submit_demo.npy
3 | !.gitignore
4 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .xygraph import XYGraphP1
2 | from .xygraph_no_valid import XYGraphP1_no_valid
3 | 


--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlp import MLP, MLPLinear
2 | from .gcn import GCN
3 | from .sage import SAGE
4 | from .sage_neighsampler import SAGE_NeighSampler
5 | from .gat import GAT, GATv2
6 | from .gat_neighsampler import GAT_NeighSampler, GATv2_NeighSampler


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import os
 3 | from datetime import datetime
 4 | import shutil
 5 | 
 6 | 
 7 | def prepare_folder(name, model_name):
 8 |     model_dir = f'./model_files/{name}/{model_name}/'
 9 |    
10 |     if os.path.exists(model_dir):
11 |         shutil.rmtree(model_dir)
12 |     os.makedirs(model_dir)
13 |     return model_dir
14 | 
def prepare_tune_folder(name, model_name):
    """Create a fresh, timestamped directory for one tuning run.

    The directory is ``./tune_results/<name>/<model_name>/<timestamp>/``
    where the timestamp is the current local time formatted as
    ``YYYYmmdd_HHMMSS``. A directory that already exists at that path is
    removed before being recreated.

    Args:
        name: First path component below ``./tune_results``.
        model_name: Second path component below ``./tune_results``.

    Returns:
        The directory path (relative, with a trailing slash).
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    tune_model_dir = './tune_results/{}/{}/{}/'.format(name, model_name, stamp)

    # Only possible when two calls land within the same second.
    stale = os.path.exists(tune_model_dir)
    if stale:
        print(f'rm tune_model_dir {tune_model_dir}')
        shutil.rmtree(tune_model_dir)

    os.makedirs(tune_model_dir)
    print(f'make tune_model_dir {tune_model_dir}')
    return tune_model_dir
25 | 
def save_preds_and_params(parameters, preds, model, file):
    """Serialise a run's settings, predictions and model weights with ``torch.save``.

    The saved object is a dict with four keys:
    ``'parameters'`` (the ``parameters`` argument as given),
    ``'preds'`` (the ``preds`` argument as given),
    ``'params'`` (``model.state_dict()``) and
    ``'nparams'`` (total element count over ``model.parameters()``).

    Args:
        parameters: Run settings to store alongside the weights.
        preds: Predictions to store.
        model: Module providing ``state_dict()`` and ``parameters()``.
        file: Destination handed unchanged to ``torch.save``.

    Returns:
        None.
    """
    n_weights = 0
    for weight in model.parameters():
        n_weights += weight.numel()

    payload = dict(
        parameters=parameters,
        preds=preds,
        params=model.state_dict(),
        nparams=n_weights,
    )
    torch.save(payload, file)
31 |     
32 |     
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/models/sage.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | from torch import Tensor
 4 | from torch_sparse import SparseTensor
 5 | import torch
 6 | import torch.nn.functional as F
 7 | from torch_geometric.nn import SAGEConv
 8 | 
 9 | class SAGE(torch.nn.Module):
10 |     def __init__(self
11 |                  , in_channels
12 |                  , hidden_channels
13 |                  , out_channels
14 |                  , num_layers
15 |                  , dropout
16 |                  , batchnorm=True):
17 |         super(SAGE, self).__init__()
18 | 
19 |         self.convs = torch.nn.ModuleList()
20 |         self.convs.append(SAGEConv(in_channels, hidden_channels))
21 |         self.bns = torch.nn.ModuleList()
22 |         self.batchnorm = batchnorm
23 |         if self.batchnorm:
24 |             self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
25 |         for _ in range(num_layers - 2):
26 |             self.convs.append(SAGEConv(hidden_channels, hidden_channels))
27 |             if self.batchnorm:
28 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
29 |         self.convs.append(SAGEConv(hidden_channels, out_channels))
30 | 
31 |         self.dropout = dropout
32 | 
33 |     def reset_parameters(self):
34 |         for conv in self.convs:
35 |             conv.reset_parameters()
36 |         if self.batchnorm:
37 |             for bn in self.bns:
38 |                 bn.reset_parameters()
39 | 
40 |     def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
41 |         for i, conv in enumerate(self.convs[:-1]):
42 |             x = conv(x, edge_index)
43 |             if self.batchnorm: 
44 |                 x = self.bns[i](x)
45 |             x = F.relu(x)
46 |             x = F.dropout(x, p=self.dropout, training=self.training)
47 |         x = self.convs[-1](x, edge_index)
48 |         return x.log_softmax(dim=-1)
49 |     


--------------------------------------------------------------------------------
/models/gcn.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | from torch import Tensor
 4 | from torch_sparse import SparseTensor
 5 | import torch
 6 | import torch.nn.functional as F
 7 | from torch_geometric.nn import GCNConv
 8 | 
 9 | '''
10 | 此模型邻居矩阵采用sparse tensor的形式，可以大大减少计算量，
11 | 如果不使用sparse tensor形式传递，将adj_t替换成edge_index
12 | '''
13 | class GCN(torch.nn.Module):
14 |     def __init__(self
15 |                  , in_channels
16 |                  , hidden_channels
17 |                  , out_channels
18 |                  , num_layers
19 |                  , dropout
20 |                  , batchnorm=True):
21 |         super(GCN, self).__init__()
22 | 
23 |         self.convs = torch.nn.ModuleList()
24 |         self.convs.append(GCNConv(in_channels, hidden_channels, cached=True))
25 |         self.batchnorm = batchnorm
26 |         if self.batchnorm:
27 |             self.bns = torch.nn.ModuleList()
28 |             self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
29 |         for _ in range(num_layers - 2):
30 |             self.convs.append(
31 |                 GCNConv(hidden_channels, hidden_channels, cached=True))
32 |             if self.batchnorm: 
33 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
34 |         self.convs.append(GCNConv(hidden_channels, out_channels, cached=True))
35 | 
36 |         self.dropout = dropout
37 | 
38 |     def reset_parameters(self):
39 |         for conv in self.convs:
40 |             conv.reset_parameters()
41 |         if self.batchnorm:
42 |             for bn in self.bns:
43 |                 bn.reset_parameters()
44 | 
45 |     def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
46 |         for i, conv in enumerate(self.convs[:-1]):
47 |             x = conv(x, edge_index)
48 |             if self.batchnorm: 
49 |                 x = self.bns[i](x)
50 |             x = F.relu(x)
51 |             x = F.dropout(x, p=self.dropout, training=self.training)
52 |         x = self.convs[-1](x, edge_index)
53 |         return x.log_softmax(dim=-1)


--------------------------------------------------------------------------------
/models/mlp.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | class MLP(torch.nn.Module):
 5 |     def __init__(self
 6 |                  , in_channels
 7 |                  , hidden_channels
 8 |                  , out_channels
 9 |                  , num_layers
10 |                  , dropout
11 |                  , batchnorm=True):
12 |         super(MLP, self).__init__()
13 |         self.lins = torch.nn.ModuleList()
14 |         self.lins.append(torch.nn.Linear(in_channels, hidden_channels))
15 |         self.batchnorm = batchnorm
16 |         if self.batchnorm:
17 |             self.bns = torch.nn.ModuleList()
18 |             self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
19 |         for _ in range(num_layers - 2):
20 |             self.lins.append(torch.nn.Linear(hidden_channels, hidden_channels))
21 |             if self.batchnorm:
22 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
23 |         self.lins.append(torch.nn.Linear(hidden_channels, out_channels))
24 | 
25 |         self.dropout = dropout
26 | 
27 |     def reset_parameters(self):
28 |         for lin in self.lins:
29 |             lin.reset_parameters()
30 |         if self.batchnorm:
31 |             for bn in self.bns:
32 |                 bn.reset_parameters()
33 | 
34 |     def forward(self, x):    
35 |         for i, lin in enumerate(self.lins[:-1]):
36 |             x = lin(x)
37 |             if self.batchnorm:
38 |                 x = self.bns[i](x)
39 |             x = F.relu(x)
40 |             x = F.dropout(x, p=self.dropout, training=self.training)
41 |         x = self.lins[-1](x)
42 |         return F.log_softmax(x, dim=-1)
43 |     
44 |     
45 | 
class MLPLinear(torch.nn.Module):
    """Single linear layer followed by a log-softmax.

    Args:
        in_channels: Width of the input features.
        out_channels: Number of output classes.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def reset_parameters(self):
        """Re-initialise the linear layer."""
        self.lin.reset_parameters()

    def forward(self, x):
        """Return log-probabilities over ``out_channels`` classes for each row of ``x``."""
        logits = self.lin(x)
        return logits.log_softmax(dim=-1)
56 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 第七届信也科技杯 RobertAckley 初赛代码
 2 | 这是第七届信也科技杯-欺诈用户风险识别RobertAckley的初赛代码。 测评AUC最终为0.83631,排名33。本代码主要参照比赛baseline代码 https://github.com/DGraphXinye/2022_finvcup_baseline   
 3 | 请在比赛网站上下载"初赛数据集.zip"文件，将zip文件中的"phase1_gdata.npz"放到路径'./xydata/raw'中。  
 4 | 
 5 | 
 6 | ## Environments
 7 | Implementing environment:  
 8 | - python = 3.7.6
 9 | - numpy = 1.21.2  
10 | - pytorch = 1.6.0  
11 | - torch_geometric = 1.7.2  
12 | - torch_scatter = 2.0.8  
13 | - torch_sparse = 0.6.9
14 | - networkx = 2.6.3
15 | - pandas = 1.3.5
16 | - scikit-learn = 1.0.2
17 | - lightgbm = 3.3.2
18 | - torchvision = 0.7.0
19 | - tqdm = 4.64.0
20 | 
21 | 详细见requirements.txt
22 | 
23 | - GPU: RTX A4000   
24 | 
25 | 
26 | ## Training
27 | 
28 | - **GraphSAGE (NeighborSampler)**
29 | ```bash
30 | python train_mini_batch.py --model sage_neighsampler --epochs 200 --device 0
31 | python inference_mini_batch.py --model sage_neighsampler --device 0
32 | ```
33 | 
34 | ## Results:
35 | 
36 | | Methods   | Valid AUC  | Test AUC  |
37 | |  :----  |  ---- | ---- |
38 | | GraphSAGE (NeighborSampler) embedding + features + LightGBM  | 0.8518 | **0.8363** |
39 | 
40 | ## 解决方案及算法方案
41 | 
42 | 本次比赛团队的解决方案主要包括三步：基于GraphSAGE的节点Embedding（与baseline一致），手工加入时序等特征，通过LightGBM分类
43 | 
44 | 1. 基于GraphSAGE的节点Embedding（与baseline一致）
45 | 
46 |    ​	基于baseline代码中GraphSAGE（NeighborSampler）的AUC最高，团队使用该网络对数据集中的节点进行embedding。网络训练与baseline中一致，修改的点为将17-128-2的GraphSAGE原模型修改为12-128-64-2的新模型，最后一层为64-2的线性层作为分类器。网络训练完成后，只取GraphSAGE的前两层，如此便将所有节点转换成了64维的向量。
47 | 
48 |    ​	值得注意的是，得到embedding时不再采用验证集（能使测试AUC上升0.003左右），因此inference_mini_batch.py中使用的是XYGraphP1_no_valid作为数据集类。
49 | 
50 | 2. 手工加入时序等特征
51 | 
52 |    最终每个节点为201维的特征向量（由以下特征向量直接拼接得到），其中：
53 | 
54 |    - 点的embedding特征向量，64维
55 |    - 节点本身的特征向量，17维
56 |    - 节点的相邻节点的类别的数量，有4类节点，测试节点为未知节点，算作0类和1类各一个，共4维。考虑到数据集为单向边，因此将节点出边入边分别分开计算，因此总共4x2为8维。
57 |    - 节点的边的类别，共有11类边，统计节点的边的不同类别的数量。同理出边入边分别分开统计，共11x2为22维。
58 |    - 节点的随时间的边的数量。将边的时序特征均分为45份，统计该节点45个时间段边的数量。同理出边入边分别分开统计，共45x2为90维。
59 | 
60 | 3. LightGBM分类
61 | 
62 |    ​	最终通过LightGBM分类器分类，参数设置具体见代码，1200个epoch的设置是由之前包含验证集的相同设置的实验推测而来。
63 | 
64 | ## 复现流程
65 | 
66 | 1. 首先通过python train_mini_batch.py --model sage_neighsampler --epochs 200 --device 0命令训练网络。
67 | 2. 网络Embedding，手工特征，LightGBM分类只需要运行python inference_mini_batch.py --model sage_neighsampler --device 0命令即可。
68 | 
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/utils/evaluator.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.metrics import roc_auc_score
 5 | try:
 6 |     import torch
 7 | except ImportError:
 8 |     torch = None   
 9 |     
10 | ### Evaluator for node property prediction
class Evaluator:
    '''
        Score node-classification predictions with accuracy or ROC-AUC.

        eval_metric: 'acc' or 'auc'; anything else raises ValueError.
    '''

    def __init__(self, eval_metric):
        if eval_metric not in ['acc', 'auc']:
            raise ValueError('eval_metric should be acc or auc')

        self.eval_metric = eval_metric

    @staticmethod
    def _as_label_vector(y_true):
        '''
            Flatten a (num_node, 1) label column into a (num_node,) vector.

            Without this, comparing a column against the 1-d predictions
            broadcasts to a (num_node, num_node) matrix and yields a
            meaningless score. Any other shape is returned untouched.
        '''
        if y_true.ndim == 2 and y_true.shape[1] == 1:
            return y_true.reshape(-1)
        return y_true

    def _check_input(self, y_true, y_pred):
        '''
            y_true: numpy ndarray or torch tensor of shape (num_node)
            y_pred: numpy ndarray or torch tensor of shape (num_node, num_tasks)

            Returns both inputs as numpy arrays.
            Raises RuntimeError if either input is neither an ndarray nor a
            torch tensor, or if y_pred is not 2-dimensional.
        '''

        # converting torch.Tensor to numpy on cpu
        if torch is not None and isinstance(y_true, torch.Tensor):
            y_true = y_true.detach().cpu().numpy()

        if torch is not None and isinstance(y_pred, torch.Tensor):
            y_pred = y_pred.detach().cpu().numpy()

        ## check type (both arguments, not y_true twice)
        if not (isinstance(y_true, np.ndarray) and isinstance(y_pred, np.ndarray)):
            raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor')

        if y_pred.ndim != 2:
            raise RuntimeError('y_pred must be a 2-dim array, {}-dim array given'.format(y_pred.ndim))

        return y_true, y_pred

    def eval(self, y_true, y_pred):
        '''
            Validate the inputs and return {'auc': value} or {'acc': value},
            depending on the metric chosen at construction time.
        '''
        if self.eval_metric == 'auc':
            y_true, y_pred = self._check_input(y_true, y_pred)
            return self._eval_rocauc(y_true, y_pred)
        if self.eval_metric == 'acc':
            y_true, y_pred = self._check_input(y_true, y_pred)
            return self._eval_acc(y_true, y_pred)

    def _eval_rocauc(self, y_true, y_pred):
        '''
            compute ROC-AUC

            With two prediction columns, column 1 is used as the score of
            the positive class. With more columns, the labels are one-hot
            encoded and scored against the full prediction matrix.
        '''
        y_true = self._as_label_vector(y_true)

        if y_pred.shape[1] == 2:
            auc = roc_auc_score(y_true, y_pred[:, 1])
        else:
            onehot_code = np.eye(y_pred.shape[1])
            y_true_onehot = onehot_code[y_true]
            auc = roc_auc_score(y_true_onehot, y_pred)

        return {'auc': auc}

    def _eval_acc(self, y_true, y_pred):
        '''
            Fraction of nodes whose arg-max prediction equals the label.
        '''
        y_true = self._as_label_vector(y_true)
        y_pred = y_pred.argmax(axis=-1)

        correct = y_true == y_pred
        acc = float(np.sum(correct))/len(correct)

        return {'acc': acc}
70 | 
71 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
 2 | certifi @ file:///opt/conda/conda-bld/certifi_1655968806487/work/certifi
 3 | charset-normalizer==2.1.0
 4 | cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work
 5 | debugpy @ file:///tmp/build/80754af9/debugpy_1637091426235/work
 6 | decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
 7 | entrypoints @ file:///tmp/build/80754af9/entrypoints_1649908585034/work
 8 | fonttools==4.25.0
 9 | googledrivedownloader==0.4
10 | idna==3.3
11 | importlib-metadata==4.12.0
12 | ipykernel @ file:///tmp/build/80754af9/ipykernel_1646983213839/work/dist/ipykernel-6.9.1-py3-none-any.whl
13 | ipython @ file:///tmp/build/80754af9/ipython_1643818144432/work
14 | isodate==0.6.1
15 | jedi @ file:///tmp/build/80754af9/jedi_1644299024593/work
16 | Jinja2==3.1.2
17 | joblib==1.1.0
18 | jupyter-client @ file:///opt/conda/conda-bld/jupyter_client_1650622202839/work
19 | jupyter-core @ file:///opt/conda/conda-bld/jupyter_core_1651671229925/work
20 | kiwisolver @ file:///opt/conda/conda-bld/kiwisolver_1653292039266/work
21 | lightgbm==3.3.2
22 | MarkupSafe==2.1.1
23 | matplotlib @ file:///tmp/build/80754af9/matplotlib-suite_1647441664166/work
24 | matplotlib-inline @ file:///tmp/build/80754af9/matplotlib-inline_1628242447089/work
25 | mkl-fft==1.3.1
26 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626179032232/work
27 | mkl-service==2.4.0
28 | munkres==1.1.4
29 | nest-asyncio @ file:///tmp/build/80754af9/nest-asyncio_1649847911453/work
30 | networkx==2.6.3
31 | numpy @ file:///opt/conda/conda-bld/numpy_and_numpy_base_1653915516269/work
32 | packaging @ file:///tmp/build/80754af9/packaging_1637314298585/work
33 | pandas==1.3.5
34 | parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
35 | pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
36 | pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
37 | Pillow==9.0.1
38 | prompt-toolkit @ file:///tmp/build/80754af9/prompt-toolkit_1633440160888/work
39 | ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
40 | Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work
41 | pyparsing==3.0.9
42 | python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
43 | python-louvain==0.16
44 | pytz==2022.1
45 | pyzmq @ file:///tmp/build/80754af9/pyzmq_1638434924971/work
46 | rdflib==6.1.1
47 | requests==2.28.1
48 | scikit-learn==1.0.2
49 | scipy==1.7.3
50 | six @ file:///tmp/build/80754af9/six_1644875935023/work
51 | threadpoolctl==3.1.0
52 | torch==1.6.0
53 | torch-geometric==1.7.2
54 | torch-scatter==2.0.8
55 | torch-sparse==0.6.9
56 | torch-spline-conv==1.2.1
57 | torchvision==0.7.0
58 | tornado @ file:///tmp/build/80754af9/tornado_1606942283357/work
59 | tqdm==4.64.0
60 | traitlets @ file:///tmp/build/80754af9/traitlets_1636710298902/work
61 | typing_extensions==4.3.0
62 | urllib3==1.26.9
63 | wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
64 | zipp==3.8.0
65 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | xydata/processed
  2 | tmp.ipynb
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | cover/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | .pybuilder/
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | #   For a library or package, you might want to ignore these files since the code is
 90 | #   intended to run in multiple environments; otherwise, check them in:
 91 | # .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # poetry
101 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
103 | #   commonly ignored for libraries.
104 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 | 
107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
108 | __pypackages__/
109 | 
110 | # Celery stuff
111 | celerybeat-schedule
112 | celerybeat.pid
113 | 
114 | # SageMath parsed files
115 | *.sage.py
116 | 
117 | # Environments
118 | .env
119 | .venv
120 | env/
121 | venv/
122 | ENV/
123 | env.bak/
124 | venv.bak/
125 | 
126 | # Spyder project settings
127 | .spyderproject
128 | .spyproject
129 | 
130 | # Rope project settings
131 | .ropeproject
132 | 
133 | # mkdocs documentation
134 | /site
135 | 
136 | # mypy
137 | .mypy_cache/
138 | .dmypy.json
139 | dmypy.json
140 | 
141 | # Pyre type checker
142 | .pyre/
143 | 
144 | # pytype static type analyzer
145 | .pytype/
146 | 
147 | # Cython debug symbols
148 | cython_debug/
149 | 
150 | # PyCharm
151 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
154 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
155 | #.idea/
156 | 


--------------------------------------------------------------------------------
/utils/xygraph.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Callable, List
 2 | import os.path as osp
 3 | 
 4 | import numpy as np
 5 | import torch
 6 | from torch_geometric.data import InMemoryDataset
 7 | from torch_geometric.data import Data
 8 | 
 9 | 
def read_xygraphp1(folder):
    """Load ``phase1_gdata.npz`` from *folder* into a ``Data`` object with a 60/40 split.

    The archive's ``train_mask`` array is shuffled (global NumPy seed fixed
    to 42) and cut into a training part (first 60%) and a validation part
    (the rest). ``test_mask`` is taken from the archive unchanged.

    Note that this reseeds NumPy's global random state as a side effect.

    Args:
        folder: Directory containing ``phase1_gdata.npz``.

    Returns:
        A ``Data`` object with ``x`` (float), ``y`` (int64, one column),
        ``edge_index`` (int64, transposed from the stored layout),
        ``edge_attr`` (the stored ``edge_type`` as float) and the three
        int64 tensors ``train_mask``, ``valid_mask`` and ``test_mask``.
    """
    print('read_xygraphp1')

    # Pull every array out while the archive is open, then let the context
    # manager close the underlying file instead of leaking the handle.
    with np.load(osp.join(folder, 'phase1_gdata.npz')) as npz:
        x = npz['x']
        y = npz['y'].reshape(-1, 1)
        edge_index = npz['edge_index']
        edge_type = npz['edge_type']
        train_mask_t = npz['train_mask']
        test_mask = npz['test_mask']

    # Fixed seed so the train/valid split is identical on every run.
    # NOTE(review): the masks look like arrays of node indices rather than
    # boolean masks (they are shuffled and sliced) -- confirm against the data.
    np.random.seed(42)
    np.random.shuffle(train_mask_t)
    # Same expression as before on purpose: changing the arithmetic could
    # move the boundary and alter the split.
    split = int(len(train_mask_t)/10*6)
    train_mask = train_mask_t[:split]
    valid_mask = train_mask_t[split:]

    x = torch.tensor(x, dtype=torch.float).contiguous()
    y = torch.tensor(y, dtype=torch.int64)
    edge_index = torch.tensor(edge_index.transpose(),
                              dtype=torch.int64).contiguous()
    edge_type = torch.tensor(edge_type, dtype=torch.float)
    train_mask = torch.tensor(train_mask, dtype=torch.int64)
    valid_mask = torch.tensor(valid_mask, dtype=torch.int64)
    test_mask = torch.tensor(test_mask, dtype=torch.int64)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_type, y=y)
    data.train_mask = train_mask
    data.valid_mask = valid_mask
    data.test_mask = test_mask

    return data
41 | 
42 | 
class XYGraphP1(InMemoryDataset):
    r"""In-memory dataset built from ``phase1_gdata.npz`` with a train/valid split.

    Raw files are looked up in ``<root>/<name>/raw`` and the processed
    result is stored as ``<root>/<name>/processed/data.pt``.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The name of the dataset (:obj:`"xygraphp1"`).
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """

    url = ''

    def __init__(self, root: str, name: str,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        # Set before the base initialiser runs: the directory properties
        # below read it.
        self.name = name
        super().__init__(root, transform, pre_transform)
        loaded = torch.load(self.processed_paths[0])
        self.data, self.slices = loaded

    @property
    def raw_dir(self) -> str:
        """Directory holding the raw ``.npz`` archive."""
        parts = (self.root, self.name, 'raw')
        return osp.join(*parts)

    @property
    def processed_dir(self) -> str:
        """Directory holding the processed ``data.pt`` file."""
        parts = (self.root, self.name, 'processed')
        return osp.join(*parts)

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the raw files expected inside ``raw_dir``."""
        return ['phase1_gdata.npz']

    @property
    def processed_file_names(self) -> str:
        """Name of the processed file inside ``processed_dir``."""
        return 'data.pt'

    def download(self):
        """Do nothing: the raw archive has to be placed in ``raw_dir`` by hand."""

    def process(self):
        """Read the raw archive, apply ``pre_transform`` if set, and save the result."""
        graph = read_xygraphp1(self.raw_dir)
        if self.pre_transform is not None:
            graph = self.pre_transform(graph)
        torch.save(self.collate([graph]), self.processed_paths[0])

    def __repr__(self) -> str:
        return '{}()'.format(self.name)
97 | 


--------------------------------------------------------------------------------
/utils/xygraph_no_valid.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional, Callable, List
 2 | import os.path as osp
 3 | 
 4 | import numpy as np
 5 | import torch
 6 | from torch_geometric.data import InMemoryDataset
 7 | from torch_geometric.data import Data
 8 | 
 9 | 
def read_xygraphp1(folder):
    """Load ``phase1_gdata.npz`` from *folder* into a ``Data`` object with no validation split.

    The archive's whole ``train_mask`` array is kept as the training set
    after being shuffled (global NumPy seed fixed to 42); no ``valid_mask``
    attribute is set. ``test_mask`` is taken from the archive unchanged.

    Note that this reseeds NumPy's global random state as a side effect.

    Args:
        folder: Directory containing ``phase1_gdata.npz``.

    Returns:
        A ``Data`` object with ``x`` (float), ``y`` (int64, one column),
        ``edge_index`` (int64, transposed from the stored layout),
        ``edge_attr`` (the stored ``edge_type`` as float), ``train_mask``
        (float) and ``test_mask`` (int64).
    """
    print('read_xygraphp_no_valid')

    # Pull every array out while the archive is open, then let the context
    # manager close the underlying file instead of leaking the handle.
    with np.load(osp.join(folder, 'phase1_gdata.npz')) as npz:
        x = npz['x']
        y = npz['y'].reshape(-1, 1)
        edge_index = npz['edge_index']
        edge_type = npz['edge_type']
        train_mask_t = npz['train_mask']
        test_mask = npz['test_mask']

    # Fixed seed so the ordering of the training entries is reproducible.
    np.random.seed(42)
    np.random.shuffle(train_mask_t)

    x = torch.tensor(x, dtype=torch.float).contiguous()
    y = torch.tensor(y, dtype=torch.int64)
    edge_index = torch.tensor(edge_index.transpose(),
                              dtype=torch.int64).contiguous()
    edge_type = torch.tensor(edge_type, dtype=torch.float)
    # NOTE(review): the sibling loader in xygraph.py stores its masks as
    # int64; float is kept here because callers are not visible and may
    # depend on it. A float tensor cannot index another tensor directly --
    # confirm whether this dtype is intended.
    train_mask_t = torch.tensor(train_mask_t, dtype=torch.float)
    test_mask = torch.tensor(test_mask, dtype=torch.int64)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_type, y=y)
    data.train_mask = train_mask_t
    data.test_mask = test_mask

    return data
43 | 
44 | 
class XYGraphP1_no_valid(InMemoryDataset):
    r"""In-memory dataset built from ``phase1_gdata.npz`` without a validation split.

    The processed result is stored as
    ``<root>/<name>/processed_no_valid/data.pt``, separate from the cache of
    the dataset variant that has a validation split.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The name of the dataset (:obj:`"xygraphp1"`).
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """

    url = ''

    def __init__(self, root: str, name: str,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):

        # Set before the base initialiser runs: the directory properties
        # below read it.
        self.name = name
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    def _has_raw_files(self, directory: str) -> bool:
        """Return True when every expected raw file exists in *directory*."""
        return all(osp.exists(osp.join(directory, fname))
                   for fname in self.raw_file_names)

    @property
    def raw_dir(self) -> str:
        """Directory the raw archive is read from.

        ``<root>/<name>/raw_no_valid`` is used whenever it contains the
        archive. Otherwise, if the archive is present in the shared
        ``<root>/<name>/raw`` folder (the only location the README tells
        users to populate), that folder is used so the archive does not
        have to be copied. When neither holds the archive, the
        ``raw_no_valid`` path is returned as before.
        """
        base = osp.join(self.root, self.name)
        dedicated = osp.join(base, 'raw_no_valid')
        shared = osp.join(base, 'raw')
        if not self._has_raw_files(dedicated) and self._has_raw_files(shared):
            return shared
        return dedicated

    @property
    def processed_dir(self) -> str:
        """Directory holding the processed ``data.pt`` file."""
        return osp.join(self.root, self.name, 'processed_no_valid')

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the raw files expected inside ``raw_dir``."""
        names = ['phase1_gdata.npz']
        return names

    @property
    def processed_file_names(self) -> str:
        """Name of the processed file inside ``processed_dir``."""
        return 'data.pt'

    def download(self):
        """Do nothing: the raw archive has to be placed on disk by hand."""
        pass

    def process(self):
        """Read the raw archive, apply ``pre_transform`` if set, and save the result."""
        data = read_xygraphp1(self.raw_dir)
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])

    def __repr__(self) -> str:
        return f'{self.name}()'
99 | 


--------------------------------------------------------------------------------
/models/gat.py:
--------------------------------------------------------------------------------
  1 | from typing import Union
  2 | 
  3 | from torch import Tensor
  4 | from torch_sparse import SparseTensor
  5 | import torch
  6 | import torch.nn.functional as F
  7 | from torch_geometric.nn import GATConv, GATv2Conv
  8 | 
  9 | class GAT(torch.nn.Module):
 10 |     def __init__(self
 11 |                  , in_channels
 12 |                  , hidden_channels
 13 |                  , out_channels
 14 |                  , num_layers
 15 |                  , dropout
 16 |                  , layer_heads = []
 17 |                  , batchnorm=True):
 18 |         super(GAT, self).__init__()
 19 | 
 20 |         self.convs = torch.nn.ModuleList()
 21 |         self.convs.append(GATConv(in_channels, hidden_channels, heads=layer_heads[0], concat=True))
 22 |         self.bns = torch.nn.ModuleList()
 23 |         self.batchnorm = batchnorm
 24 |         if self.batchnorm:
 25 |             self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0]))
 26 |         for _ in range(num_layers - 2):
 27 |             self.convs.append(GATConv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True))
 28 |             if self.batchnorm:
 29 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1]))
 30 |         self.convs.append(GATConv(hidden_channels*layer_heads[num_layers-2]
 31 |                           , out_channels
 32 |                           , heads=layer_heads[num_layers-1]
 33 |                           , concat=False))
 34 | 
 35 |         self.dropout = dropout
 36 | 
 37 |     def reset_parameters(self):
 38 |         for conv in self.convs:
 39 |             conv.reset_parameters()
 40 |         if self.batchnorm:
 41 |             for bn in self.bns:
 42 |                 bn.reset_parameters()
 43 | 
 44 |     def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
 45 |         for i, conv in enumerate(self.convs[:-1]):
 46 |             x = conv(x, edge_index)
 47 |             if self.batchnorm: 
 48 |                 x = self.bns[i](x)
 49 |             x = F.relu(x)
 50 |             x = F.dropout(x, p=self.dropout, training=self.training)
 51 |         x = self.convs[-1](x, edge_index)
 52 |         return x.log_softmax(dim=-1)
 53 |     
 54 |     
 55 |     
 56 |     
 57 | class GATv2(torch.nn.Module):
 58 |     def __init__(self
 59 |                  , in_channels
 60 |                  , hidden_channels
 61 |                  , out_channels
 62 |                  , num_layers
 63 |                  , dropout
 64 |                  , layer_heads = []
 65 |                  , batchnorm=True):
 66 |         super(GATv2, self).__init__()
 67 | 
 68 |         self.convs = torch.nn.ModuleList()
 69 |         self.convs.append(GATv2Conv(in_channels, hidden_channels, heads=layer_heads[0], concat=True))
 70 |         self.bns = torch.nn.ModuleList()
 71 |         self.batchnorm = batchnorm
 72 |         if self.batchnorm:
 73 |             self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0]))
 74 |         for _ in range(num_layers - 2):
 75 |             self.convs.append(GATv2Conv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True))
 76 |             if self.batchnorm:
 77 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1]))
 78 |         self.convs.append(GATv2Conv(hidden_channels*layer_heads[num_layers-2]
 79 |                           , out_channels
 80 |                           , heads=layer_heads[num_layers-1]
 81 |                           , concat=False))
 82 | 
 83 |         self.dropout = dropout
 84 | 
 85 |     def reset_parameters(self):
 86 |         for conv in self.convs:
 87 |             conv.reset_parameters()
 88 |         if self.batchnorm:
 89 |             for bn in self.bns:
 90 |                 bn.reset_parameters()
 91 | 
 92 |     def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
 93 |         for i, conv in enumerate(self.convs[:-1]):
 94 |             x = conv(x, edge_index)
 95 |             if self.batchnorm: 
 96 |                 x = self.bns[i](x)
 97 |             x = F.relu(x)
 98 |             x = F.dropout(x, p=self.dropout, training=self.training)
 99 |         x = self.convs[-1](x, edge_index)
100 |         return x.log_softmax(dim=-1)


--------------------------------------------------------------------------------
/models/sage_neighsampler.py:
--------------------------------------------------------------------------------
  1 | from typing import Union
  2 | 
  3 | from torch import Tensor
  4 | from torch_sparse import SparseTensor
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | from torch_geometric.nn import SAGEConv
  9 | from tqdm import tqdm
 10 | 
 11 | 
 12 | class SAGE_NeighSampler(torch.nn.Module):
 13 |     def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout, batchnorm=True):
 14 |         super(SAGE_NeighSampler, self).__init__()
 15 | 
 16 |         self.convs = torch.nn.ModuleList()
 17 |         self.convs.append(SAGEConv(in_channels, hidden_channels))
 18 |         self.bns = torch.nn.ModuleList()
 19 |         self.batchnorm = batchnorm
 20 |         self.num_layers = num_layers
 21 |         if self.batchnorm:
 22 |             self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
 23 |         for i in range(num_layers - 2):
 24 |             self.convs.append(SAGEConv(hidden_channels, hidden_channels))
 25 |             if self.batchnorm:
 26 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
 27 |         self.convs.append(SAGEConv(hidden_channels, int(hidden_channels/2)))
 28 | 
 29 |         self.dropout = dropout
 30 | 
 31 |         self.fc = nn.Linear(int(hidden_channels/2), out_channels)
 32 | 
 33 |     def reset_parameters(self):
 34 |         for conv in self.convs:
 35 |             conv.reset_parameters()
 36 |         if self.batchnorm:
 37 |             for bn in self.bns:
 38 |                 bn.reset_parameters()
 39 | 
 40 |     def forward(self, x, adjs):
 41 |         for i, (edge_index, _, size) in enumerate(adjs):
 42 |             x_target = x[:size[1]]
 43 |             x = self.convs[i]((x, x_target), edge_index)
 44 |             if i != self.num_layers-1:
 45 |                 if self.batchnorm:
 46 |                     x = self.bns[i](x)
 47 |                 x = F.relu(x)
 48 |                 x = F.dropout(x, p=0.5, training=self.training)
 49 | 
 50 |         x = self.fc(x)
 51 | 
 52 |         return x.log_softmax(dim=-1)
 53 | 
 54 |     '''
 55 |     subgraph_loader: size = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1],
 56 |                                   batch_size=**, shuffle=False,
 57 |                                   num_workers=12)
 58 |     You can also sample the complete k-hop neighborhood, but this is rather expensive (especially for Reddit). 
 59 |     We apply here trick here to compute the node embeddings efficiently: 
 60 |        Instead of sampling multiple layers for a mini-batch, we instead compute the node embeddings layer-wise. 
 61 |        Doing this exactly k times mimics a k-layer GNN.  
 62 |     '''
 63 | 
 64 |     def inference_all(self, data):
 65 |         x, adj_t = data.x, data.adj_t
 66 |         for i, conv in enumerate(self.convs[:-1]):
 67 |             x = conv(x, adj_t)
 68 |             if self.batchnorm:
 69 |                 x = self.bns[i](x)
 70 |             x = F.relu(x)
 71 |             x = F.dropout(x, p=self.dropout, training=self.training)
 72 |         x = self.convs[-1](x, adj_t)
 73 | 
 74 |         x = self.fc(x)
 75 | 
 76 |         return x.log_softmax(dim=-1)
 77 | 
 78 |     def inference(self, x_all, layer_loader, device):
 79 |         pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80)
 80 |         pbar.set_description('Evaluating')
 81 | 
 82 |         # Compute representations of nodes layer by layer, using *all*
 83 |         # available edges. This leads to faster computation in contrast to
 84 |         # immediately computing the final representations of each batch.
 85 |         for i in range(self.num_layers):
 86 |             xs = []
 87 |             for batch_size, n_id, adj in layer_loader:
 88 |                 edge_index, _, size = adj.to(device)
 89 |                 x = x_all[n_id].to(device)
 90 |                 x_target = x[:size[1]]
 91 |                 x = self.convs[i]((x, x_target), edge_index)
 92 |                 if i != self.num_layers - 1:
 93 |                     x = F.relu(x)
 94 |                     if self.batchnorm:
 95 |                         x = self.bns[i](x)
 96 |                 xs.append(x)
 97 | 
 98 |                 pbar.update(batch_size)
 99 | 
100 |             x_all = torch.cat(xs, dim=0)
101 | 
102 |         x_all = self.fc(x_all)
103 | 
104 |         pbar.close()
105 | 
106 |         return x_all.log_softmax(dim=-1)
107 | 
108 |     def to_embedding(self, x_all, layer_loader, device):
109 |         pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80)
110 |         pbar.set_description('Evaluating')
111 | 
112 |         for i in range(self.num_layers):
113 |             xs = []
114 |             for batch_size, n_id, adj in layer_loader:
115 |                 edge_index, _, size = adj.to(device)
116 |                 x = x_all[n_id].to(device)
117 |                 x_target = x[:size[1]]
118 |                 x = self.convs[i]((x, x_target), edge_index)
119 |                 if i != self.num_layers - 1:
120 |                     x = F.relu(x)
121 |                     if self.batchnorm:
122 |                         x = self.bns[i](x)
123 |                 xs.append(x)
124 | 
125 |                 pbar.update(batch_size)
126 | 
127 |             x_all = torch.cat(xs, dim=0)
128 | 
129 |         pbar.close()
130 | 
131 |         return x_all
132 | 


--------------------------------------------------------------------------------
/train_mini_batch.py:
--------------------------------------------------------------------------------
  1 | # dataset name: XYGraphP1
  2 | 
  3 | from utils import XYGraphP1
  4 | from utils.utils import prepare_folder
  5 | from utils.evaluator import Evaluator
  6 | from torch_geometric.data import NeighborSampler
  7 | from models import SAGE_NeighSampler, GAT_NeighSampler, GATv2_NeighSampler
  8 | from tqdm import tqdm
  9 | 
 10 | import argparse
 11 | 
 12 | import torch
 13 | import torch.nn.functional as F
 14 | import torch.nn as nn
 15 | 
 16 | import torch_geometric.transforms as T
 17 | from torch_sparse import SparseTensor
 18 | from torch_geometric.utils import to_undirected
 19 | import pandas as pd
 20 | 
 21 | eval_metric = 'auc'
 22 | 
 23 | sage_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7
 24 |                                 }
 25 | 
 26 | gat_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7, 'layer_heads': [4, 1]
 27 |                                }
 28 | 
 29 | gatv2_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-6, 'layer_heads': [4, 1]
 30 |                                  }
 31 | 
 32 | 
 33 | def train(epoch, train_loader, model, data, train_idx, optimizer, device, no_conv=False):
 34 |     model.train()
 35 | 
 36 |     pbar = tqdm(total=train_idx.size(0), ncols=80)
 37 |     pbar.set_description(f'Epoch {epoch:02d}')
 38 | 
 39 |     total_loss = total_correct = 0
 40 |     for batch_size, n_id, adjs in train_loader:
 41 |         # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
 42 |         adjs = [adj.to(device) for adj in adjs]
 43 | 
 44 |         optimizer.zero_grad()
 45 |         out = model(data.x[n_id], adjs)
 46 |         loss = F.nll_loss(out, data.y[n_id[:batch_size]])
 47 |         loss.backward()
 48 |         optimizer.step()
 49 | 
 50 |         total_loss += float(loss)
 51 |         pbar.update(batch_size)
 52 | 
 53 |     pbar.close()
 54 |     loss = total_loss / len(train_loader)
 55 | 
 56 |     return loss
 57 | 
 58 | 
 59 | @torch.no_grad()
 60 | def test(layer_loader, model, data, split_idx, device, no_conv=False):
 61 |     # data.y is labels of shape (N, )
 62 |     model.eval()
 63 | 
 64 |     out = model.inference(data.x, layer_loader, device)
 65 | #     out = model.inference_all(data)
 66 |     y_pred = out.exp()  # (N,num_classes)
 67 | 
 68 |     losses = dict()
 69 |     for key in ['train', 'valid', 'test']:
 70 |         node_id = split_idx[key]
 71 |         node_id = node_id.to(device)
 72 |         losses[key] = F.nll_loss(out[node_id], data.y[node_id]).item()
 73 | 
 74 |     return losses, y_pred
 75 | 
 76 | 
 77 | def main():
 78 |     parser = argparse.ArgumentParser(description='minibatch_gnn_models')
 79 |     parser.add_argument('--device', type=int, default=0)
 80 |     parser.add_argument('--dataset', type=str, default='XYGraphP1')
 81 |     parser.add_argument('--log_steps', type=int, default=10)
 82 |     parser.add_argument('--model', type=str, default='mlp')
 83 |     parser.add_argument('--epochs', type=int, default=100)
 84 | 
 85 |     args = parser.parse_args()
 86 |     print(args)
 87 | 
 88 |     no_conv = False
 89 |     if args.model in ['mlp']:
 90 |         no_conv = True
 91 | 
 92 |     device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
 93 |     device = torch.device(device)
 94 | 
 95 |     dataset = XYGraphP1(root='./', name='xydata', transform=T.ToSparseTensor())
 96 | 
 97 |     nlabels = dataset.num_classes
 98 |     if args.dataset == 'XYGraphP1':
 99 |         nlabels = 2
100 | 
101 |     data = dataset[0]
102 |     data.adj_t = data.adj_t.to_symmetric()
103 | 
104 |     if args.dataset in ['XYGraphP1']:
105 |         x = data.x
106 |         x = (x-x.mean(0))/x.std(0)
107 |         data.x = x
108 |     if data.y.dim() == 2:
109 |         data.y = data.y.squeeze(1)
110 | 
111 |     split_idx = {'train': data.train_mask,
112 |                  'valid': data.valid_mask, 'test': data.test_mask}
113 | 
114 |     data = data.to(device)
115 |     train_idx = split_idx['train'].to(device)
116 | 
117 |     model_dir = prepare_folder(args.dataset, args.model)
118 |     print('model_dir:', model_dir)
119 | 
120 |     train_loader = NeighborSampler(data.adj_t, node_idx=train_idx, sizes=[
121 |                                    10, 5], batch_size=1024, shuffle=True, num_workers=12)
122 |     layer_loader = NeighborSampler(
123 |         data.adj_t, node_idx=None, sizes=[-1], batch_size=4096, shuffle=False, num_workers=12)
124 | 
125 |     if args.model == 'sage_neighsampler':
126 |         para_dict = sage_neighsampler_parameters
127 |         model_para = sage_neighsampler_parameters.copy()
128 |         model_para.pop('lr')
129 |         model_para.pop('l2')
130 |         model = SAGE_NeighSampler(
131 |             in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
132 |     if args.model == 'gat_neighsampler':
133 |         para_dict = gat_neighsampler_parameters
134 |         model_para = gat_neighsampler_parameters.copy()
135 |         model_para.pop('lr')
136 |         model_para.pop('l2')
137 |         model = GAT_NeighSampler(
138 |             in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
139 |     if args.model == 'gatv2_neighsampler':
140 |         para_dict = gatv2_neighsampler_parameters
141 |         model_para = gatv2_neighsampler_parameters.copy()
142 |         model_para.pop('lr')
143 |         model_para.pop('l2')
144 |         model = GATv2_NeighSampler(
145 |             in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
146 | 
147 |     print(f'Model {args.model} initialized')
148 | 
149 |     model.reset_parameters()
150 |     optimizer = torch.optim.Adam(
151 |         model.parameters(), lr=para_dict['lr'], weight_decay=para_dict['l2'])
152 |     min_valid_loss = 1e8
153 | 
154 |     for epoch in range(1, args.epochs+1):
155 |         loss = train(epoch, train_loader, model, data,
156 |                      train_idx, optimizer, device, no_conv)
157 |         losses, out = test(layer_loader, model, data,
158 |                            split_idx, device, no_conv)
159 |         train_loss, valid_loss, test_loss = losses['train'], losses['valid'], losses['test']
160 | 
161 |         if valid_loss < min_valid_loss:
162 |             min_valid_loss = valid_loss
163 |             torch.save(model.state_dict(), model_dir+'model.pt')
164 | 
165 |         if epoch % args.log_steps == 0:
166 |             print(f'Epoch: {epoch:02d}, '
167 |                   f'Loss: {loss:.4f}, '
168 |                   f'Train: {100 * train_loss:.3f}%, '
169 |                   f'Valid: {100 * valid_loss:.3f}% '
170 |                   f'Test: {100 * test_loss:.3f}%')
171 | 
172 | 
# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
175 | 


--------------------------------------------------------------------------------
/inference_mini_batch.py:
--------------------------------------------------------------------------------
  1 | # dataset name: XYGraphP1_no_valid
  2 | 
  3 | import pickle
  4 | from utils import XYGraphP1_no_valid
  5 | from utils.utils import prepare_folder
  6 | from utils.evaluator import Evaluator
  7 | from torch_geometric.data import NeighborSampler
  8 | from models import SAGE_NeighSampler, GAT_NeighSampler, GATv2_NeighSampler
  9 | from tqdm import tqdm
 10 | 
 11 | import argparse
 12 | 
 13 | import torch
 14 | import torch.nn.functional as F
 15 | import torch.nn as nn
 16 | 
 17 | import torch_geometric.transforms as T
 18 | from torch_sparse import SparseTensor
 19 | from torch_geometric.utils import to_undirected
 20 | import pandas as pd
 21 | import numpy as np
 22 | 
 23 | 
 24 | from lightgbm import LGBMClassifier
 25 | from sklearn.metrics import accuracy_score
 26 | from sklearn.model_selection import GridSearchCV
 27 | import joblib
 28 | 
 29 | 
 30 | sage_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7
 31 |                                 }
 32 | 
 33 | gat_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-7, 'layer_heads': [4, 1]
 34 |                                }
 35 | 
 36 | gatv2_neighsampler_parameters = {'lr': 0.003, 'num_layers': 2, 'hidden_channels': 128, 'dropout': 0.0, 'batchnorm': False, 'l2': 5e-6, 'layer_heads': [4, 1]
 37 |                                  }
 38 | 
 39 | 
 40 | @torch.no_grad()
 41 | def to_embedding(layer_loader, model, data, device, no_conv=False):
 42 |     # data.y is labels of shape (N, )
 43 |     model.eval()
 44 | 
 45 |     out = model.to_embedding(data.x, layer_loader, device)
 46 |     print("Model embedding data : ", out.shape)
 47 | 
 48 |     return out
 49 | 
 50 | 
 51 | def main():
 52 |     parser = argparse.ArgumentParser(description='minibatch_gnn_models')
 53 |     parser.add_argument('--device', type=int, default=0)
 54 |     parser.add_argument('--dataset', type=str, default='XYGraphP1')
 55 |     parser.add_argument('--log_steps', type=int, default=10)
 56 |     parser.add_argument('--model', type=str, default='mlp')
 57 |     parser.add_argument('--epochs', type=int, default=100)
 58 | 
 59 |     args = parser.parse_args()
 60 |     print(args)
 61 | 
 62 |     no_conv = False
 63 |     if args.model in ['mlp']:
 64 |         no_conv = True
 65 | 
 66 |     device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
 67 |     device = torch.device(device)
 68 | 
 69 |     dataset = XYGraphP1_no_valid(
 70 |         root='./', name='xydata', transform=T.ToSparseTensor())
 71 | 
 72 |     nlabels = dataset.num_classes
 73 |     if args.dataset == 'XYGraphP1':
 74 |         nlabels = 2
 75 | 
 76 |     data = dataset[0]
 77 |     data.adj_t = data.adj_t.to_symmetric()
 78 | 
 79 |     if args.dataset in ['XYGraphP1']:
 80 |         x = data.x
 81 |         x = (x-x.mean(0))/x.std(0)
 82 |         data.x = x
 83 |     if data.y.dim() == 2:
 84 |         data.y = data.y.squeeze(1)
 85 | 
 86 |     data = data.to(device)
 87 | 
 88 |     layer_loader = NeighborSampler(
 89 |         data.adj_t, node_idx=None, sizes=[-1], batch_size=4096, shuffle=False, num_workers=12)
 90 | 
 91 |     if args.model == 'sage_neighsampler':
 92 |         para_dict = sage_neighsampler_parameters
 93 |         model_para = sage_neighsampler_parameters.copy()
 94 |         model_para.pop('lr')
 95 |         model_para.pop('l2')
 96 |         model = SAGE_NeighSampler(
 97 |             in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
 98 |     if args.model == 'gat_neighsampler':
 99 |         para_dict = gat_neighsampler_parameters
100 |         model_para = gat_neighsampler_parameters.copy()
101 |         model_para.pop('lr')
102 |         model_para.pop('l2')
103 |         model = GAT_NeighSampler(
104 |             in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
105 |     if args.model == 'gatv2_neighsampler':
106 |         para_dict = gatv2_neighsampler_parameters
107 |         model_para = gatv2_neighsampler_parameters.copy()
108 |         model_para.pop('lr')
109 |         model_para.pop('l2')
110 |         model = GATv2_NeighSampler(
111 |             in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
112 | 
113 |     print(f'Model {args.model} initialized')
114 | 
115 |     model_file = './model_files/{}/{}/model.pt'.format(
116 |         args.dataset, args.model)
117 |     print('model_file:', model_file)
118 |     model.load_state_dict(torch.load(model_file))
119 | 
120 |     out = to_embedding(layer_loader, model, data, device, no_conv)
121 | 
122 |     embedding_train, embedding_test = out[data.train_mask], out[data.test_mask]
123 |     y_train = data.y[data.train_mask]
124 | 
125 |     print(embedding_train.shape)
126 |     print(y_train.shape)
127 |     print(embedding_test.shape)
128 | 
129 |     with open("xydata/embedding/64_train_embedding_new.pickle", 'wb') as file:
130 |         pickle.dump(embedding_train, file)
131 |     file.close()
132 | 
133 |     with open("xydata/embedding/64_embedding_test_new.pickle", 'wb') as file:
134 |         pickle.dump(embedding_test, file)
135 |     file.close()
136 | 
137 |     with open("xydata/embedding/64_y_train_new.pickle", 'wb') as file:
138 |         pickle.dump(y_train, file)
139 |     file.close()
140 | 
141 |     with open('xydata/embedding/64_embedding_test_new.pickle', 'rb') as file:
142 |         x_test_all = pickle.load(file)
143 |         file.close()
144 |     with open('xydata/embedding/64_train_embedding_new.pickle', 'rb') as file:
145 |         x_train_all = pickle.load(file)
146 |         file.close()
147 |     with open('xydata/embedding/64_y_train_new.pickle', 'rb') as file:
148 |         y_train_all = pickle.load(file)
149 |         file.close()
150 | 
151 |     items = np.load('xydata/raw/phase1_gdata.npz')
152 |     x = items['x']
153 |     y = items['y'].reshape(-1, 1)
154 |     edge_index = items['edge_index']
155 |     edge_type = items['edge_type']
156 |     np.random.seed(42)
157 |     train_mask_t = items['train_mask']
158 |     np.random.shuffle(train_mask_t)
159 |     test_mask = items['test_mask']
160 | 
161 |     x_train_add = torch.tensor(x[train_mask_t], dtype=torch.float).contiguous()
162 |     x_test_add = torch.tensor(x[test_mask], dtype=torch.float).contiguous()
163 | 
164 |     set_train_mask_t = set(list(train_mask_t))
165 |     set_test_mask_t = set(list(test_mask))
166 |     set_used = set_train_mask_t | set_test_mask_t
167 | 
168 |     x_edge_add = np.zeros([x.shape[0], 22])
169 | 
170 |     edge_type = edge_type-1
171 | 
172 |     x_time_add = np.zeros([x.shape[0], 90])
173 |     edge_timestamp = items['edge_timestamp']
174 |     edge_timestamp = (edge_timestamp.astype(np.int32) / 13).astype(np.int32)
175 | 
176 |     x_pointLabel_add = np.zeros([x.shape[0], 8])
177 | 
178 |     for i in tqdm(range(len(edge_index))):
179 |         if edge_index[i][0] in set_used or edge_index[i][1] in set_used:
180 |             x_edge_add[edge_index[i][0]][edge_type[i]] += 1
181 |             x_edge_add[edge_index[i][1]][edge_type[i]+10] += 1
182 | 
183 |             x_time_add[edge_index[i][0]][:edge_timestamp[i]+1] += 1
184 |             x_time_add[edge_index[i][1]][45:edge_timestamp[i]+1+45] += 1
185 | 
186 |             if y[edge_index[i][1]] != -100:
187 |                 x_pointLabel_add[edge_index[i][0]][y[edge_index[i][1]]] += 1
188 |             else:
189 |                 x_pointLabel_add[edge_index[i][0]][:2] += 1
190 | 
191 |             if y[edge_index[i][0]] != -100:
192 |                 x_pointLabel_add[edge_index[i][1]][y[edge_index[i][0]]+4] += 1
193 |             else:
194 |                 x_pointLabel_add[edge_index[i][1]][4:6] += 1
195 | 
196 |     train_x_edge_add = torch.from_numpy(x_edge_add[train_mask_t])
197 |     test_x_edge_add = torch.from_numpy(x_edge_add[test_mask])
198 | 
199 |     train_x_time_add = torch.from_numpy(x_time_add[train_mask_t])
200 |     test_x_time_add = torch.from_numpy(x_time_add[test_mask])
201 | 
202 |     train_x_pointLabel_add = torch.from_numpy(x_pointLabel_add[train_mask_t])
203 |     test_x_pointLabel_add = torch.from_numpy(x_pointLabel_add[test_mask])
204 | 
205 |     x_train_all = torch.cat((x_train_add, x_train_all, train_x_edge_add,
206 |                             train_x_time_add, train_x_pointLabel_add), 1)
207 |     x_test_all = torch.cat(
208 |         (x_test_add, x_test_all, test_x_edge_add, test_x_time_add, test_x_pointLabel_add), 1)
209 | 
210 |     X_train, y_train = x_train_all, y_train_all
211 | 
212 |     gbm = LGBMClassifier(objective='binary',
213 |                          # subsample=0.8,
214 |                          # colsample_bytree=0.8,
215 |                          verbosity=2, metric='auc',
216 |                          learning_rate=0.01,
217 |                          n_estimators=1200,
218 |                          min_child_samples=125,
219 |                          max_depth=7,
220 |                          num_leaves=128,
221 |                          reg_alpha=0.1,
222 |                          reg_lambda=0.1,
223 |                          # scale_pos_weight=83.7
224 |                          )
225 | 
226 |     gbm.fit(X_train, y_train)
227 | 
228 |     joblib.dump(gbm, 'model_files/LGBM_model.pkl')
229 |     gbm = joblib.load('model_files/LGBM_model.pkl')
230 | 
231 |     y_pred = gbm.predict_proba(x_test_all, num_iteration=gbm.best_iteration_)
232 |     print(y_pred.shape)
233 |     np.save("submit/output.npy", y_pred)
234 | 
235 | 
# Run the embedding export and LightGBM pipeline only when this file is
# executed as a script.
if __name__ == "__main__":
    main()
238 | 


--------------------------------------------------------------------------------
/models/gat_neighsampler.py:
--------------------------------------------------------------------------------
  1 | from typing import Union
  2 | 
  3 | from torch import Tensor
  4 | from torch_sparse import SparseTensor
  5 | import torch
  6 | import torch.nn.functional as F
  7 | from torch_geometric.nn import GATConv, GATv2Conv
  8 | from tqdm import tqdm
  9 | 
 10 | class GAT_NeighSampler(torch.nn.Module):
 11 |     def __init__(self
 12 |                  , in_channels
 13 |                  , hidden_channels
 14 |                  , out_channels
 15 |                  , num_layers
 16 |                  , dropout
 17 |                  , layer_heads = []
 18 |                  , batchnorm=True):
 19 |         super(GAT_NeighSampler, self).__init__()
 20 | 
 21 |         self.convs = torch.nn.ModuleList()
 22 |         self.batchnorm = batchnorm
 23 |         self.num_layers = num_layers
 24 |         
 25 |         if len(layer_heads)>1:
 26 |             self.convs.append(GATConv(in_channels, hidden_channels, heads=layer_heads[0], concat=True))
 27 |             if self.batchnorm:
 28 |                 self.bns = torch.nn.ModuleList()
 29 |                 self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[0]))
 30 |             for i in range(num_layers - 2):
 31 |                 self.convs.append(GATConv(hidden_channels*layer_heads[i-1], hidden_channels, heads=layer_heads[i], concat=True))
 32 |                 if self.batchnorm:
 33 |                     self.bns.append(torch.nn.BatchNorm1d(hidden_channels*layer_heads[i-1]))
 34 |             self.convs.append(GATConv(hidden_channels*layer_heads[num_layers-2]
 35 |                               , out_channels
 36 |                               , heads=layer_heads[num_layers-1]
 37 |                               , concat=False))
 38 |         else:
 39 |             self.convs.append(GATConv(in_channels, out_channels, heads=layer_heads[0], concat=False))        
 40 | 
 41 |         self.dropout = dropout
 42 |         
 43 |     def reset_parameters(self):
 44 |         for conv in self.convs:
 45 |             conv.reset_parameters()
 46 |         if self.batchnorm:
 47 |             for bn in self.bns:
 48 |                 bn.reset_parameters()        
 49 |         
 50 |         
 51 |     def forward(self, x, adjs):
 52 |         for i, (edge_index, _, size) in enumerate(adjs):
 53 |             x_target = x[:size[1]]
 54 |             x = self.convs[i]((x, x_target), edge_index)
 55 |             if i != self.num_layers-1:
 56 |                 if self.batchnorm:
 57 |                     x = self.bns[i](x)
 58 |                 x = F.relu(x)
 59 |                 x = F.dropout(x, p=0.5, training=self.training)
 60 |                 
 61 |         return x.log_softmax(dim=-1)
 62 |     
 63 |     '''
 64 |     subgraph_loader: size = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1],
 65 |                                   batch_size=**, shuffle=False,
 66 |                                   num_workers=12)
 67 |     You can also sample the complete k-hop neighborhood, but this is rather expensive (especially for Reddit). 
 68 |     We apply here trick here to compute the node embeddings efficiently: 
 69 |        Instead of sampling multiple layers for a mini-batch, we instead compute the node embeddings layer-wise. 
 70 |        Doing this exactly k times mimics a k-layer GNN.  
 71 |     '''
 72 |     
 73 |     def inference_all(self, data):
 74 |         x, adj_t = data.x, data.adj_t
 75 |         for i, conv in enumerate(self.convs[:-1]):
 76 |             x = conv(x, adj_t)
 77 |             if self.batchnorm: 
 78 |                 x = self.bns[i](x)
 79 |             x = F.relu(x)
 80 |             x = F.dropout(x, p=self.dropout, training=self.training)
 81 |         x = self.convs[-1](x, adj_t)
 82 |         return x.log_softmax(dim=-1)
 83 |     
 84 |     def inference(self, x_all, layer_loader, device):
 85 |         pbar = tqdm(total=x_all.size(0) * self.num_layers, ncols=80)
 86 |         pbar.set_description('Evaluating')
 87 | 
 88 |         # Compute representations of nodes layer by layer, using *all*
 89 |         # available edges. This leads to faster computation in contrast to
 90 |         # immediately computing the final representations of each batch.
 91 |         for i in range(self.num_layers):
 92 |             xs = []
 93 |             for batch_size, n_id, adj in layer_loader:
 94 |                 edge_index, _, size = adj.to(device)
 95 |                 x = x_all[n_id].to(device)
 96 |                 x_target = x[:size[1]]
 97 |                 x = self.convs[i]((x, x_target), edge_index)
 98 |                 if i != self.num_layers - 1:
 99 |                     x = F.relu(x)
100 |                     if self.batchnorm: 
101 |                         x = self.bns[i](x)
102 |                 xs.append(x)
103 | 
104 |                 pbar.update(batch_size)
105 | 
106 |             x_all = torch.cat(xs, dim=0)
107 | 
108 |         pbar.close()
109 | 
110 |         return x_all.log_softmax(dim=-1)
111 | 
112 | 
113 | 
class GATv2_NeighSampler(torch.nn.Module):
    """GATv2 network trained on mini-batches of sampled neighborhoods.

    The model is a stack of ``GATv2Conv`` layers. Every layer except the last
    concatenates its attention heads and is followed by (optional) batch
    normalisation, ReLU and dropout; the last layer averages its heads
    (``concat=False``) and its output is turned into log-probabilities.

    Args:
        in_channels: Width of the input node features.
        hidden_channels: Per-head width of every hidden layer.
        out_channels: Width of the output layer (number of classes).
        num_layers: Number of layers applied by ``forward``/``inference``.
        dropout: Dropout probability applied after every hidden layer.
        layer_heads: Number of attention heads per layer. When it holds more
            than one entry it must provide at least ``num_layers`` entries;
            when it holds exactly one entry a single ``in_channels ->
            out_channels`` layer is built.
        batchnorm: Whether to apply ``BatchNorm1d`` after each hidden layer.

    Raises:
        ValueError: If ``layer_heads`` is empty/``None``, or has more than one
            entry but fewer than ``num_layers`` entries.
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout, layer_heads=None, batchnorm=True):
        super(GATv2_NeighSampler, self).__init__()

        # Copy so that neither a shared default nor the caller's list is
        # aliased by the module.
        layer_heads = list(layer_heads) if layer_heads is not None else []
        if not layer_heads:
            raise ValueError("layer_heads must contain at least one entry")

        self.batchnorm = batchnorm
        self.num_layers = num_layers
        self.dropout = dropout

        self.convs = torch.nn.ModuleList()
        # Always defined (possibly empty) so reset_parameters() never hits a
        # missing attribute, e.g. for a single-layer model with batchnorm=True.
        self.bns = torch.nn.ModuleList()

        if len(layer_heads) > 1:
            if len(layer_heads) < num_layers:
                raise ValueError(
                    "layer_heads must provide one entry per layer: got "
                    f"{len(layer_heads)} entries for num_layers={num_layers}"
                )

            # Input layer.
            self.convs.append(GATv2Conv(in_channels, hidden_channels,
                                        heads=layer_heads[0], concat=True))
            if self.batchnorm:
                self.bns.append(
                    torch.nn.BatchNorm1d(hidden_channels * layer_heads[0]))

            # Hidden layers: layer k consumes the concatenated heads of layer
            # k-1 and emits hidden_channels * layer_heads[k] features, which
            # is also the width its batch norm has to match.
            for layer in range(1, num_layers - 1):
                self.convs.append(
                    GATv2Conv(hidden_channels * layer_heads[layer - 1],
                              hidden_channels,
                              heads=layer_heads[layer],
                              concat=True))
                if self.batchnorm:
                    self.bns.append(
                        torch.nn.BatchNorm1d(
                            hidden_channels * layer_heads[layer]))

            # Output layer: heads are averaged rather than concatenated.
            self.convs.append(
                GATv2Conv(hidden_channels * layer_heads[num_layers - 2],
                          out_channels,
                          heads=layer_heads[num_layers - 1],
                          concat=False))
        else:
            self.convs.append(GATv2Conv(in_channels, out_channels,
                                        heads=layer_heads[0], concat=False))

    def reset_parameters(self):
        """Re-initialise every convolution and batch-norm layer."""
        for conv in self.convs:
            conv.reset_parameters()
        if self.batchnorm:
            for bn in self.bns:
                bn.reset_parameters()

    def forward(self, x, adjs):
        """Run the network on one sampled mini-batch.

        Args:
            x: Features of the sampled source nodes of the first hop.
            adjs: Iterable of ``(edge_index, _, size)`` triples, one per
                layer. ``size[1]`` is the number of target nodes of that hop;
                the targets are taken as the first ``size[1]`` rows of the
                current features.

        Returns:
            Log-probabilities (``log_softmax`` over the last dimension) for
            the target nodes of the last hop.
        """
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]
            x = self.convs[i]((x, x_target), edge_index)
            if i != self.num_layers - 1:
                if self.batchnorm:
                    x = self.bns[i](x)
                x = F.relu(x)
                # Use the configured probability (previously hard-coded to
                # 0.5, which silently ignored the ``dropout`` argument).
                x = F.dropout(x, p=self.dropout, training=self.training)

        return x.log_softmax(dim=-1)

    def inference_all(self, data):
        """Run the network on the whole graph in a single pass.

        Args:
            data: Object exposing the node features as ``data.x`` and the
                graph connectivity as ``data.adj_t``.

        Returns:
            Log-probabilities (``log_softmax`` over the last dimension).
        """
        x, adj_t = data.x, data.adj_t
        for i, conv in enumerate(self.convs[:-1]):
            x = conv(x, adj_t)
            if self.batchnorm:
                x = self.bns[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, adj_t)
        return x.log_softmax(dim=-1)

    def inference(self, x_all, layer_loader, device):
        """Compute predictions layer by layer over ``layer_loader``.

        Instead of sampling a multi-hop neighborhood per mini-batch, the
        embeddings are computed one layer at a time using the batches yielded
        by ``layer_loader``; repeating this ``num_layers`` times mimics a
        ``num_layers``-layer GNN. ``layer_loader`` is expected to be a one-hop
        loader, e.g. ``NeighborSampler(data.edge_index, node_idx=None,
        sizes=[-1], batch_size=..., shuffle=False, num_workers=...)``.
        Sampling the complete k-hop neighborhood is also possible but is
        considerably more expensive.

        Args:
            x_all: Input features; indexed with the ``n_id`` of each batch.
            layer_loader: Iterable yielding ``(batch_size, n_id, adj)``.
            device: Device on which each batch is processed.

        Returns:
            Log-probabilities (``log_softmax`` over the last dimension),
            concatenated in the order the batches were yielded.
        """
        with tqdm(total=x_all.size(0) * self.num_layers, ncols=80) as pbar:
            pbar.set_description('Evaluating')

            # Compute representations of nodes layer by layer, using *all*
            # available edges. This leads to faster computation in contrast to
            # immediately computing the final representations of each batch.
            for i in range(self.num_layers):
                xs = []
                for batch_size, n_id, adj in layer_loader:
                    edge_index, _, size = adj.to(device)
                    x = x_all[n_id].to(device)
                    x_target = x[:size[1]]
                    x = self.convs[i]((x, x_target), edge_index)
                    if i != self.num_layers - 1:
                        # Same order as forward()/inference_all(): batch norm
                        # first, then ReLU, so evaluation computes the same
                        # function that was trained.
                        if self.batchnorm:
                            x = self.bns[i](x)
                        x = F.relu(x)
                    xs.append(x)

                    pbar.update(batch_size)

                x_all = torch.cat(xs, dim=0)

        return x_all.log_softmax(dim=-1)
215 | 
216 | 


--------------------------------------------------------------------------------