├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples
├── README.md
├── graphproppred
│ ├── code2
│ │ ├── README.md
│ │ ├── conv.py
│ │ ├── gnn.py
│ │ ├── main_pyg.py
│ │ ├── py2graph.py
│ │ └── utils.py
│ ├── mol
│ │ ├── README.md
│ │ ├── conv.py
│ │ ├── gnn.py
│ │ └── main_pyg.py
│ └── ppa
│ │ ├── README.md
│ │ ├── conv.py
│ │ ├── gnn.py
│ │ └── main_pyg.py
├── linkproppred
│ ├── biokg
│ │ ├── README.md
│ │ ├── dataloader.py
│ │ ├── examples.sh
│ │ ├── model.py
│ │ └── run.py
│ ├── citation2
│ │ ├── README.md
│ │ ├── cluster_gcn.py
│ │ ├── gnn.py
│ │ ├── graph_saint.py
│ │ ├── logger.py
│ │ ├── mf.py
│ │ ├── mlp.py
│ │ ├── node2vec.py
│ │ └── sampler.py
│ ├── collab
│ │ ├── README.md
│ │ ├── gnn.py
│ │ ├── logger.py
│ │ ├── mf.py
│ │ ├── mlp.py
│ │ └── node2vec.py
│ ├── ddi
│ │ ├── README.md
│ │ ├── gnn.py
│ │ ├── logger.py
│ │ ├── mf.py
│ │ ├── mlp.py
│ │ └── node2vec.py
│ ├── ppa
│ │ ├── README.md
│ │ ├── gnn.py
│ │ ├── logger.py
│ │ ├── mf.py
│ │ ├── mlp.py
│ │ └── node2vec.py
│ ├── vessel
│ │ ├── README.md
│ │ ├── gnn.py
│ │ ├── gnn_config.yaml
│ │ ├── logger.py
│ │ ├── mf.py
│ │ ├── mf_config.yaml
│ │ ├── mlp.py
│ │ ├── mlp_config.yaml
│ │ ├── node2vec.py
│ │ └── node2vec_config.yaml
│ └── wikikg2
│ │ ├── README.md
│ │ ├── dataloader.py
│ │ ├── examples.sh
│ │ ├── model.py
│ │ └── run.py
├── lsc
│ ├── mag240m
│ │ ├── README.md
│ │ ├── correct_and_smooth.py
│ │ ├── gnn.py
│ │ ├── label_prop.py
│ │ ├── mlp.py
│ │ ├── preprocess_correct_and_smooth.py
│ │ ├── preprocess_sgc.py
│ │ ├── rgnn.py
│ │ ├── root.py
│ │ ├── sgc.py
│ │ └── sign.py
│ ├── pcqm4m-v2
│ │ ├── README.md
│ │ ├── conv.py
│ │ ├── gnn.py
│ │ ├── main_gnn.py
│ │ ├── main_gnn_multi_gpu.py
│ │ ├── main_mlpfp.py
│ │ └── test_inference_gnn.py
│ ├── pcqm4m
│ │ ├── README.md
│ │ ├── conv.py
│ │ ├── gnn.py
│ │ ├── main_gnn.py
│ │ ├── main_mlpfp.py
│ │ └── test_inference_gnn.py
│ ├── wikikg90m-v2
│ │ └── README.md
│ └── wikikg90m
│ │ ├── README.md
│ │ ├── dgl-ke-ogb-lsc
│ │ ├── .gitignore
│ │ ├── CODE_OF_CONDUCT.md
│ │ ├── CONTRIBUTING.md
│ │ ├── CONTRIBUTORS.md
│ │ ├── Jenkinsfile
│ │ ├── LICENSE
│ │ ├── NOTICE
│ │ ├── README.md
│ │ ├── conda
│ │ │ └── README.md
│ │ ├── docker
│ │ │ └── README.md
│ │ ├── docs
│ │ │ ├── .gitignore
│ │ │ ├── Makefile
│ │ │ ├── images
│ │ │ │ ├── dgl_ke_arch.png
│ │ │ │ ├── dist_train.png
│ │ │ │ ├── distmult.png
│ │ │ │ ├── kg_example.png
│ │ │ │ ├── kge_scores.png
│ │ │ │ ├── metis.png
│ │ │ │ ├── multi-core.png
│ │ │ │ ├── multi-gpu.png
│ │ │ │ ├── multi-gpu.svg
│ │ │ │ ├── rescal.png
│ │ │ │ ├── rescal2.png
│ │ │ │ ├── rescal3.png
│ │ │ │ ├── rotate.png
│ │ │ │ ├── transe.png
│ │ │ │ ├── transr.png
│ │ │ │ ├── vs-gv-fb15k.png
│ │ │ │ └── vs-pbg-fb.png
│ │ │ ├── make.bat
│ │ │ └── source
│ │ │ │ ├── benchmarks.rst
│ │ │ │ ├── commands.rst
│ │ │ │ ├── conf.py
│ │ │ │ ├── dist_train.rst
│ │ │ │ ├── emb_sim.rst
│ │ │ │ ├── eval.rst
│ │ │ │ ├── format_kg.rst
│ │ │ │ ├── format_out.rst
│ │ │ │ ├── index.rst
│ │ │ │ ├── install.rst
│ │ │ │ ├── kg.rst
│ │ │ │ ├── partition.rst
│ │ │ │ ├── predict.rst
│ │ │ │ ├── profile.rst
│ │ │ │ ├── train.rst
│ │ │ │ └── train_user_data.rst
│ │ ├── examples
│ │ │ ├── README.md
│ │ │ ├── wn18
│ │ │ │ ├── head.list
│ │ │ │ ├── raw_head.list
│ │ │ │ ├── raw_rel.list
│ │ │ │ ├── raw_tail.list
│ │ │ │ ├── rel.list
│ │ │ │ └── tail.list
│ │ │ └── wn18_weighted
│ │ │ │ └── README.md
│ │ ├── img
│ │ │ ├── dgl_ke_arch.PNG
│ │ │ ├── logo.png
│ │ │ ├── vs-gv-fb15k.png
│ │ │ ├── vs-gv-wn18.png
│ │ │ └── vs-pbg-fb.png
│ │ └── python
│ │ │ ├── dglke
│ │ │ ├── VERSION.txt
│ │ │ ├── __init__.py
│ │ │ ├── dataloader
│ │ │ │ ├── KGDataset.py
│ │ │ │ ├── __init__.py
│ │ │ │ └── sampler.py
│ │ │ ├── dist_train.py
│ │ │ ├── eval.py
│ │ │ ├── infer_emb_sim.py
│ │ │ ├── infer_score.py
│ │ │ ├── kvclient.py
│ │ │ ├── kvserver.py
│ │ │ ├── models
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base_loss.py
│ │ │ │ ├── general_models.py
│ │ │ │ ├── infer.py
│ │ │ │ ├── ke_model.py
│ │ │ │ ├── mxnet
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ ├── score_fun.py
│ │ │ │ │ └── tensor_models.py
│ │ │ │ └── pytorch
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── ke_tensor.py
│ │ │ │ │ ├── loss.py
│ │ │ │ │ ├── score_fun.py
│ │ │ │ │ └── tensor_models.py
│ │ │ ├── partition.py
│ │ │ ├── train.py
│ │ │ ├── train_mxnet.py
│ │ │ ├── train_pytorch.py
│ │ │ └── utils.py
│ │ │ ├── setup.cfg
│ │ │ └── setup.py
│ │ ├── run.sh
│ │ └── save_test_submission.py
└── nodeproppred
│ ├── arxiv
│ ├── README.md
│ ├── gnn.py
│ ├── logger.py
│ ├── mlp.py
│ └── node2vec.py
│ ├── mag
│ ├── README.md
│ ├── cluster_gcn.py
│ ├── gnn.py
│ ├── graph_saint.py
│ ├── logger.py
│ ├── metapath.py
│ ├── mlp.py
│ ├── rgcn.py
│ └── sampler.py
│ ├── papers100M
│ ├── README.md
│ ├── logger.py
│ ├── mlp.py
│ ├── node2vec.py
│ └── sgc.py
│ ├── products
│ ├── README.md
│ ├── cluster_gcn.py
│ ├── gnn.py
│ ├── graph_saint.py
│ ├── logger.py
│ ├── mlp.py
│ ├── node2vec.py
│ └── sign.py
│ └── proteins
│ ├── README.md
│ ├── gnn.py
│ ├── logger.py
│ ├── mlp.py
│ └── node2vec.py
├── ogb
├── __init__.py
├── graphproppred
│ ├── __init__.py
│ ├── dataset.py
│ ├── dataset_dgl.py
│ ├── dataset_pyg.py
│ ├── evaluate.py
│ ├── make_master_file.py
│ ├── master.csv
│ └── mol_encoder.py
├── io
│ ├── README.md
│ ├── __init__.py
│ ├── read_graph_dgl.py
│ ├── read_graph_pyg.py
│ ├── read_graph_raw.py
│ └── save_dataset.py
├── linkproppred
│ ├── __init__.py
│ ├── dataset.py
│ ├── dataset_dgl.py
│ ├── dataset_pyg.py
│ ├── evaluate.py
│ ├── make_master_file.py
│ └── master.csv
├── lsc
│ ├── __init__.py
│ ├── mag240m.py
│ ├── pcqm4m.py
│ ├── pcqm4m_dgl.py
│ ├── pcqm4m_pyg.py
│ ├── pcqm4mv2.py
│ ├── pcqm4mv2_dgl.py
│ ├── pcqm4mv2_pyg.py
│ ├── utils.py
│ ├── wikikg90m.py
│ └── wikikg90mv2.py
├── nodeproppred
│ ├── __init__.py
│ ├── dataset.py
│ ├── dataset_dgl.py
│ ├── dataset_pyg.py
│ ├── evaluate.py
│ ├── make_master_file.py
│ └── master.csv
├── utils
│ ├── __init__.py
│ ├── features.py
│ ├── mol.py
│ ├── torch_util.py
│ └── url.py
└── version.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | build/
3 | dist/
4 | *.egg-info/
5 | dataset/
6 | *.swp
7 | *.vscode
8 | *.DS_Store
9 | *.pt
10 | *.so
11 | *trial*
12 | *.ipynb
13 | *.sh
14 | *analyze*
15 | *random.py
16 | *RELEASE_*
17 | *.csv.gz
18 | *.zip
19 | *submission_
20 | *.npz
21 | *.npy
22 | **/convert.py
23 | **/mapping/README.md
24 | **.json
25 | **/checkpoint
26 | **/events.out.*
27 | **/test*
28 | **/pcqm4m-scaf/**
29 | **/logs/**
30 | **/lightning_logs/**
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 OGB Team
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include ogb/graphproppred/master.csv
2 | include ogb/nodeproppred/master.csv
3 | include ogb/linkproppred/master.csv
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # OGB Examples
2 |
3 | We provide baseline experiments/example scripts for **[node property prediction tasks](https://github.com/snap-stanford/ogb/tree/master/examples/nodeproppred)**, **[link property prediction tasks](https://github.com/snap-stanford/ogb/tree/master/examples/linkproppred)** and **[graph property prediction tasks](https://github.com/snap-stanford/ogb/tree/master/examples/graphproppred)** on *all* datasets currently included in OGB.
4 | We additionally provide **[our baseline code](https://github.com/snap-stanford/ogb/tree/master/examples/lsc)** for our **[OGB-LSC @ KDD Cup 2021](https://ogb.stanford.edu/kddcup2021/)**.
5 |
6 | Each dataset is hosted in its own folder and provides separate examples for each individual model.
7 | Unless otherwise specified, the default hyper-parameters are what we used to generate the results in our paper and leaderboards.
8 | Please read their respective `README.md` files for further information on how to run them.
9 |
--------------------------------------------------------------------------------
/examples/graphproppred/code2/README.md:
--------------------------------------------------------------------------------
1 | # ogbg-code2
2 |
3 | This repository includes the scripts for GNN baselines for `ogbg-code2` dataset.
4 | This code requires Pytorch Geometric version>=`2.0.2` and torch version>=`1.10.1`.
5 |
6 | **Note (Feb 24, 2021)**: The older version `ogbg-code` is deprecated due to prediction target (i.e., method name) leakage in the input AST. `ogbg-code2` (available from `ogb>=1.2.5`) fixes this issue: the method name and its recursive definition in the AST are replaced with a special token `_mask_`. The leaderboard results of `ogbg-code` and `ogbg-code2` are *not* comparable.
7 |
8 | ## Training & Evaluation
9 |
10 | ```
11 | # Run with default config.
12 | # $GNN_TYPE and $FILENAME are described below.
13 | python main_pyg.py --gnn $GNN_TYPE --filename $FILENAME
14 | ```
15 |
16 | ### `$GNN_TYPE`
17 | `$GNN_TYPE` specifies the GNN architecture. It should be one of the following:
18 | - `gin`: GIN [1]
19 | - `gin-virtual`: GIN over graphs augmented with virtual nodes\* [3]
20 | - `gcn`: GCN [2]
21 | - `gcn-virtual`: GCN over graphs augmented with virtual nodes\* [3]
22 |
23 | \* Additional nodes that are connected to all the nodes in the original graphs.
24 |
25 | ### `$FILENAME`: Specifying output files.
26 | `$FILENAME` specifies the filename to save the result. The result is a dictionary containing (1) best training performance (`'BestTrain'`), (2) best validation performance (`'Val'`), (3) test performance at the best validation epoch (`'Test'`), and (4) training performance at the best validation epoch (`'Train'`).
27 |
28 |
29 | ## Converting code snippet into OGB graph object
30 | [`py2graph.py`](https://github.com/snap-stanford/ogb/blob/master/examples/graphproppred/code2/py2graph.py) is a script that converts a Python code snippet into a graph object that is fully compatible with the OGB code dataset.
31 | This script can be used when one wants to perform transfer learning from external Python code datasets.
32 |
33 | ## References
34 | [1] Xu, K., Hu, W., Leskovec, J., & Jegelka, S. (2019). How powerful are graph neural networks?. ICLR 2019
35 |
36 | [2] Kipf, T. N., & Welling, M. (2017). Semi-supervised classification with graph convolutional networks. ICLR 2017
37 |
38 | [3] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. Neural message passing for quantum chemistry. ICML 2017.
39 |
--------------------------------------------------------------------------------
/examples/graphproppred/code2/gnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch_geometric.nn import MessagePassing
3 | from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
4 | import torch.nn.functional as F
5 | from torch_geometric.nn.inits import uniform
6 |
7 | from conv import GNN_node, GNN_node_Virtualnode
8 |
class GNN(torch.nn.Module):
    '''
        Graph-level sequence predictor: a node-level GNN produces node embeddings,
        a pooling function turns them into one embedding per graph, and one linear
        head per sequence position maps that embedding to `num_vocab` scores.
    '''

    # Names accepted by the `graph_pooling` argument.
    _POOLING_TYPES = ("sum", "mean", "max", "attention", "set2set")

    def __init__(self, num_vocab, max_seq_len, node_encoder, num_layer = 5, emb_dim = 300,
                    gnn_type = 'gin', virtual_node = True, residual = False, drop_ratio = 0.5, JK = "last", graph_pooling = "mean"):
        '''
            num_vocab (int): output size of every per-position linear head
            max_seq_len (int): number of per-position heads (length of the list returned by forward)
            node_encoder: passed unchanged to the node-level GNN imported from conv.py
            num_layer (int): number of GNN layers; must be at least 2
            emb_dim (int): embedding size handed to the node-level GNN and used to size the heads
            gnn_type, residual, drop_ratio, JK: forwarded unchanged to the node-level GNN
            virtual_node (bool): whether to add virtual node or not
            graph_pooling (str): one of "sum", "mean", "max", "attention", "set2set"

            Raises:
                ValueError: if num_layer < 2 or graph_pooling is not a known pooling type.
        '''

        super(GNN, self).__init__()

        self.num_layer = num_layer
        self.drop_ratio = drop_ratio
        self.JK = JK
        self.emb_dim = emb_dim
        self.num_vocab = num_vocab
        self.max_seq_len = max_seq_len
        self.graph_pooling = graph_pooling

        if self.num_layer < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        ### Reject an unknown pooling name before any layer is constructed
        if self.graph_pooling not in self._POOLING_TYPES:
            raise ValueError("Invalid graph pooling type.")

        ### GNN to generate node embeddings
        node_gnn_cls = GNN_node_Virtualnode if virtual_node else GNN_node
        self.gnn_node = node_gnn_cls(num_layer, emb_dim, node_encoder, JK = JK, drop_ratio = drop_ratio, residual = residual, gnn_type = gnn_type)

        ### Pooling function to generate whole-graph embeddings
        if self.graph_pooling == "attention":
            gate_nn = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.BatchNorm1d(2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, 1))
            self.pool = GlobalAttention(gate_nn = gate_nn)
        elif self.graph_pooling == "set2set":
            self.pool = Set2Set(emb_dim, processing_steps = 2)
        else:
            self.pool = {"sum": global_add_pool, "mean": global_mean_pool, "max": global_max_pool}[self.graph_pooling]

        ### One linear head per output position; the set2set branch feeds 2*emb_dim features
        head_in_dim = 2*emb_dim if self.graph_pooling == "set2set" else emb_dim
        self.graph_pred_linear_list = torch.nn.ModuleList(
            [torch.nn.Linear(head_in_dim, self.num_vocab) for _ in range(max_seq_len)])

    def forward(self, batched_data):
        '''
            Return:
                A list of predictions.
                i-th element represents prediction at i-th position of the sequence.
        '''

        h_node = self.gnn_node(batched_data)
        h_graph = self.pool(h_node, batched_data.batch)

        return [self.graph_pred_linear_list[i](h_graph) for i in range(self.max_seq_len)]
80 |
81 | if __name__ == '__main__':
82 | pass
--------------------------------------------------------------------------------
/examples/graphproppred/mol/README.md:
--------------------------------------------------------------------------------
1 | # ogbg-mol
2 |
3 | This repository includes the scripts for GNN baselines for `ogbg-mol*` dataset.
4 |
5 | ## Training & Evaluation
6 |
7 | ```
8 | # Run with default config.
9 | # $DATASET, $GNN_TYPE, and $FILENAME are described below.
10 | python main_pyg.py --dataset $DATASET --gnn $GNN_TYPE --filename $FILENAME
11 | ```
12 |
13 | ### `$DATASET`
14 | `$DATASET` specifies the name of the molecule dataset. It should be one of the following:
15 | - `ogbg-molhiv`
16 | - `ogbg-molpcba`
17 |
18 | Additionally we provide the smaller molecule datasets from MoleculeNet [1].
19 | - `ogbg-molbace`
20 | - `ogbg-molbbbp`
21 | - `ogbg-molclintox`
22 | - `ogbg-molmuv`
23 | - `ogbg-molsider`
24 | - `ogbg-moltox21`
25 | - `ogbg-moltoxcast`
26 | - `ogbg-molesol`
27 | - `ogbg-molfreesolv`
28 | - `ogbg-mollipo`
29 |
30 | The last three datasets (`ogbg-molesol`, `ogbg-molfreesolv`, `ogbg-mollipo`) are for regression, and the rest are for binary classification.
31 |
32 | ### `$GNN_TYPE`
33 | `$GNN_TYPE` specifies the GNN architecture. It should be one of the following:
34 | - `gin`: GIN [2]
35 | - `gin-virtual`: GIN over graphs augmented with virtual nodes\* [4]
36 | - `gcn`: GCN [3]
37 | - `gcn-virtual`: GCN over graphs augmented with virtual nodes\* [4]
38 |
39 | \* Additional nodes that are connected to all the nodes in the original graphs.
40 |
41 | ### `$FILENAME`: Specifying output files.
42 | `$FILENAME` specifies the filename to save the result. The result is a dictionary containing (1) best training performance (`'BestTrain'`), (2) best validation performance (`'Val'`), (3) test performance at the best validation epoch (`'Test'`), and (4) training performance at the best validation epoch (`'Train'`).
43 |
44 | ## Converting SMILES string into OGB graph object
45 | Molecules are typically represented as [SMILES strings](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system).
46 | OGB package provides the utility to transform the SMILES string into a graph object that is fully compatible with the OGB molecule datasets. This can be used when one wants to perform transfer learning from external molecular datasets.
47 |
48 | ```python
49 | from ogb.utils import smiles2graph
50 | graph = smiles2graph('O1C=C[C@H]([C@H]1O2)c3c2cc(OC)c4c3OC(=O)C5=C4CCC(=O)5')
51 | ```
52 |
53 | ## References
54 | [1] Wu, Z., Ramsundar, B., Feinberg, E. N., Gomes, J., Geniesse, C., Pappu, A. S., ... & Pande, V. (2018). MoleculeNet: a benchmark for molecular machine learning. Chemical science, 9(2), 513-530.
55 |
56 | [2] Xu, K., Hu, W., Leskovec, J., & Jegelka, S. (2019). How powerful are graph neural networks?. ICLR 2019
57 |
58 | [3] Kipf, T. N., & Welling, M. (2017). Semi-supervised classification with graph convolutional networks. ICLR 2017
59 |
60 | [4] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. Neural message passing for quantum chemistry. ICML 2017.
61 |
--------------------------------------------------------------------------------
/examples/graphproppred/mol/gnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch_geometric.nn import MessagePassing
3 | from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
4 | import torch.nn.functional as F
5 | from torch_geometric.nn.inits import uniform
6 |
7 | from conv import GNN_node, GNN_node_Virtualnode
8 |
class GNN(torch.nn.Module):
    '''
        Graph-level predictor: a node-level GNN produces node embeddings, a pooling
        function turns them into one embedding per graph, and a single linear layer
        maps that embedding to `num_tasks` outputs.
    '''

    # Names accepted by the `graph_pooling` argument.
    _POOLING_TYPES = ("sum", "mean", "max", "attention", "set2set")

    def __init__(self, num_tasks, num_layer = 5, emb_dim = 300,
                    gnn_type = 'gin', virtual_node = True, residual = False, drop_ratio = 0.5, JK = "last", graph_pooling = "mean"):
        '''
            num_tasks (int): number of labels to be predicted
            num_layer (int): number of GNN layers; must be at least 2
            emb_dim (int): embedding size handed to the node-level GNN and used to size the output layer
            gnn_type, residual, drop_ratio, JK: forwarded unchanged to the node-level GNN from conv.py
            virtual_node (bool): whether to add virtual node or not
            graph_pooling (str): one of "sum", "mean", "max", "attention", "set2set"

            Raises:
                ValueError: if num_layer < 2 or graph_pooling is not a known pooling type.
        '''

        super(GNN, self).__init__()

        self.num_layer = num_layer
        self.drop_ratio = drop_ratio
        self.JK = JK
        self.emb_dim = emb_dim
        self.num_tasks = num_tasks
        self.graph_pooling = graph_pooling

        if self.num_layer < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        ### Reject an unknown pooling name before any layer is constructed
        if self.graph_pooling not in self._POOLING_TYPES:
            raise ValueError("Invalid graph pooling type.")

        ### GNN to generate node embeddings
        node_gnn_cls = GNN_node_Virtualnode if virtual_node else GNN_node
        self.gnn_node = node_gnn_cls(num_layer, emb_dim, JK = JK, drop_ratio = drop_ratio, residual = residual, gnn_type = gnn_type)

        ### Pooling function to generate whole-graph embeddings
        if self.graph_pooling == "attention":
            gate_nn = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.BatchNorm1d(2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, 1))
            self.pool = GlobalAttention(gate_nn = gate_nn)
        elif self.graph_pooling == "set2set":
            self.pool = Set2Set(emb_dim, processing_steps = 2)
        else:
            self.pool = {"sum": global_add_pool, "mean": global_mean_pool, "max": global_max_pool}[self.graph_pooling]

        ### Output layer; the set2set branch feeds 2*emb_dim features
        head_in_dim = 2*self.emb_dim if self.graph_pooling == "set2set" else self.emb_dim
        self.graph_pred_linear = torch.nn.Linear(head_in_dim, self.num_tasks)

    def forward(self, batched_data):
        '''
            Return:
                Output of the linear layer applied to the pooled graph embeddings.
        '''
        h_node = self.gnn_node(batched_data)
        h_graph = self.pool(h_node, batched_data.batch)

        return self.graph_pred_linear(h_graph)
62 |
63 |
64 | if __name__ == '__main__':
65 | GNN(num_tasks = 10)
--------------------------------------------------------------------------------
/examples/graphproppred/ppa/README.md:
--------------------------------------------------------------------------------
1 | # ogbg-ppa
2 |
3 | This repository includes the scripts for GNN baselines for `ogbg-ppa` dataset.
4 |
5 | ## Training & Evaluation
6 |
7 | ```
8 | # Run with default config.
9 | # $GNN_TYPE and $FILENAME are described below.
10 | python main_pyg.py --gnn $GNN_TYPE --filename $FILENAME
11 | ```
12 |
13 | ### `$GNN_TYPE`
14 | `$GNN_TYPE` specifies the GNN architecture. It should be one of the following:
15 | - `gin`: GIN [1]
16 | - `gin-virtual`: GIN over graphs augmented with virtual nodes\* [3]
17 | - `gcn`: GCN [2]
18 | - `gcn-virtual`: GCN over graphs augmented with virtual nodes\* [3]
19 |
20 | \* Additional nodes that are connected to all the nodes in the original graphs.
21 |
22 | ### `$FILENAME`: Specifying output files.
23 | `$FILENAME` specifies the filename to save the result. The result is a dictionary containing (1) best training performance (`'BestTrain'`), (2) best validation performance (`'Val'`), (3) test performance at the best validation epoch (`'Test'`), and (4) training performance at the best validation epoch (`'Train'`).
24 |
25 |
26 | ## References
27 | [1] Xu, K., Hu, W., Leskovec, J., & Jegelka, S. (2019). How powerful are graph neural networks?. ICLR 2019
28 |
29 | [2] Kipf, T. N., & Welling, M. (2017). Semi-supervised classification with graph convolutional networks. ICLR 2017
30 |
31 | [3] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. Neural message passing for quantum chemistry. ICML 2017.
32 |
--------------------------------------------------------------------------------
/examples/graphproppred/ppa/gnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch_geometric.nn import MessagePassing
3 | from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
4 | import torch.nn.functional as F
5 | from torch_geometric.nn.inits import uniform
6 |
7 | from conv import GNN_node, GNN_node_Virtualnode
8 |
class GNN(torch.nn.Module):
    '''
        Graph-level classifier: a node-level GNN produces node embeddings, a pooling
        function turns them into one embedding per graph, and a single linear layer
        maps that embedding to `num_class` outputs.
    '''

    # Names accepted by the `graph_pooling` argument.
    _POOLING_TYPES = ("sum", "mean", "max", "attention", "set2set")

    def __init__(self, num_class, num_layer = 5, emb_dim = 300,
                    gnn_type = 'gin', virtual_node = True, residual = False, drop_ratio = 0.5, JK = "last", graph_pooling = "mean"):
        '''
            num_class (int): output size of the final linear layer
            num_layer (int): number of GNN layers; must be at least 2
            emb_dim (int): embedding size handed to the node-level GNN and used to size the output layer
            gnn_type, residual, drop_ratio, JK: forwarded unchanged to the node-level GNN from conv.py
            virtual_node (bool): whether to add virtual node or not
            graph_pooling (str): one of "sum", "mean", "max", "attention", "set2set"

            Raises:
                ValueError: if num_layer < 2 or graph_pooling is not a known pooling type.
        '''

        super(GNN, self).__init__()

        self.num_layer = num_layer
        self.drop_ratio = drop_ratio
        self.JK = JK
        self.emb_dim = emb_dim
        self.num_class = num_class
        self.graph_pooling = graph_pooling

        if self.num_layer < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        ### Reject an unknown pooling name before any layer is constructed
        if self.graph_pooling not in self._POOLING_TYPES:
            raise ValueError("Invalid graph pooling type.")

        ### GNN to generate node embeddings
        node_gnn_cls = GNN_node_Virtualnode if virtual_node else GNN_node
        self.gnn_node = node_gnn_cls(num_layer, emb_dim, JK = JK, drop_ratio = drop_ratio, residual = residual, gnn_type = gnn_type)

        ### Pooling function to generate whole-graph embeddings
        if self.graph_pooling == "attention":
            gate_nn = torch.nn.Sequential(torch.nn.Linear(emb_dim, 2*emb_dim), torch.nn.BatchNorm1d(2*emb_dim), torch.nn.ReLU(), torch.nn.Linear(2*emb_dim, 1))
            self.pool = GlobalAttention(gate_nn = gate_nn)
        elif self.graph_pooling == "set2set":
            self.pool = Set2Set(emb_dim, processing_steps = 2)
        else:
            self.pool = {"sum": global_add_pool, "mean": global_mean_pool, "max": global_max_pool}[self.graph_pooling]

        ### Output layer; the set2set branch feeds 2*emb_dim features
        head_in_dim = 2*self.emb_dim if self.graph_pooling == "set2set" else self.emb_dim
        self.graph_pred_linear = torch.nn.Linear(head_in_dim, self.num_class)

    def forward(self, batched_data):
        '''
            Return:
                Output of the linear layer applied to the pooled graph embeddings.
        '''
        h_node = self.gnn_node(batched_data)
        h_graph = self.pool(h_node, batched_data.batch)

        return self.graph_pred_linear(h_graph)
62 |
63 |
64 | if __name__ == '__main__':
65 | GNN(num_class = 10)
--------------------------------------------------------------------------------
/examples/linkproppred/biokg/README.md:
--------------------------------------------------------------------------------
1 | # ogbl-biokg
2 |
3 | This code includes implementation of TransE, DistMult, ComplEx and RotatE with OGB evaluator. It is based on this [repository](https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding).
4 |
5 | ## Training & Evaluation
6 |
7 | ```
8 | # Run with default config
9 | bash examples.sh
10 | ```
--------------------------------------------------------------------------------
/examples/linkproppred/biokg/examples.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Trains and evaluates four models (TransE, DistMult, RotatE, ComplEx) through run.py.
# The commands are not backgrounded, so they run one after another even though
# each one is pinned to a different GPU through CUDA_VISIBLE_DEVICES.
#
# The interpreter defaults to python3.5 and can be overridden from the environment:
#   PYTHON=python3 bash examples.sh
PYTHON="${PYTHON:-python3.5}"

CUDA_VISIBLE_DEVICES=0 "$PYTHON" run.py --do_train --cuda --do_valid --do_test --evaluate_train \
  --model TransE -n 128 -b 512 -d 2000 -g 20 -a 1.0 -adv \
  -lr 0.0001 --max_steps 300000 --cpu_num 2 --test_batch_size 32

CUDA_VISIBLE_DEVICES=1 "$PYTHON" run.py --do_train --cuda --do_valid --do_test --evaluate_train \
  --model DistMult -n 128 -b 512 -d 2000 -g 500 -a 1.0 -adv \
  -lr 0.001 --max_steps 300000 --cpu_num 2 --test_batch_size 32 -r 0.000002

CUDA_VISIBLE_DEVICES=2 "$PYTHON" run.py --do_train --cuda --do_valid --do_test --evaluate_train \
  --model RotatE -n 128 -b 512 -d 1000 -g 20 -a 1.0 -adv \
  -lr 0.0001 --max_steps 300000 --cpu_num 2 --test_batch_size 32 -de

CUDA_VISIBLE_DEVICES=3 "$PYTHON" run.py --do_train --cuda --do_valid --do_test --evaluate_train \
  --model ComplEx -n 128 -b 512 -d 1000 -g 500 -a 1.0 -adv \
  -lr 0.001 --max_steps 300000 --cpu_num 2 --test_batch_size 32 -de -dr -r 0.000002

--------------------------------------------------------------------------------
/examples/linkproppred/citation2/README.md:
--------------------------------------------------------------------------------
1 | # ogbl-citation2
2 |
3 | **Note (Dec 29, 2020)**: The older version `ogbl-citation` is deprecated because negative samples used in validation and test sets are found to be quite biased (i.e., half of the entity nodes are never sampled as negative examples). `ogbl-citation2` (available from `ogb>=1.2.4`) fixes this issue while keeping everything else the same. The leaderboard results of `ogbl-citation` and `ogbl-citation2` are *not* comparable.
4 |
5 | This repository includes the following example scripts:
6 |
7 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/citation2/mlp.py)**: Full-batch MLP training based on paper features and optional Node2Vec features (`--use_node_embedding`). For training with Node2Vec features, this script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>= 1.5.0`].
8 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/citation2/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`). This script will require large amounts of GPU memory [requires `torch-geometric>=1.6.0`].
9 | * **[Matrix Factorization](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/citation2/mf.py)**: Full-batch Matrix Factorization training.
10 | * **[NeighborSampler](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/citation2/sampler.py)**: Mini-batch GNN training using neighbor sampling [requires `torch-geometric>=1.5.0`].
11 | * **[Cluster-GCN](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/citation2/cluster_gcn.py)**: Mini-batch GCN training using the Cluster-GCN algorithm [requires `torch-geometric>= 1.4.3`].
12 | * **[GraphSAINT](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/citation2/graph_saint.py)**: Mini-batch GCN training using the GraphSAINT algorithm [requires `torch-geometric>=1.5.0`].
13 |
14 | ## Training & Evaluation
15 |
16 | ```
17 | # Run with default config
18 | python cluster_gcn.py
19 |
20 | # Run with custom config
21 | python cluster_gcn.py --hidden_channels=128
22 | ```
23 |
--------------------------------------------------------------------------------
/examples/linkproppred/citation2/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Collect per-run result triples and print summary statistics.

    Each result is a 3-item sequence whose columns are reported as
    train (index 0), valid (index 1) and test (index 2). "Final" numbers
    are read at the entry with the highest valid value.
    """

    def __init__(self, runs, info=None):
        """Create ``runs`` empty result lists; ``info`` is stored unchanged."""
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one 3-item ``result`` to the list of run index ``run``."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def reset(self, run):
        """Discard everything recorded so far for run index ``run``."""
        assert run >= 0 and run < len(self.results)
        self.results[run] = []

    def print_statistics(self, run=None):
        """Print statistics for one run, or mean ± std over all runs.

        With ``run`` given, prints the highest train/valid values of that run
        and the train/test values at its best-valid entry. With ``run=None``,
        computes those four numbers per run and prints their mean and
        standard deviation across runs.
        """
        if run is not None:
            result = torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.4f}')
            print(f'Highest Valid: {result[:, 1].max():.4f}')
            print(f' Final Train: {result[argmax, 0]:.4f}')
            print(f' Final Test: {result[argmax, 2]:.4f}')
            return

        best_results = []
        # Convert each run on its own: a single tensor over all runs cannot be
        # built when runs hold different numbers of entries.
        for run_results in self.results:
            if not run_results:
                continue  # nothing recorded for this run
            r = torch.tensor(run_results)
            best = r[:, 1].argmax()
            train1 = r[:, 0].max().item()
            valid = r[:, 1].max().item()
            train2 = r[best, 0].item()
            test = r[best, 2].item()
            best_results.append((train1, valid, train2, test))

        if not best_results:
            print('No results recorded.')
            return

        best_result = torch.tensor(best_results)
        print(best_result)

        print('All runs:')
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.4f} ± {r.std():.4f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.4f} ± {r.std():.4f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.4f} ± {r.std():.4f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.4f} ± {r.std():.4f}')
50 |
--------------------------------------------------------------------------------
/examples/linkproppred/citation2/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 | from torch_geometric.utils import to_undirected
6 |
7 | from ogb.linkproppred import PygLinkPropPredDataset
8 |
9 |
def save_embedding(model, path='embedding.pt'):
    """Save a CPU copy of ``model.embedding.weight`` with ``torch.save``.

    Args:
        model: object exposing ``embedding.weight``.
        path: destination file; defaults to ``'embedding.pt'`` in the current
            working directory, which is where existing callers write.
    """
    torch.save(model.embedding.weight.data.cpu(), path)
12 |
13 |
def main():
    """Train Node2Vec on ogbl-citation2 and save the embedding via save_embedding.

    Hyper-parameters come from the command line; the embedding is saved
    periodically during training as well as at the end.
    """
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=40)
    parser.add_argument('--context_size', type=int, default=20)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--log_steps', type=int, default=1)
    args = parser.parse_args()

    # Use the requested CUDA device when CUDA is available, otherwise the CPU.
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    data = dataset[0]
    # The walks are run on the undirected version of the graph.
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node,
                     sparse=True).to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    # The model is built with sparse=True, and SparseAdam is the optimizer used for it.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if (i + 1) % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        # NOTE(review): placed at the end of each epoch; the nesting level of
        # this call should be confirmed against the upstream file.
        save_embedding(model)
57 |
58 |
59 | if __name__ == "__main__":
60 | main()
61 |
--------------------------------------------------------------------------------
/examples/linkproppred/collab/README.md:
--------------------------------------------------------------------------------
1 | # ogbl-collab
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/collab/mlp.py)**: Full-batch MLP training based on author features and optional Node2Vec features (`--use_node_embedding`). For training with Node2Vec features, this script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/collab/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`) [requires `torch-geometric>=1.6.0`]. Setting `--use_valedges_as_input` would allow models to use validation edges at inference time. See [here](https://ogb.stanford.edu/docs/leader_rules/) for the rules of using validation labels.
7 | * **[Matrix Factorization](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/collab/mf.py)**: Full-batch Matrix Factorization training.
8 |
9 | ## Training & Evaluation
10 |
11 | ```
12 | # Run with default config
13 | python gnn.py
14 |
15 | # Run with inference using validation edges
16 | python gnn.py --use_valedges_as_input
17 | ```
18 |
--------------------------------------------------------------------------------
/examples/linkproppred/collab/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run (train, valid, test) results and print summaries.

    Stored values are multiplied by 100 when printed.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of ``runs`` runs.

        Args:
            runs: Number of runs to allocate storage for.
            info: Optional object kept on ``self.info``; the logger itself
                never reads it.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one result triple to run ``run``.

        Args:
            run: Zero-based run index; must satisfy ``0 <= run < runs``.
            result: Sequence of length 3, read by ``print_statistics`` as
                (train, valid, test).

        Raises:
            AssertionError: If ``result`` does not have three entries or
                ``run`` is out of range.
        """
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print the best results of one run, or a summary over all runs.

        The "final" train/test values are the ones logged at the step with
        the highest validation value.

        Args:
            run: Zero-based run index. When ``None``, print the mean and
                standard deviation of every run's best values instead.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        # Convert run by run: one ``torch.tensor(self.results)`` call rejects
        # runs that logged different numbers of results.
        best_results = []
        for run_results in self.results:
            r = 100 * torch.tensor(run_results)
            best_step = r[:, 1].argmax()
            best_results.append((
                r[:, 0].max().item(),
                r[:, 1].max().item(),
                r[best_step, 0].item(),
                r[best_step, 2].item(),
            ))

        best_result = torch.tensor(best_results)

        print('All runs:')
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/linkproppred/collab/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 |
6 | from ogb.linkproppred import PygLinkPropPredDataset
7 |
8 |
def save_embedding(model):
    """Save a CPU copy of ``model.embedding.weight`` to ``embedding.pt``.

    The file is written relative to the current working directory and is
    overwritten on every call.
    """
    weights = model.embedding.weight.data
    cpu_weights = weights.cpu()
    torch.save(cpu_weights, 'embedding.pt')
11 |
12 |
def main():
    """Train Node2Vec embeddings on ogbl-collab and save them to disk.

    Hyper-parameters are read from the command line. The embedding matrix is
    written through ``save_embedding`` every 100 steps and once more when the
    inner loop over the loader finishes.
    """
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (Node2Vec)')
    # (flag, type, default), registered in this order.
    options = (
        ('--device', int, 0),
        ('--embedding_dim', int, 128),
        ('--walk_length', int, 40),
        ('--context_size', int, 20),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 0.01),
        ('--epochs', int, 2),
        ('--log_steps', int, 1),
    )
    for flag, value_type, default in options:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.device}')
    else:
        device = torch.device('cpu')

    graph = PygLinkPropPredDataset(name='ogbl-collab')[0]

    model = Node2Vec(graph.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if step % 100 == 0:  # Checkpoint every 100 steps.
                save_embedding(model)
        # NOTE(review): kept at epoch level; the listing lost its
        # indentation, so confirm this is not meant to run after all epochs.
        save_embedding(model)
55 |
56 |
57 | if __name__ == "__main__":
58 | main()
59 |
--------------------------------------------------------------------------------
/examples/linkproppred/ddi/README.md:
--------------------------------------------------------------------------------
1 | # ogbl-ddi
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/ddi/mlp.py)**: Full-batch MLP training based on Node2Vec features. This script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/ddi/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`) [requires `torch-geometric>=1.6.0`].
7 | * **[Matrix Factorization](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/ddi/mf.py)**: Full-batch Matrix Factorization training.
8 |
9 | ## Training & Evaluation
10 |
11 | ```
12 | # Run with default config
13 | python mlp.py
14 |
15 | # Run with custom config
16 | python mlp.py --hidden_channels=128
17 | ```
18 |
--------------------------------------------------------------------------------
/examples/linkproppred/ddi/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run (train, valid, test) results and print summaries.

    Stored values are multiplied by 100 when printed.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of ``runs`` runs.

        Args:
            runs: Number of runs to allocate storage for.
            info: Optional object kept on ``self.info``; the logger itself
                never reads it.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one result triple to run ``run``.

        Args:
            run: Zero-based run index; must satisfy ``0 <= run < runs``.
            result: Sequence of length 3, read by ``print_statistics`` as
                (train, valid, test).

        Raises:
            AssertionError: If ``result`` does not have three entries or
                ``run`` is out of range.
        """
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print the best results of one run, or a summary over all runs.

        The "final" train/test values are the ones logged at the step with
        the highest validation value.

        Args:
            run: Zero-based run index. When ``None``, print the mean and
                standard deviation of every run's best values instead.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        # Convert run by run: one ``torch.tensor(self.results)`` call rejects
        # runs that logged different numbers of results.
        best_results = []
        for run_results in self.results:
            r = 100 * torch.tensor(run_results)
            best_step = r[:, 1].argmax()
            best_results.append((
                r[:, 0].max().item(),
                r[:, 1].max().item(),
                r[best_step, 0].item(),
                r[best_step, 2].item(),
            ))

        best_result = torch.tensor(best_results)

        print('All runs:')
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/linkproppred/ddi/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 |
6 | from ogb.linkproppred import PygLinkPropPredDataset
7 |
8 |
def save_embedding(model):
    """Save a CPU copy of ``model.embedding.weight`` to ``embedding.pt``.

    The file is written relative to the current working directory and is
    overwritten on every call.
    """
    weights = model.embedding.weight.data
    cpu_weights = weights.cpu()
    torch.save(cpu_weights, 'embedding.pt')
11 |
12 |
def main():
    """Train Node2Vec embeddings on ogbl-ddi and save them to disk.

    Hyper-parameters are read from the command line. The embedding matrix is
    written through ``save_embedding`` every 100 steps and once more when the
    inner loop over the loader finishes.
    """
    parser = argparse.ArgumentParser(description='OGBL-DDI (Node2Vec)')
    # (flag, type, default), registered in this order.
    options = (
        ('--device', int, 0),
        ('--embedding_dim', int, 128),
        ('--walk_length', int, 40),
        ('--context_size', int, 20),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 0.01),
        ('--epochs', int, 100),
        ('--log_steps', int, 1),
    )
    for flag, value_type, default in options:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.device}')
    else:
        device = torch.device('cpu')

    graph = PygLinkPropPredDataset(name='ogbl-ddi')[0]

    model = Node2Vec(graph.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if step % 100 == 0:  # Checkpoint every 100 steps.
                save_embedding(model)
        # NOTE(review): kept at epoch level; the listing lost its
        # indentation, so confirm this is not meant to run after all epochs.
        save_embedding(model)
55 |
56 |
57 | if __name__ == "__main__":
58 | main()
59 |
--------------------------------------------------------------------------------
/examples/linkproppred/ppa/README.md:
--------------------------------------------------------------------------------
1 | # ogbl-ppa
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/ppa/mlp.py)**: Full-batch MLP training based on Node2Vec features. This script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/ppa/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`) [requires `torch-geometric>=1.6.0`].
7 | * **[Matrix Factorization](https://github.com/snap-stanford/ogb/blob/master/examples/linkproppred/ppa/mf.py)**: Full-batch Matrix Factorization training.
8 |
9 | ## Training & Evaluation
10 |
11 | ```
12 | # Run with default config
13 | python mlp.py
14 |
15 | # Run with custom config
16 | python mlp.py --hidden_channels=128
17 | ```
18 |
--------------------------------------------------------------------------------
/examples/linkproppred/ppa/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run (train, valid, test) results and print summaries.

    Stored values are multiplied by 100 when printed.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of ``runs`` runs.

        Args:
            runs: Number of runs to allocate storage for.
            info: Optional object kept on ``self.info``; the logger itself
                never reads it.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one result triple to run ``run``.

        Args:
            run: Zero-based run index; must satisfy ``0 <= run < runs``.
            result: Sequence of length 3, read by ``print_statistics`` as
                (train, valid, test).

        Raises:
            AssertionError: If ``result`` does not have three entries or
                ``run`` is out of range.
        """
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print the best results of one run, or a summary over all runs.

        The "final" train/test values are the ones logged at the step with
        the highest validation value.

        Args:
            run: Zero-based run index. When ``None``, print the mean and
                standard deviation of every run's best values instead.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        # Convert run by run: one ``torch.tensor(self.results)`` call rejects
        # runs that logged different numbers of results.
        best_results = []
        for run_results in self.results:
            r = 100 * torch.tensor(run_results)
            best_step = r[:, 1].argmax()
            best_results.append((
                r[:, 0].max().item(),
                r[:, 1].max().item(),
                r[best_step, 0].item(),
                r[best_step, 2].item(),
            ))

        best_result = torch.tensor(best_results)

        print('All runs:')
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/linkproppred/ppa/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 |
6 | from ogb.linkproppred import PygLinkPropPredDataset
7 |
8 |
def save_embedding(model):
    """Save a CPU copy of ``model.embedding.weight`` to ``embedding.pt``.

    The file is written relative to the current working directory and is
    overwritten on every call.
    """
    weights = model.embedding.weight.data
    cpu_weights = weights.cpu()
    torch.save(cpu_weights, 'embedding.pt')
11 |
12 |
def main():
    """Train Node2Vec embeddings on ogbl-ppa and save them to disk.

    Hyper-parameters are read from the command line. The embedding matrix is
    written through ``save_embedding`` every 100 steps and once more when the
    inner loop over the loader finishes.
    """
    parser = argparse.ArgumentParser(description='OGBL-PPA (Node2Vec)')
    # (flag, type, default), registered in this order.
    options = (
        ('--device', int, 0),
        ('--embedding_dim', int, 128),
        ('--walk_length', int, 40),
        ('--context_size', int, 20),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 0.01),
        ('--epochs', int, 2),
        ('--log_steps', int, 1),
    )
    for flag, value_type, default in options:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.device}')
    else:
        device = torch.device('cpu')

    graph = PygLinkPropPredDataset(name='ogbl-ppa')[0]

    model = Node2Vec(graph.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if step % 100 == 0:  # Checkpoint every 100 steps.
                save_embedding(model)
        # NOTE(review): kept at epoch level; the listing lost its
        # indentation, so confirm this is not meant to run after all epochs.
        save_embedding(model)
55 |
56 |
57 | if __name__ == "__main__":
58 | main()
59 |
--------------------------------------------------------------------------------
/examples/linkproppred/vessel/gnn_config.yaml:
--------------------------------------------------------------------------------
1 |
2 | program: gnn_wandb.py
3 | method: grid
4 | metric:
5 | name: loss
6 | goal: minimize
7 | parameters:
8 | num_layers:
9 | values: [1, 2, 3, 4]
10 | hidden_channels:
11 | values: [32, 64, 128, 256]
12 | dropout:
13 | values: [0, 0.2, 0.5]
14 | lr:
15 | values: [1e-2, 1e-3, 1e-4]
16 | batch_size:
17 | values: [512, 1024, 65536]
--------------------------------------------------------------------------------
/examples/linkproppred/vessel/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run (train, valid, test) results and print summaries.

    Stored values are multiplied by 100 when printed.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of ``runs`` runs.

        Args:
            runs: Number of runs to allocate storage for.
            info: Optional object kept on ``self.info``; the logger itself
                never reads it.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one result triple to run ``run``.

        Args:
            run: Zero-based run index; must satisfy ``0 <= run < runs``.
            result: Sequence of length 3, read by ``print_statistics`` as
                (train, valid, test).

        Raises:
            AssertionError: If ``result`` does not have three entries or
                ``run`` is out of range.
        """
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print the best results of one run, or a summary over all runs.

        The "final" train/test values are the ones logged at the step with
        the highest validation value.

        Args:
            run: Zero-based run index. When ``None``, print the mean and
                standard deviation of every run's best values instead.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        # Convert run by run: one ``torch.tensor(self.results)`` call rejects
        # runs that logged different numbers of results.
        best_results = []
        for run_results in self.results:
            r = 100 * torch.tensor(run_results)
            best_step = r[:, 1].argmax()
            best_results.append((
                r[:, 0].max().item(),
                r[:, 1].max().item(),
                r[best_step, 0].item(),
                r[best_step, 2].item(),
            ))

        best_result = torch.tensor(best_results)

        print('All runs:')
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/linkproppred/vessel/mf_config.yaml:
--------------------------------------------------------------------------------
1 | program: mf_wandb.py
2 | method: grid
3 | metric:
4 | name: loss
5 | goal: minimize
6 | parameters:
7 | num_layers:
8 | values: [1, 2, 3, 4]
9 | hidden_channels:
10 | values: [32, 64, 128, 256]
11 | dropout:
12 | values: [0, 0.2, 0.5]
13 | lr:
14 | values: [1e-2, 1e-3, 1e-4]
15 | batch_size:
16 | values: [512, 1024, 65536]
--------------------------------------------------------------------------------
/examples/linkproppred/vessel/mlp_config.yaml:
--------------------------------------------------------------------------------
1 |
2 | program: mlp_wandb.py
3 | method: grid
4 | metric:
5 | name: loss
6 | goal: minimize
7 | parameters:
8 | num_layers:
9 | values: [1, 2, 3, 4]
10 | hidden_channels:
11 | values: [32, 64, 128, 256]
12 | lr:
13 | values: [1e-2, 1e-3, 1e-4]
14 | batch_size:
15 | values: [512, 1024, 65536]
16 | dropout:
17 | values: [0, 0.2, 0.5]
--------------------------------------------------------------------------------
/examples/linkproppred/vessel/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 |
6 | from ogb.linkproppred import PygLinkPropPredDataset
7 |
8 |
def save_embedding(model):
    """Save a CPU copy of ``model.embedding.weight`` to ``embedding.pt``.

    The file is written relative to the current working directory and is
    overwritten on every call.
    """
    weights = model.embedding.weight.data
    cpu_weights = weights.cpu()
    torch.save(cpu_weights, 'embedding.pt')
11 |
12 |
def main():
    """Train Node2Vec embeddings on ogbl-vessel and save them to disk.

    Hyper-parameters are read from the command line. The embedding matrix is
    written through ``save_embedding`` every 100 steps and once more when the
    inner loop over the loader finishes.
    """
    parser = argparse.ArgumentParser(description='OGBL-Vessel (Node2Vec)')
    # (flag, type, default), registered in this order.
    options = (
        ('--device', int, 0),
        ('--embedding_dim', int, 64),
        ('--walk_length', int, 5),
        ('--context_size', int, 5),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 1e-6),
        ('--epochs', int, 2),
        ('--log_steps', int, 1),
    )
    for flag, value_type, default in options:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.device}')
    else:
        device = torch.device('cpu')

    graph = PygLinkPropPredDataset(name='ogbl-vessel')[0]

    model = Node2Vec(graph.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if step % 100 == 0:  # Checkpoint every 100 steps.
                save_embedding(model)
        # NOTE(review): kept at epoch level; the listing lost its
        # indentation, so confirm this is not meant to run after all epochs.
        save_embedding(model)
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/examples/linkproppred/vessel/node2vec_config.yaml:
--------------------------------------------------------------------------------
1 | program: node2vec_wandb.py
2 | method: grid
3 | metric:
4 | name: loss
5 | goal: minimize
6 | parameters:
7 | lr:
8 | values: [1e-2, 1e-3, 1e-4, 1e-5]
9 | batch_size:
10 | values: [256, 512, 1024, 2048, 65536]
11 | walk_length:
12 | values: [3,5,10,15,20]
13 | context_size:
14 | values: [3,5,10,15,20]
15 | embedding_dim:
16 | values: [4,8,16,32,64,128]
17 | walks_per_node:
18 | values: [3,5,7,10,15,20]
--------------------------------------------------------------------------------
/examples/linkproppred/wikikg2/README.md:
--------------------------------------------------------------------------------
1 | # ogbl-wikikg2
2 |
3 | **Note (Dec 29, 2020)**: The older version `ogbl-wikikg` is deprecated because negative samples used in validation and test sets are found to be quite biased (i.e., half of the entity nodes are never sampled as negative examples). `ogbl-wikikg2` (available from `ogb>=1.2.4`) fixes this issue while retaining everything else the same. The leaderboard results of `ogbl-wikikg` and `ogbl-wikikg2` are *not* comparable.
4 |
5 | This code includes implementation of TransE, DistMult, ComplEx and RotatE with OGB evaluator. It is based on this [repository](https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding).
6 |
7 | ## Training & Evaluation
8 |
9 | ```
10 | # Run with default config
11 | bash examples.sh
12 | ```
--------------------------------------------------------------------------------
/examples/linkproppred/wikikg2/examples.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | # 100 dimension
5 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
6 | --model TransE -n 128 -b 512 -d 100 -g 30 -a 1.0 -adv \
7 | -lr 0.0001 --max_steps 200000 --cpu_num 2 --test_batch_size 32
8 |
9 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
10 | --model DistMult -n 128 -b 512 -d 100 -g 500 -a 1.0 -adv \
11 | -lr 0.001 --max_steps 200000 --cpu_num 2 --test_batch_size 32 -r 0.000002
12 |
13 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
14 | --model RotatE -n 128 -b 512 -d 50 -g 5 -a 1.0 -adv \
15 | -lr 0.0001 --max_steps 200000 --cpu_num 2 --test_batch_size 32 -de
16 |
17 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
18 | --model ComplEx -n 128 -b 512 -d 50 -g 500 -a 1.0 -adv \
19 | -lr 0.001 --max_steps 200000 --cpu_num 2 --test_batch_size 32 -de -dr -r 0.000002
20 |
21 | # 500 dimension
22 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
23 | --model TransE -n 128 -b 512 -d 500 -g 30 -a 1.0 -adv \
24 | -lr 0.0001 --max_steps 200000 --cpu_num 2 --test_batch_size 32
25 |
26 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
27 | --model DistMult -n 128 -b 512 -d 500 -g 500 -a 1.0 -adv \
28 | -lr 0.001 --max_steps 200000 --cpu_num 2 --test_batch_size 32 -r 0.000002
29 |
30 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
31 | --model RotatE -n 128 -b 512 -d 250 -g 5 -a 1.0 -adv \
32 | -lr 0.0001 --max_steps 200000 --cpu_num 2 --test_batch_size 32 -de
33 |
34 | python run.py --do_train --cuda --do_valid --do_test --evaluate_train \
35 | --model ComplEx -n 128 -b 512 -d 250 -g 500 -a 1.0 -adv \
36 | -lr 0.001 --max_steps 200000 --cpu_num 2 --test_batch_size 32 -de -dr -r 0.000002
37 |
--------------------------------------------------------------------------------
/examples/lsc/mag240m/correct_and_smooth.py:
--------------------------------------------------------------------------------
1 | # NOTE: 256GB CPU memory required to run this script.
2 |
3 | import os.path as osp
4 | import time
5 | import argparse
6 |
7 | import torch
8 | import numpy as np
9 | from torch_sparse import SparseTensor
10 | from torch_geometric.nn import CorrectAndSmooth
11 | from torch_geometric.nn.conv.gcn_conv import gcn_norm
12 | from ogb.lsc import MAG240MDataset, MAG240MEvaluator
13 | from root import ROOT
14 |
if __name__ == '__main__':
    # Correct-and-Smooth post-processing of saved predictions on MAG240M.
    # The four options below are forwarded unchanged to the CorrectAndSmooth
    # constructor.
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_correction_layers', type=int, default=3)
    parser.add_argument('--correction_alpha', type=float, default=1.0)
    parser.add_argument('--num_smoothing_layers', type=int, default=2)
    parser.add_argument('--smoothing_alpha', type=float, default=0.8)
    args = parser.parse_args()
    print(args)

    dataset = MAG240MDataset(ROOT)
    evaluator = MAG240MEvaluator()

    # Base predictions are read from a fixed relative path; they must have
    # been written there beforehand.
    t = time.perf_counter()
    print('Reading graph-agnostic predictions...', end=' ', flush=True)
    y_pred = torch.from_numpy(np.load('results/cs/pred.npy'))
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # Two-level on-disk cache inside the dataset directory:
    #   * `path`     - symmetric adjacency after gcn_norm (used directly)
    #   * `path_sym` - symmetric adjacency before normalization
    # Whichever is missing is computed and saved for later runs.
    t = time.perf_counter()
    print('Reading adjacency matrix...', end=' ', flush=True)
    path = f'{dataset.dir}/paper_to_paper_symmetric_gcn.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        path_sym = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if osp.exists(path_sym):
            adj_t = torch.load(path_sym)
        else:
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            # NOTE(review): is_sorted=True assumes the stored edge list is
            # already ordered as SparseTensor expects - confirm.
            adj_t = SparseTensor(
                row=edge_index[0], col=edge_index[1],
                sparse_sizes=(dataset.num_papers, dataset.num_papers),
                is_sorted=True)
            adj_t = adj_t.to_symmetric()
            torch.save(adj_t, path_sym)
        adj_t = gcn_norm(adj_t, add_self_loops=True)
        torch.save(adj_t, path)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    train_idx = torch.from_numpy(dataset.get_idx_split('train'))
    valid_idx = torch.from_numpy(dataset.get_idx_split('valid'))
    test_idx = torch.from_numpy(dataset.get_idx_split('test-dev'))

    # Labels are only gathered for the train and valid splits.
    y_train = torch.from_numpy(dataset.paper_label[train_idx]).to(torch.long)
    y_valid = torch.from_numpy(dataset.paper_label[valid_idx]).to(torch.long)

    model = CorrectAndSmooth(args.num_correction_layers, args.correction_alpha,
                             args.num_smoothing_layers, args.smoothing_alpha,
                             autoscale=True)

    # Both steps receive the training labels and indices and overwrite y_pred.
    t = time.perf_counter()
    print('Correcting predictions...', end=' ', flush=True)
    y_pred = model.correct(y_pred, y_train, train_idx, adj_t)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    t = time.perf_counter()
    print('Smoothing predictions...', end=' ', flush=True)
    y_pred = model.smooth(y_pred, y_train, train_idx, adj_t)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # argmax over the last dimension turns per-class scores into class ids.
    train_acc = evaluator.eval({
        'y_true': y_train,
        'y_pred': y_pred[train_idx].argmax(dim=-1)
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': y_valid,
        'y_pred': y_pred[valid_idx].argmax(dim=-1)
    })['acc']
    print(f'Train: {train_acc:.4f}, Valid: {valid_acc:.4f}')

    # Predicted classes for the test-dev split go to the submission file.
    res = {'y_pred': y_pred[test_idx].argmax(dim=-1)}
    evaluator.save_test_submission(res, 'results/cs', mode = 'test-dev')
87 |
--------------------------------------------------------------------------------
/examples/lsc/mag240m/label_prop.py:
--------------------------------------------------------------------------------
1 | # NOTE: More than 256GB CPU memory required to run this script.
2 | # Use `--low-memory` to reduce memory consumption by using half-precision
3 |
4 | import os.path as osp
5 | import time
6 | import argparse
7 |
8 | import torch
9 | import torch.nn.functional as F
10 | from torch_sparse import SparseTensor
11 | from torch_geometric.nn import LabelPropagation
12 | from torch_geometric.nn.conv.gcn_conv import gcn_norm
13 | from ogb.lsc import MAG240MDataset, MAG240MEvaluator
14 | from root import ROOT
15 |
if __name__ == '__main__':
    # Label propagation baseline on MAG240M: spread the training labels over
    # the symmetric paper-citation graph and evaluate the propagated labels.
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--alpha', type=float, default=0.9)
    parser.add_argument('--low-memory', action='store_true')
    args = parser.parse_args()
    print(args)

    dataset = MAG240MDataset(ROOT)
    evaluator = MAG240MEvaluator()

    # The symmetric adjacency is cached (before normalization) inside the
    # dataset directory; normalization is redone on every run.
    t = time.perf_counter()
    print('Reading adjacency matrix...', end=' ', flush=True)
    path = f'{dataset.dir}/paper_to_paper_symmetric.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        edge_index = dataset.edge_index('paper', 'cites', 'paper')
        edge_index = torch.from_numpy(edge_index)
        # NOTE(review): is_sorted=True assumes the stored edge list is
        # already ordered as SparseTensor expects - confirm.
        adj_t = SparseTensor(
            row=edge_index[0], col=edge_index[1],
            sparse_sizes=(dataset.num_papers, dataset.num_papers),
            is_sorted=True)
        adj_t = adj_t.to_symmetric()
        torch.save(adj_t, path)
    adj_t = gcn_norm(adj_t, add_self_loops=False)
    if args.low_memory:
        adj_t = adj_t.to(torch.half)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    train_idx = dataset.get_idx_split('train')
    valid_idx = dataset.get_idx_split('valid')
    test_idx = dataset.get_idx_split('test-dev')

    y_train = torch.from_numpy(dataset.paper_label[train_idx]).to(torch.long)
    y_valid = torch.from_numpy(dataset.paper_label[valid_idx]).to(torch.long)

    model = LabelPropagation(args.num_layers, args.alpha)

    N, C = dataset.num_papers, dataset.num_classes

    # Start from an all-zero (N, C) matrix holding one-hot rows for the
    # training papers only, then propagate.
    t = time.perf_counter()
    print('Propagating labels...', end=' ', flush=True)
    if args.low_memory:
        y = torch.zeros(N, C, dtype=torch.half)
        y[train_idx] = F.one_hot(y_train, C).to(torch.half)
        # NOTE(review): the identity post_step presumably bypasses the
        # model's default post-processing in half precision - confirm.
        out = model(y, adj_t, post_step=lambda x: x)
    else:
        y = torch.zeros(N, C)
        y[train_idx] = F.one_hot(y_train, C).to(torch.float)
        out = model(y, adj_t)
    y_pred = out.argmax(dim=-1)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    train_acc = evaluator.eval({
        'y_true': y_train,
        'y_pred': y_pred[train_idx]
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': y_valid,
        'y_pred': y_pred[valid_idx]
    })['acc']
    print(f'Train: {train_acc:.4f}, Valid: {valid_acc:.4f}')

    # Predicted classes for the test-dev split go to the submission file.
    res = {'y_pred': y_pred[test_idx]}
    evaluator.save_test_submission(res, 'results/label_prop', mode='test-dev')
83 |
--------------------------------------------------------------------------------
/examples/lsc/mag240m/preprocess_sgc.py:
--------------------------------------------------------------------------------
1 | # NOTE: 128-256GB CPU memory required to run this script.
2 |
3 | import os
4 | import time
5 | import argparse
6 | import os.path as osp
7 | from tqdm.auto import tqdm
8 |
9 | import torch
10 | import numpy as np
11 | from torch_sparse import SparseTensor
12 | from torch_geometric.nn.conv.gcn_conv import gcn_norm
13 | from ogb.lsc import MAG240MDataset
14 | from root import ROOT
15 |
if __name__ == '__main__':
    # Pre-compute propagated paper features (adj_t applied 1..num_layers
    # times) for the train/valid/test-dev nodes and store them as .npy files
    # inside `dataset.dir`.
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_layers', type=int, default=3)
    args = parser.parse_args()
    print(args)

    dataset = MAG240MDataset(ROOT)

    t = time.perf_counter()
    print('Reading adjacency matrix...', end=' ', flush=True)
    # Two cache levels: the GCN-normalized matrix (with self-loops) and, as a
    # fallback, the plain symmetric matrix. Whatever is missing gets built
    # and saved for later runs.
    path = f'{dataset.dir}/paper_to_paper_symmetric_gcn.pt'
    if osp.exists(path):
        adj_t = torch.load(path)
    else:
        path_sym = f'{dataset.dir}/paper_to_paper_symmetric.pt'
        if osp.exists(path_sym):
            adj_t = torch.load(path_sym)
        else:
            edge_index = dataset.edge_index('paper', 'cites', 'paper')
            edge_index = torch.from_numpy(edge_index)
            adj_t = SparseTensor(
                row=edge_index[0], col=edge_index[1],
                sparse_sizes=(dataset.num_papers, dataset.num_papers),
                is_sorted=True)
            adj_t = adj_t.to_symmetric()
            torch.save(adj_t, path_sym)
        adj_t = gcn_norm(adj_t, add_self_loops=True)
        torch.save(adj_t, path)
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    # Output files are named after the dict key; the 'test' files hold the
    # 'test-dev' split.
    split_idx = {
        'train': dataset.get_idx_split('train'),
        'valid': dataset.get_idx_split('valid'),
        'test': dataset.get_idx_split('test-dev'),
    }
    num_features = dataset.num_paper_features

    chunk_size = 128  # number of feature columns propagated at a time
    chunk_starts = range(0, num_features, chunk_size)
    hops = range(1, args.num_layers + 1)

    # Count the chunks actually iterated, so the total stays correct when
    # `num_features` is not a multiple of `chunk_size` (floor division would
    # miss the final partial chunk).
    pbar = tqdm(total=len(hops) * len(chunk_starts))
    pbar.set_description('Pre-processing node features')

    for j in chunk_starts:  # Run spmm in column-wise chunks...
        x = dataset.paper_feat[:, j:min(j + chunk_size, num_features)]
        x = torch.from_numpy(x.astype(np.float32))

        for i in hops:
            # After this product, `x` holds the chunk propagated i times.
            x = adj_t @ x
            for split, idx in split_idx.items():
                np.save(f'{dataset.dir}/x_{split}_{i}_{j}.npy', x[idx].numpy())
            pbar.update(1)
    pbar.close()

    t = time.perf_counter()
    print('Merging node features...', end=' ', flush=True)
    for i in hops:
        for split in split_idx:
            # Stitch the column chunks back together along the feature axis.
            chunks = [
                np.load(f'{dataset.dir}/x_{split}_{i}_{j}.npy')
                for j in chunk_starts
            ]
            np.save(f'{dataset.dir}/x_{split}_{i}.npy',
                    np.concatenate(chunks, axis=-1))
    print(f'Done! [{time.perf_counter() - t:.2f}s]')

    t = time.perf_counter()
    print('Cleaning up...', end=' ', flush=True)
    # Remove the per-chunk intermediates now that the merged files exist.
    for i in hops:
        for j in chunk_starts:
            for split in split_idx:
                os.remove(f'{dataset.dir}/x_{split}_{i}_{j}.npy')
    print(f'Done! [{time.perf_counter() - t:.2f}s]')
90 |
--------------------------------------------------------------------------------
/examples/lsc/mag240m/root.py:
--------------------------------------------------------------------------------
import os

# Root directory handed to `MAG240MDataset(ROOT)` by the scripts in this
# folder. The default is the original hard-coded, machine-specific path; set
# the `MAG240M_ROOT` environment variable to use a different location without
# editing this file.
ROOT = os.environ.get('MAG240M_ROOT', '/dfs/user/weihuahu/ogb/ogb/lsc/dataset')
2 |
--------------------------------------------------------------------------------
/examples/lsc/pcqm4m-v2/README.md:
--------------------------------------------------------------------------------
1 | # Baseline code for PCQM4Mv2
2 |
3 | - Please refer to the **[OGB-LSC paper](https://arxiv.org/abs/2103.09430)** for the detailed setting.
4 | - Baseline code based on **[DGL](https://www.dgl.ai/)** is available **[here](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/PCQM4M)**.
5 |
6 | ## Installation requirements
7 | ```
8 | ogb>=1.3.2
9 | rdkit>=2021.03.1
10 | torch>=1.7.0
11 | ```
12 |
13 | ## Basic commandline arguments
14 | - `LOG_DIR`: Tensorboard log directory.
15 | - `CHECKPOINT_DIR`: Directory to save the best validation checkpoint. The checkpoint file will be saved at `${CHECKPOINT_DIR}/checkpoint.pt`.
16 | - `TEST_DIR`: Directory path to save the test submission. The test file will be saved at `${TEST_DIR}/y_pred_pcqm4mv2.npz`.
17 |
18 | ## Baseline models
19 |
20 | ### GIN [1]
21 | ```bash
22 | python main_gnn.py --gnn gin --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
23 | ```
24 |
25 | ### GIN-virtual [1,3]
26 | ```bash
27 | python main_gnn.py --gnn gin-virtual --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
28 | ```
29 |
30 | ### GCN [2]
31 | ```bash
32 | python main_gnn.py --gnn gcn --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
33 | ```
34 |
35 | ### GCN-virtual [2,3]
36 | ```bash
37 | python main_gnn.py --gnn gcn-virtual --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
38 | ```
39 |
40 | ### MLP + Morgan fingerprint baseline [4]
41 | ```bash
42 | python main_mlpfp.py --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
43 | ```
44 |
45 | ## Measuring the Test Inference Time
46 | The code below takes **the raw SMILES strings as input**, uses the saved checkpoint, and performs inference over all the 147,037 test-dev (147,432 test-challenge) molecules.
47 | ```bash
48 | python test_inference_gnn.py --gnn $GNN --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
49 | ```
50 |
51 | For GIN-virtual, the total inference (from SMILES strings to target values) takes around 1 minute on a single GeForce RTX 2080 GPU and an Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz.
52 | For your model, **the total inference time needs to be less than 4 hours on a single GPU and a CPU**. Ideally, you should use the GPU/CPU with the same spec as ours. However, we also allow the use of other GPU/CPU specs, as long as the specs are clearly reported in the final submission.
53 |
54 | ## Performance
55 |
56 | | Model |Valid MAE | Test-dev MAE* | \#Parameters | Hardware |
57 | |:------------------ |:-------------- |:---------------| --------------:|----------|
58 | | GIN | 0.1195 | 0.1218 | 3.8M | GeForce RTX 2080 (11GB GPU) |
59 | | GIN-virtual | 0.1083 | 0.1084 | 6.7M | GeForce RTX 2080 (11GB GPU) |
60 | | GCN | 0.1379 | 0.1398 | 2.0M | GeForce RTX 2080 (11GB GPU) |
61 | | GCN-virtual | 0.1153 | 0.1152 | 4.9M | GeForce RTX 2080 (11GB GPU) |
62 | | MLP+Fingerprint | 0.1753 | 0.1760 | 16.1M | GeForce RTX 2080 (11GB GPU) |
63 |
64 | \* Test MAE is evaluated on the **hidden test-dev set.**
65 |
66 | ## 3D graphs
67 |
68 | We further provide the equilibrium 3D graph structure for training molecules. The zipped folder can be downloaded **[here](http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2_xyz.zip)** (2.7GB). The folder contains the xyz coordinate files of all the training molecules. For `i`-th molecule, the corresponding xyz file is `i.xyz`, e.g., xyz file of the 1234-th molecule is named `1234.xyz`. The community should feel free to exploit 3D structural information to improve their model performance. Note that 3D information is *not* provided for validation and test molecules, and test-time inference needs to be performed without explicit 3D information.
69 |
70 | ## References
71 | [1] Xu, K., Hu, W., Leskovec, J., & Jegelka, S. (2019). How powerful are graph neural networks?. ICLR 2019
72 |
73 | [2] Kipf, T. N., & Welling, M. (2017). Semi-supervised classification with graph convolutional networks. ICLR 2017
74 |
75 | [3] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. Neural message passing for quantum chemistry. ICML 2017.
76 |
77 | [4] Morgan, Harry L. "The generation of a unique machine description for chemical structures-a technique developed at chemical abstracts service." Journal of Chemical Documentation 5.2 (1965): 107-113.
78 |
--------------------------------------------------------------------------------
/examples/lsc/pcqm4m-v2/gnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch_geometric.nn import MessagePassing
3 | from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
4 | import torch.nn.functional as F
5 | from torch_geometric.nn.inits import uniform
6 |
7 | from conv import GNN_node, GNN_node_Virtualnode
8 |
class GNN(torch.nn.Module):
    '''
        Graph-level predictor: a node-embedding GNN, a graph pooling
        operator, and a final linear layer.
    '''

    def __init__(self, num_tasks = 1, num_layers = 5, emb_dim = 300,
                    gnn_type = 'gin', virtual_node = True, residual = False, drop_ratio = 0, JK = "last", graph_pooling = "sum"):
        '''
            num_tasks (int): number of labels to be predicted
                (output width of the final linear layer)
            num_layers (int): number of GNN layers; must be at least 2
            emb_dim (int): embedding width used by the node encoder and pooling
            gnn_type, residual, drop_ratio, JK: forwarded unchanged to the
                node encoder
            virtual_node (bool): whether to add virtual node or not
            graph_pooling (str): one of "sum", "mean", "max", "attention",
                "set2set"

            Raises ValueError when num_layers < 2 or graph_pooling is unknown.
        '''
        super(GNN, self).__init__()

        self.num_layers = num_layers
        self.drop_ratio = drop_ratio
        self.JK = JK
        self.emb_dim = emb_dim
        self.num_tasks = num_tasks
        self.graph_pooling = graph_pooling

        if self.num_layers < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        ### Node encoder, with or without the virtual node
        node_encoder_cls = GNN_node_Virtualnode if virtual_node else GNN_node
        self.gnn_node = node_encoder_cls(num_layers, emb_dim, JK = JK, drop_ratio = drop_ratio, residual = residual, gnn_type = gnn_type)

        ### Pooling operator that turns node embeddings into one vector per graph
        parameter_free_pools = {
            "sum": global_add_pool,
            "mean": global_mean_pool,
            "max": global_max_pool,
        }
        if self.graph_pooling in parameter_free_pools:
            self.pool = parameter_free_pools[self.graph_pooling]
        elif self.graph_pooling == "attention":
            gate_nn = torch.nn.Sequential(
                torch.nn.Linear(emb_dim, emb_dim),
                torch.nn.BatchNorm1d(emb_dim),
                torch.nn.ReLU(),
                torch.nn.Linear(emb_dim, 1))
            self.pool = GlobalAttention(gate_nn = gate_nn)
        elif self.graph_pooling == "set2set":
            self.pool = Set2Set(emb_dim, processing_steps = 2)
        else:
            raise ValueError("Invalid graph pooling type.")

        ### The prediction head takes 2*emb_dim inputs for set2set pooling
        ### and emb_dim inputs for every other pooling type
        head_in_dim = 2*self.emb_dim if graph_pooling == "set2set" else self.emb_dim
        self.graph_pred_linear = torch.nn.Linear(head_in_dim, self.num_tasks)

    def forward(self, batched_data):
        '''
            Encode nodes, pool them per graph using batched_data.batch, and
            apply the linear prediction layer.

            In training mode the raw output is returned; otherwise the
            output is clamped to the range [0, 20].
        '''
        node_emb = self.gnn_node(batched_data)
        graph_emb = self.pool(node_emb, batched_data.batch)
        prediction = self.graph_pred_linear(graph_emb)

        if not self.training:
            # At inference time, we clamp the value between 0 and 20
            return torch.clamp(prediction, min=0, max=20)
        return prediction


if __name__ == '__main__':
    GNN(num_tasks = 10)
70 |
--------------------------------------------------------------------------------
/examples/lsc/pcqm4m/README.md:
--------------------------------------------------------------------------------
1 | # Baseline code for PCQM4M
2 |
3 | ## **Important:** This dataset has been deprecated. Please use the updated version, PCQM4Mv2.
4 |
5 | - Please refer to the **[OGB-LSC paper](https://arxiv.org/abs/2103.09430)** for the detailed setting.
6 | - Baseline code based on **[DGL](https://www.dgl.ai/)** is available **[here](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/PCQM4M)**.
7 |
8 | ## Installation requirements
9 | ```
10 | ogb>=1.3.0
11 | rdkit>=2019.03.1
12 | torch>=1.7.0
13 | ```
14 |
15 | ## Basic commandline arguments
16 | - `LOG_DIR`: Tensorboard log directory.
17 | - `CHECKPOINT_DIR`: Directory to save the best validation checkpoint. The checkpoint file will be saved at `${CHECKPOINT_DIR}/checkpoint.pt`.
18 | - `TEST_DIR`: Directory path to save the test submission. The test file will be saved at `${TEST_DIR}/y_pred_pcqm4m.npz`.
19 |
20 | ## Baseline models
21 |
22 | ### GIN [1]
23 | ```bash
24 | python main_gnn.py --gnn gin --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
25 | ```
26 |
27 | ### GIN-virtual [1,3]
28 | ```bash
29 | python main_gnn.py --gnn gin-virtual --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
30 | ```
31 |
32 | ### GCN [2]
33 | ```bash
34 | python main_gnn.py --gnn gcn --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
35 | ```
36 |
37 | ### GCN-virtual [2,3]
38 | ```bash
39 | python main_gnn.py --gnn gcn-virtual --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
40 | ```
41 |
42 | ### MLP + Morgan fingerprint baseline [4]
43 | ```bash
44 | python main_mlpfp.py --log_dir $LOG_DIR --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
45 | ```
46 |
47 | ## Measuring the Test Inference Time
48 | The code below takes **the raw SMILES strings as input**, uses the saved checkpoint, and performs inference over all the 377,423 test molecules.
49 | ```bash
50 | python test_inference_gnn.py --gnn $GNN --checkpoint_dir $CHECKPOINT_DIR --save_test_dir $TEST_DIR
51 | ```
52 |
53 | For GIN-virtual, the total inference (from SMILES strings to target values) takes around 3 minutes on a single GeForce RTX 2080 GPU and an Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz.
54 | For your model, **the total inference time needs to be less than 12 hours on a single GPU and a CPU**. Ideally, you should use the GPU/CPU with the same spec as ours. However, we also allow the use of other GPU/CPU specs, as long as the specs are clearly reported in the final submission.
55 |
56 | ## Performance
57 |
58 | | Model |Valid MAE | Test MAE* | \#Parameters | Hardware |
59 | |:------------------ |:-------------- |:---------------| --------------:|----------|
60 | | GIN | 0.1536 | 0.1678 | 3.8M | GeForce RTX 2080 (11GB GPU) |
61 | | GIN-virtual | 0.1396 | 0.1487 | 6.7M | GeForce RTX 2080 (11GB GPU) |
62 | | GCN | 0.1684 | 0.1838 | 2.0M | GeForce RTX 2080 (11GB GPU) |
63 | | GCN-virtual | 0.1510 | 0.1579 | 4.9M | GeForce RTX 2080 (11GB GPU) |
64 | | MLP+Fingerprint | 0.2044 | 0.2068 | 16.1M | GeForce RTX 2080 (11GB GPU) |
65 |
66 | \* Test MAE is evaluated on the **hidden test set.**
67 |
68 | ## References
69 | [1] Xu, K., Hu, W., Leskovec, J., & Jegelka, S. (2019). How powerful are graph neural networks?. ICLR 2019
70 |
71 | [2] Kipf, T. N., & Welling, M. (2017). Semi-supervised classification with graph convolutional networks. ICLR 2017
72 |
73 | [3] Gilmer, J., Schoenholz, S. S., Riley, P. F., Vinyals, O., & Dahl, G. E. Neural message passing for quantum chemistry. ICML 2017.
74 |
75 | [4] Morgan, Harry L. "The generation of a unique machine description for chemical structures-a technique developed at chemical abstracts service." Journal of Chemical Documentation 5.2 (1965): 107-113.
76 |
--------------------------------------------------------------------------------
/examples/lsc/pcqm4m/gnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch_geometric.nn import MessagePassing
3 | from torch_geometric.nn import global_add_pool, global_mean_pool, global_max_pool, GlobalAttention, Set2Set
4 | import torch.nn.functional as F
5 | from torch_geometric.nn.inits import uniform
6 |
7 | from conv import GNN_node, GNN_node_Virtualnode
8 |
class GNN(torch.nn.Module):
    '''
        Graph-level predictor: a node-embedding GNN, a graph pooling
        operator, and a final linear layer.
    '''

    def __init__(self, num_tasks = 1, num_layers = 5, emb_dim = 300,
                    gnn_type = 'gin', virtual_node = True, residual = False, drop_ratio = 0, JK = "last", graph_pooling = "sum"):
        '''
            num_tasks (int): number of labels to be predicted
                (output width of the final linear layer)
            num_layers (int): number of GNN layers; must be at least 2
            emb_dim (int): embedding width used by the node encoder and pooling
            gnn_type, residual, drop_ratio, JK: forwarded unchanged to the
                node encoder
            virtual_node (bool): whether to add virtual node or not
            graph_pooling (str): one of "sum", "mean", "max", "attention",
                "set2set"

            Raises ValueError when num_layers < 2 or graph_pooling is unknown.
        '''
        super(GNN, self).__init__()

        self.num_layers = num_layers
        self.drop_ratio = drop_ratio
        self.JK = JK
        self.emb_dim = emb_dim
        self.num_tasks = num_tasks
        self.graph_pooling = graph_pooling

        if self.num_layers < 2:
            raise ValueError("Number of GNN layers must be greater than 1.")

        ### Node encoder, with or without the virtual node
        node_encoder_cls = GNN_node_Virtualnode if virtual_node else GNN_node
        self.gnn_node = node_encoder_cls(num_layers, emb_dim, JK = JK, drop_ratio = drop_ratio, residual = residual, gnn_type = gnn_type)

        ### Pooling operator that turns node embeddings into one vector per graph
        parameter_free_pools = {
            "sum": global_add_pool,
            "mean": global_mean_pool,
            "max": global_max_pool,
        }
        if self.graph_pooling in parameter_free_pools:
            self.pool = parameter_free_pools[self.graph_pooling]
        elif self.graph_pooling == "attention":
            gate_nn = torch.nn.Sequential(
                torch.nn.Linear(emb_dim, emb_dim),
                torch.nn.BatchNorm1d(emb_dim),
                torch.nn.ReLU(),
                torch.nn.Linear(emb_dim, 1))
            self.pool = GlobalAttention(gate_nn = gate_nn)
        elif self.graph_pooling == "set2set":
            self.pool = Set2Set(emb_dim, processing_steps = 2)
        else:
            raise ValueError("Invalid graph pooling type.")

        ### The prediction head takes 2*emb_dim inputs for set2set pooling
        ### and emb_dim inputs for every other pooling type
        head_in_dim = 2*self.emb_dim if graph_pooling == "set2set" else self.emb_dim
        self.graph_pred_linear = torch.nn.Linear(head_in_dim, self.num_tasks)

    def forward(self, batched_data):
        '''
            Encode nodes, pool them per graph using batched_data.batch, and
            apply the linear prediction layer.

            In training mode the raw output is returned; otherwise the
            output is clamped to the range [0, 50].
        '''
        node_emb = self.gnn_node(batched_data)
        graph_emb = self.pool(node_emb, batched_data.batch)
        prediction = self.graph_pred_linear(graph_emb)

        if not self.training:
            # At inference time, we clamp the value between 0 and 50
            return torch.clamp(prediction, min=0, max=50)
        return prediction


if __name__ == '__main__':
    GNN(num_tasks = 10)
70 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m-v2/README.md:
--------------------------------------------------------------------------------
1 | # Baseline code for WikiKG90Mv2
2 |
3 | - Please refer to the **[OGB-LSC paper](https://arxiv.org/abs/2103.09430)** for the detailed setting.
4 | - We use the [SMORE](https://arxiv.org/abs/2110.14890) framework. Please refer to [here](https://github.com/google-research/smore/blob/wikikgv2/README_wikikgv2.md) for the code and instructions.
5 |
6 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/README.md:
--------------------------------------------------------------------------------
1 | # Baseline code for WikiKG90M
2 |
3 | ## **Important:** This dataset has been deprecated. Please use the updated version, WikiKG90Mv2.
4 |
5 | Please refer to the **[OGB-LSC paper](https://arxiv.org/abs/2103.09430)** for the detailed setting.
6 |
7 | ## Installation requirements
8 | ```
9 | ogb>=1.3.0
10 | torch>=1.7.0
11 | dgl==0.4.3
12 | ```
13 | In addition, please install dgl-ke-ogb-lsc by running `cd dgl-ke-ogb-lsc/python` and then `pip install -e .`
14 |
15 | ### Acknowledgement
16 | Our implementation is based on [DGL-KE](https://github.com/awslabs/dgl-ke).
17 |
18 | ## Key commandline arguments
19 | - `model_name`: Decoder model. Choose from [`TransE_l2`, `ComplEx`].
20 | - `encoder_model_name`: Encoder model. Choose from [`shallow`, `roberta`, `concat`].
21 | - `data_path`: Directory that downloads and stores the dataset.
22 | - `save_path`: Directory that saves model and prediction file.
23 |
24 | ## Baseline models
25 | - TransE-Shallow [1]
26 | - TransE-RoBERTa [1,3]
27 | - TransE-Concat [1,3]
28 | - ComplEx-Shallow [2]
29 | - ComplEx-RoBERTa [2,3]
30 | - ComplEx-Concat [2,3]
31 |
32 | All the scripts for the baseline models can be found in [`run.sh`](https://github.com/snap-stanford/ogb/blob/master/examples/lsc/wikikg90m/run.sh).
33 |
34 | ## Saving Test Submission
35 | After training models using the script, prediction files will be dumped under `$SAVE_PATH`. The prediction files are named in the format `[valid/test]_$PROCID_$STEP`; e.g., `test_0_99999.pkl` and `test_1_99999.pkl` are the test prediction files generated at step 99999 by training on two GPUs. Then please use the following code to save the test submission file based on the best validation performance.
36 | (`$NUM_PROC` represents the number of GPUs used to train the model, in the example above, set `$NUM_PROC` to 2)
37 | ```
38 | python save_test_submission.py $SAVE_PATH $NUM_PROC
39 | ```
40 | This will save the test submission file at `${SAVE_PATH}/t_pred_wikikg90m.npz`.
41 |
42 | ## Performance
43 |
44 | | Model |Valid MRR | Test MRR* | \#Parameters | Hardware |
45 | |:------------------ |:-------------- |:---------------| --------------:|----------|
46 | | TransE-Shallow | 0.7559 | 0.7412 | 17.4B | Tesla P100 (16GB GPU) |
47 | | ComplEx-Shallow | 0.6142 | 0.5883 | 17.4B | Tesla P100 (16GB GPU) |
48 | | TransE-RoBERTa | 0.6039 | 0.6288 | 0.3M | Tesla P100 (16GB GPU) |
49 | | ComplEx-RoBERTa | 0.7052 | 0.7186 | 0.3M | Tesla P100 (16GB GPU) |
50 | | TransE-Concat | 0.8494 | 0.8548 | 17.4B | Tesla P100 (16GB GPU) |
51 | | ComplEx-Concat | 0.8425 | 0.8637 | 17.4B | Tesla P100 (16GB GPU) |
52 |
53 | \* Test MRR is evaluated on the **hidden test set.**
54 |
55 |
56 | ## References
57 | [1] Bordes, A., Usunier, N., Garcia-Duran, A., Weston, J., & Yakhnenko, O. (2013). Translating embeddings for modeling multi-relational data. NeurIPS 2013
58 |
59 | [2] Trouillon, T., Welbl, J., Riedel, S., Gaussier, É., & Bouchard, G. (2016). Complex embeddings for simple link prediction. ICML 2016
60 |
61 | [3] Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L. & Stoyanov, V. (2019). RoBERTa: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692.
62 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/.gitignore:
--------------------------------------------------------------------------------
1 | ckpts/
2 | old-ckpts/
3 | # IDE
4 | .idea
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # SageMath parsed files
87 | *.sage.py
88 |
89 | # Environments
90 | .env
91 | .venv
92 | env/
93 | venv/
94 | ENV/
95 | env.bak/
96 | venv.bak/
97 |
98 | # Spyder project settings
99 | .spyderproject
100 | .spyproject
101 |
102 | # Rope project settings
103 | .ropeproject
104 |
105 | # mkdocs documentation
106 | /site
107 |
108 | # mypy
109 | .mypy_cache/
110 |
111 | examples/pytorch/data/ind.pubmed.y
112 | examples/pytorch/data/ind.pubmed.x
113 | examples/pytorch/data/ind.pubmed.ty
114 | examples/pytorch/data/ind.pubmed.tx
115 | examples/pytorch/data/ind.pubmed.test.index
116 | examples/pytorch/data/ind.pubmed.graph
117 | examples/pytorch/data/ind.pubmed.ally
118 | examples/pytorch/data/ind.pubmed.allx
119 | examples/pytorch/data/ind.cora.y
120 | examples/pytorch/data/ind.cora.x
121 | examples/pytorch/data/ind.cora.ty
122 | examples/pytorch/data/ind.cora.tx
123 | examples/pytorch/data/ind.cora.test.index
124 | examples/pytorch/data/ind.cora.graph
125 | examples/pytorch/data/ind.cora.ally
126 | examples/pytorch/data/ind.cora.allx
127 | examples/pytorch/data/ind.citeseer.y
128 | examples/pytorch/data/ind.citeseer.x
129 | examples/pytorch/data/ind.citeseer.ty
130 | examples/pytorch/data/ind.citeseer.tx
131 | examples/pytorch/data/ind.citeseer.test.index
132 | examples/pytorch/data/ind.citeseer.graph
133 | examples/pytorch/data/ind.citeseer.ally
134 | examples/pytorch/data/ind.citeseer.allx
135 | examples/pytorch/.DS_Store
136 | examples/.DS_Store
137 | examples/pytorch/generative_graph/*.p
138 | .DS_Store
139 |
140 | # data directory
141 | _download
142 |
143 | # CTags & CScope
144 | tags
145 | cscope.*
146 |
147 | # Vim
148 | *.swp
149 | *.swo
150 | *.un~
151 | *~
152 |
153 | # parameters
154 | *.params
155 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 | ## Finding contributions to work on
43 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
44 |
45 | ## Code of Conduct
46 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
47 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
48 | opensource-codeofconduct@amazon.com with any additional questions or comments.
49 |
50 |
51 | ## Security issue notifications
52 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
53 |
54 |
55 | ## Licensing
56 |
57 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
58 |
59 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
60 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | # Contributors of DGL-KE
2 |
3 | * [Zhichen Jiang](https://github.com/sherry-1001): Add a profiler to MXNet KGE models.
4 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/Jenkinsfile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env groovy
2 |
// Reset the current directory and check out the sources for this build.
def init_git() {
  // Delete everything matched by `*` in the current directory before the checkout.
  sh "rm -rf *"
  checkout scm
  // Initialise and update all git submodules, including nested ones.
  sh "git submodule update --recursive --init"
}
8 |
// Run the knowledge-graph test script for one backend/device combination.
//   backend: first argument passed to task_kg_test.sh (the pipeline passes "pytorch" or "mxnet")
//   dev:     second argument passed to task_kg_test.sh (the pipeline passes "cpu" or "gpu")
def kg_test_linux(backend, dev) {
  init_git()
  // The test script is given at most 20 minutes.
  timeout(time: 20, unit: 'MINUTES') {
    sh "bash tests/scripts/task_kg_test.sh ${backend} ${dev}"
  }
}
15 |
// Declarative pipeline: a lint stage, then an "App" stage that runs the
// CPU and GPU knowledge-graph tests in parallel.
pipeline {
  agent any
  stages {
    // Runs the lint script inside the dgllib/dgl-ci-lint image on a node labelled linux-cpu-node.
    stage("Lint Check") {
      agent {
        docker {
          label "linux-cpu-node"
          image "dgllib/dgl-ci-lint"
        }
      }
      steps {
        init_git()
        sh "bash tests/scripts/task_lint.sh"
      }
      post {
        // Clean the workspace regardless of the stage result.
        always {
          cleanWs disableDeferredWipeout: true, deleteDirs: true
        }
      }
    }
    stage("App") {
      parallel {
        // CPU tests: PyTorch backend first, then MXNet, in the same container.
        stage("Knowledge Graph CPU") {
          agent {
            docker {
              label "linux-cpu-node"
              image "dgllib/dgl-ci-cpu:conda"
            }
          }
          stages {
            stage("Torch test") {
              steps {
                kg_test_linux("pytorch", "cpu")
              }
            }
            stage("MXNet test") {
              steps {
                kg_test_linux("mxnet", "cpu")
              }
            }
          }
          post {
            // Clean the workspace regardless of the stage result.
            always {
              cleanWs disableDeferredWipeout: true, deleteDirs: true
            }
          }
        }
        // GPU tests: same two backends, in the GPU image started with the nvidia runtime.
        stage("Knowledge Graph GPU") {
          agent {
            docker {
              label "linux-gpu-node"
              image "dgllib/dgl-ci-gpu:conda"
              args "--runtime nvidia"
            }
          }
          stages {
            stage("Torch test") {
              steps {
                kg_test_linux("pytorch", "gpu")
              }
            }
            stage("MXNet test") {
              steps {
                kg_test_linux("mxnet", "gpu")
              }
            }
          }
          post {
            // Clean the workspace regardless of the stage result.
            always {
              cleanWs disableDeferredWipeout: true, deleteDirs: true
            }
          }
        }
      }
    }
  }
}
93 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/NOTICE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/README.md:
--------------------------------------------------------------------------------
1 | # DGL-KE
2 |
3 | [License](./LICENSE)
4 |
5 | [Documentation](https://dglke.dgl.ai/doc/)
6 |
7 | # This is an implementation of DGL-KE for OGB-LSC
8 |
9 | Check out [the original repo](https://github.com/awslabs/dgl-ke) for more details.
10 |
11 | ### Cite
12 |
13 | If you use DGL-KE in a scientific publication, we would appreciate citations to the following paper:
14 |
15 | ```bibtex
16 | @inproceedings{DGL-KE,
17 | author = {Zheng, Da and Song, Xiang and Ma, Chao and Tan, Zeyuan and Ye, Zihao and Dong, Jin and Xiong, Hao and Zhang, Zheng and Karypis, George},
18 | title = {DGL-KE: Training Knowledge Graph Embeddings at Scale},
19 | year = {2020},
20 | publisher = {Association for Computing Machinery},
21 | address = {New York, NY, USA},
22 | booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
23 | pages = {739–748},
24 | numpages = {10},
25 | series = {SIGIR '20}
26 | }
27 | ```
28 |
29 | ### License
30 |
31 | This project is licensed under the Apache-2.0 License.
32 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/conda/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/conda/README.md
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docker/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docker/README.md
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Compiled Object files
5 | *.slo
6 | *.lo
7 | *.o
8 | *.obj
9 |
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 |
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 |
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 |
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 |
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 |
34 | *.DS_Store
35 | *.un~
36 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | # Every target is forwarded to "$(SPHINXBUILD) -M <target>" by the catch-all rule at the bottom.
3 |
4 | # You can set these variables from the command line; they are passed to every sphinx-build invocation below.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/dgl_ke_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/dgl_ke_arch.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/dist_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/dist_train.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/distmult.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/distmult.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/kg_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/kg_example.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/kge_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/kge_scores.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/metis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/metis.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/multi-core.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/multi-core.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/multi-gpu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/multi-gpu.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/multi-gpu.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rescal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rescal.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rescal2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rescal2.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rescal3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rescal3.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rotate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/rotate.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/transe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/transe.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/transr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/transr.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/vs-gv-fb15k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/vs-gv-fb15k.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/vs-pbg-fb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/images/vs-pbg-fb.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation. The first argument is forwarded to sphinx-build as the -M target; with no argument the Sphinx help is shown.
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/commands.rst:
--------------------------------------------------------------------------------
1 | DGL-KE Command Lines
2 | ====================
3 |
4 | DGL-KE provides a set of command line tools to train knowledge graph embeddings and make predictions
5 | with the embeddings easily.
6 |
7 | .. toctree::
8 | :hidden:
9 | :maxdepth: 1
10 | :titlesonly:
11 |
12 | format_kg
13 | format_out
14 | train
15 | partition
16 | dist_train
17 | eval
18 | predict
19 | emb_sim
20 |
21 | Commands for Training
22 | ---------------------
23 |
24 | DGL-KE provides commands to support training on CPUs, GPUs in a single machine and a cluster of machines.
25 |
26 | ``dglke_train`` trains KG embeddings on CPUs or GPUs in a single machine and saves the trained node embeddings and relation embeddings on disks.
27 |
28 | ``dglke_dist_train`` trains knowledge graph embeddings on a cluster of machines. This command launches a set of processes to perform distributed training automatically.
29 |
30 | To support distributed training, DGL-KE provides a command to partition a knowledge graph before training.
31 |
32 | ``dglke_partition`` partitions the given knowledge graph into ``N`` parts by the METIS partition algorithm. Different partitions will be stored on different machines in distributed training. You can find more details about the METIS partition algorithm in this `link`__.
33 |
34 | .. __: http://glaros.dtc.umn.edu/gkhome/metis/metis/overview
35 |
36 | In addition, DGL-KE provides a command to evaluate the quality of pre-trained embeddings.
37 |
38 | ``dglke_eval`` reads the pre-trained embeddings and evaluates the quality of the embeddings with a link prediction task on the test set.
39 |
40 | Commands for Inference
41 | ----------------------
42 |
43 | DGL-KE supports two types of inference tasks using pretrained embeddings (we recommend using DGL-KE to generate these embeddings).
44 |
45 | * **Predicting entities/relations in a triplet** Given entities and/or relations, predict which entities or relations are likely to connect with the existing entities for given relations. For example, given a head entity and a relation, predict which entities are likely to connect to the head entity via the given relation.
46 | * **Finding similar embeddings** Given entity/relation embeddings, find the most similar entity/relation embeddings for some pre-defined similarity functions.
47 |
48 | The ranking result will be automatically stored in the output file (result.tsv by default) using the tsv format. DGL-KE provides two commands for the inference tasks:
49 |
50 | ``dglke_predict`` predicts missing entities/relations in triplets using the pre-trained embeddings.
51 |
52 | ``dglke_emb_sim`` computes similarity scores on the entity embeddings or relation embeddings.
53 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/eval.rst:
--------------------------------------------------------------------------------
1 | Evaluation on Pre-Trained Embeddings
2 | ====================================
3 |
4 | ``dglke_eval`` reads the pre-trained embeddings and evaluates the quality of the embeddings with a link prediction task on the test set.
5 |
6 | Arguments
7 | ---------
8 | The command line provides the following arguments:
9 |
10 | - ``--model_name {TransE, TransE_l1, TransE_l2, TransR, RESCAL, DistMult, ComplEx, RotatE}``
11 | The models provided by DGL-KE.
12 |
13 | - ``--data_path DATA_PATH``
14 | The path of the directory where DGL-KE loads knowledge graph data.
15 |
16 | - ``--dataset DATASET``
17 | The name of the knowledge graph stored under data_path. If it is one of the builtin knowledge graphs such as FB15k, DGL-KE will automatically download the knowledge graph and keep it under data_path.
18 |
19 | - ``--format FORMAT``
20 | The format of the dataset. For builtin knowledge graphs, the format is determined automatically. For users' own knowledge graphs, it needs to be ``raw_udd_{htr}`` or ``udd_{htr}``. ``raw_udd_`` indicates that the user's data uses **raw ID** for entities and relations and ``udd_`` indicates that the user's data uses **KGE ID**. ``{htr}`` indicates the location of the head entity, tail entity and relation in a triplet. For example, ``htr`` means the head entity is the first element in the triplet, the tail entity is the second element and the relation is the last element.
21 |
22 | - ``--data_files [DATA_FILES ...]``
23 | A list of data file names. This is used if users want to train KGE on their own datasets. If the format is *raw_udd_{htr}*, users need to provide *train_file* [*valid_file*] [*test_file*]. If the format is *udd_{htr}*, users need to provide *entity_file* *relation_file* *train_file* [*valid_file*] [*test_file*]. In both cases, *valid_file* and *test_file* are optional.
24 |
25 | - ``--delimiter DELIMITER``
26 | Delimiter used in data files. Note all files should use the same delimiter.
27 |
28 | - ``--model_path MODEL_PATH``
29 | The place where models are saved.
30 |
31 | - ``--batch_size_eval BATCH_SIZE_EVAL``
32 | Batch size used for eval and test
33 |
34 | - ``--neg_sample_size_eval NEG_SAMPLE_SIZE_EVAL``
35 | Negative sampling size for testing
36 |
37 | - ``--neg_deg_sample_eval``
38 | Negative sampling proportional to vertex degree for testing.
39 |
40 | - ``--hidden_dim HIDDEN_DIM``
41 | Hidden dim used by relation and entity
42 |
43 | - ``-g GAMMA`` or ``--gamma GAMMA``
44 | The margin value in the score function. It is used by *TransX* and *RotatE*.
45 |
46 | - ``--eval_percent EVAL_PERCENT``
47 | Randomly sample some percentage of edges for evaluation.
48 |
49 | - ``--no_eval_filter``
50 | Disable filter positive edges from randomly constructed negative edges for evaluation.
51 |
52 | - ``--gpu [GPU ...]``
53 | A list of gpu ids, e.g. 0 1 2 4
54 |
55 | - ``--mix_cpu_gpu``
56 | Training a knowledge graph embedding model with both CPUs and GPUs. The embeddings are stored in CPU memory and the training is performed in GPUs. This is usually used for training large knowledge graph embeddings.
57 |
58 | - ``-de`` or ``--double_ent``
59 | Double entity dim for complex number. It is used by *RotatE*.
60 |
61 | - ``-dr`` or ``--double_rel``
62 | Double relation dim for complex number.
63 |
64 | - ``--num_proc NUM_PROC``
65 | The number of processes to train the model in parallel. In multi-GPU training, the number of processes by default is set to match the number of GPUs. If set explicitly, the number of processes needs to be divisible by the number of GPUs.
66 |
67 | - ``--num_thread NUM_THREAD``
68 | The number of CPU threads to train the model in each process. This argument is used for multi-processing training.
69 |
70 |
71 | Examples
72 | --------
73 |
74 |
75 | The following command evaluates the pre-trained KG embedding on multi-cores::
76 |
77 | dglke_eval --model_name TransE_l2 --dataset FB15k --hidden_dim 400 --gamma 19.9 --batch_size_eval 16 \
78 | --num_thread 1 --num_proc 8 --model_path ~/my_task/ckpts/TransE_l2_FB15k_0/
79 |
80 | We can also use GPUs in our evaluation tasks::
81 |
82 | dglke_eval --model_name TransE_l2 --dataset FB15k --hidden_dim 400 --gamma 19.9 --batch_size_eval 16 \
83 | --gpu 0 1 2 3 4 5 6 7 --model_path ~/my_task/ckpts/TransE_l2_FB15k_0/
84 |
85 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. dglke documentation master file, created by
2 | sphinx-quickstart on Wed Apr 1 12:56:21 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | DGL-KE Documentation
7 | ========================================
8 |
9 | Knowledge graphs (KGs) are data structures that store information about different entities (nodes) and their relations (edges). A common approach of using KGs in various machine learning tasks is to compute knowledge graph embeddings. DGL-KE is a high performance, easy-to-use, and scalable package for learning large-scale knowledge graph embeddings. The package is implemented on the top of Deep Graph Library (`DGL`__) and developers can run DGL-KE on CPU machine, GPU machine, as well as clusters with a set of popular models, including `TransE`__, `TransR`__, `RESCAL`__, `DistMult`__, `ComplEx`__, and `RotatE`__.
10 |
11 | .. __: https://www.dgl.ai/
12 | .. __: https://www.utc.fr/~bordesan/dokuwiki/_media/en/transe_nips13.pdf
13 | .. __: https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/viewPaper/9571
14 | .. __: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.383.2015&rep=rep1&type=pdf
15 | .. __: https://arxiv.org/abs/1412.6575
16 | .. __: http://proceedings.mlr.press/v48/trouillon16.pdf
17 | .. __: https://arxiv.org/pdf/1902.10197.pdf
18 |
19 | .. image:: https://data.dgl.ai/asset/image/ke/dgl_ke_arch.png
20 | :width: 650
21 |
22 | Performance and Scalability
23 | ----------------------------------------
24 |
25 | DGL-KE is designed for learning at scale. It introduces various novel optimizations that accelerate training on knowledge graphs with millions of nodes and billions of edges. Our benchmark on knowledge graphs consisting of over *86M* nodes and *338M* edges shows that DGL-KE can compute embeddings in 100 minutes on an EC2 instance with 8 GPUs and 30 minutes on an EC2 cluster with 4 machines (48 cores/machine). These results represent a *2×∼5×* speedup over the best competing approaches.
26 |
27 | *DGL-KE vs Graphvite*
28 |
29 | .. image:: https://data.dgl.ai/asset/image/ke/vs-gv-fb15k.png
30 | :width: 650
31 |
32 | *DGL-KE vs Pytorch-Biggraph*
33 |
34 | .. image:: https://data.dgl.ai/asset/image/ke/vs-pbg-fb.png
35 | :width: 650
36 |
37 | Get started with DGL-KE!
38 | ----------------------------------------
39 |
40 | .. toctree::
41 | :maxdepth: 2
42 |
43 | install
44 | kg
45 | commands
46 | train_user_data
47 | benchmarks
48 | profile
49 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/install.rst:
--------------------------------------------------------------------------------
1 | Installation Guide
2 | ==================
3 |
4 |
5 | This topic explains how to install DGL-KE. We recommend installing DGL-KE either by using ``pip`` or from source.
6 |
7 | System requirements
8 | -------------------
9 |
10 | DGL-KE works with the following operating systems:
11 |
12 | - Ubuntu 16.04 or higher version
13 | - macOS
14 |
15 | DGL-KE requires Python version 3.5 or later. Python 3.4 or earlier is not tested. Python 2 support is coming.
16 |
17 | DGL-KE supports multiple tensor libraries as backends, e.g., PyTorch and MXNet. For requirements on backends and how to select one, see Working with different backends. As a demo, we install Pytorch using ``pip``::
18 |
19 | sudo pip3 install torch
20 |
21 |
22 | Install DGL
23 | -----------
24 |
25 | DGL-KE is implemented on the top of DGL (0.4.3 version). You can install DGL using pip::
26 |
27 | sudo pip3 install dgl==0.4.3
28 |
29 |
30 | Install DGL-KE
31 | --------------
32 |
33 | After installing DGL, you can install DGL-KE. The fastest way to install DGL-KE is by using pip::
34 |
35 | sudo pip3 install dglke
36 |
37 | or you can install DGL-KE from source::
38 |
39 | git clone https://github.com/awslabs/dgl-ke.git
40 | cd dgl-ke/python
41 | sudo python3 setup.py install
42 |
43 |
44 | Have a Quick Test
45 | -----------------
46 |
47 | Once you install DGL-KE successfully, you can test it by the following command::
48 |
49 | # create a new workspace
50 | mkdir my_task && cd my_task
51 | # Train transE model on FB15k dataset
52 | DGLBACKEND=pytorch dglke_train --model_name TransE_l2 --dataset FB15k --batch_size 1000 \
53 | --neg_sample_size 200 --hidden_dim 400 --gamma 19.9 --lr 0.25 --max_step 500 --log_interval 100 \
54 | --batch_size_eval 16 -adv --regularization_coef 1.00E-09 --test --num_thread 1 --num_proc 8
55 |
56 | This command will download the ``FB15k`` dataset, train the ``transE`` model on that, and save the trained embeddings into the file. You could see the following output at the end::
57 |
58 | -------------- Test result --------------
59 | Test average MRR : 0.47221913961451095
60 | Test average MR : 58.68289854581774
61 | Test average HITS@1 : 0.2784276548560207
62 | Test average HITS@3 : 0.6244265375564998
63 | Test average HITS@10 : 0.7726295474936941
64 | -----------------------------------------
65 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/partition.rst:
--------------------------------------------------------------------------------
1 | Partition a Knowledge Graph
2 | ===========================
3 |
4 | For distributed training, a user needs to partition a graph beforehand. DGL-KE provides a partition tool ``dglke_partition``, which partitions a given knowledge graph into ``N`` parts with `the METIS partition algorithm`__. This partition algorithm reduces the number of edge cuts between partitions to reduce network communication in the distributed training. For a cluster of ``P`` machines, we usually split a graph into ``P`` partitions and assign a partition to a machine as shown in the figure below.
5 |
6 | .. __: http://glaros.dtc.umn.edu/gkhome/metis/metis/overview
7 |
8 |
9 | .. image:: ../images/metis.png
10 | :width: 400
11 |
12 | Arguments
13 | ---------
14 | The command line provides the following arguments:
15 |
16 | - ``--data_path DATA_PATH``
17 | The path of the directory where DGL-KE loads knowledge graph data.
18 |
19 | - ``--dataset DATA_SET``
20 | The name of the knowledge graph stored under data_path. If it is one of the builtin knowledge graphs such as ``FB15k``, ``FB15k-237``, ``wn18``, ``wn18rr``, and ``Freebase``, DGL-KE will automatically download the knowledge graph and keep it under data_path.
21 |
22 | - ``--format FORMAT``
23 | The format of the dataset. For builtin knowledge graphs, the format is determined automatically. For users' own knowledge graphs, it needs to be ``raw_udd_{htr}`` or ``udd_{htr}``. ``raw_udd_`` indicates that the user's data uses **raw ID** for entities and relations and ``udd_`` indicates that the user's data uses **KGE ID**. ``{htr}`` indicates the location of the head entity, tail entity and relation in a triplet. For example, ``htr`` means the head entity is the first element in the triplet, the tail entity is the second element and the relation is the last element.
24 |
25 | - ``--data_files [DATA_FILES ...]``
26 | A list of data file names. This is required when users partition their own datasets. If the format is ``raw_udd_{htr}``, users need to provide *train_file* [*valid_file*] [*test_file*]. If the format is ``udd_{htr}``, users need to provide *entity_file* *relation_file* *train_file* [*valid_file*] [*test_file*]. In both cases, *valid_file* and *test_file* are optional.
27 |
28 | - ``--delimiter DELIMITER``
29 | Delimiter used in data files. Note all files should use the same delimiter.
30 |
31 | - ``-k NUM_PARTS`` or ``--num-parts NUM_PARTS``
32 | The number of partitions.
33 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/profile.rst:
--------------------------------------------------------------------------------
1 | Profile DGL-KE
2 | --------------
3 |
4 | This document is mainly for developing the DGL-KE models and accelerating their training.
5 |
6 | To analyze the MXNet version of the KE models, please set the ``MXNET_PROFILER`` environment variable when running the training job::
7 |
8 | MXNET_PROFILER=1 dglke_train --model_name TransE_l2 --dataset FB15k --batch_size 1000 --neg_sample_size 200 --hidden_dim 400 \
9 | --gamma 19.9 --lr 0.25 --max_step 3000 --log_interval 100 --batch_size_eval 16 --test -adv \
10 | --regularization_coef 1.00E-09 --num_thread 1 --num_proc 8
11 |
12 |
13 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/docs/source/train_user_data.rst:
--------------------------------------------------------------------------------
1 | Train User-Defined Knowledge Graphs
2 | --------------------------------------
3 |
4 | Users can use DGL-KE to train embeddings on their own knowledge graphs. In this case, users need to use ``--data_path`` to specify the path to the knowledge graph dataset, ``--data_files`` to specify the triplets of a knowledge graph as well as node/relation ID mapping, ``--format`` to specify the input format of the knowledge graph.
5 |
6 | The input format of users' knowledge graphs
7 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
8 | Users need to store all the data associated with a knowledge graph in the same directory. DGL-KE supports two knowledge graph input formats:
9 |
10 | * Raw user-defined knowledge graphs: users only need to provide triplets; both entities and relations in the triplets can be arbitrary strings. The dataloader will automatically generate the id mappings for entities and relations in the triplets. An example of triplets:
11 | .. csv-table::
12 | :header: "", "train.tsv", ""
13 | :widths: 12, 20, 12
14 | :align: center
15 |
16 | "Beijing","is_capital_of","China"
17 | "London","is_capital_of","UK"
18 | "UK","located_at","Europe"
19 | ...
20 |
21 | * User-defined knowledge graphs: users need to provide the id mapping for entities and relations as well as the triplets of the knowledge graph. The triplets should only contain entity ids and relation ids. Here we assume that both the entity ids and the relation ids start from 0 and are continuous. An example of mapping and triplets files:
22 | .. csv-table::
23 | :header: "entities.dict", "relation.dict", "train.tsv"
24 | :widths: 24 26 16
25 | :align: center
26 | :keepspace:
27 |
28 | "Beijing 0","is_capital_of 0","0 0 2"
29 | "London 1","located_at 1","1 0 3"
30 | "China 2"," ","3 1 4"
31 | "UK 3"," "," "
32 | "Europe 4"," "," "
33 |
34 | Using raw user-defined knowledge graph format
35 | """""""""""""""""""""""""""""""""""""""""""""
36 |
37 | Users need to store all the data associated with a knowledge graph in the same directory. DGL-KE supports two knowledge graph input formats:
38 |
39 | ``raw_udd_[h|r|t]``: In this format, users only need to provide triplets and the dataloader generates the id mappings for entities and relations in the triplets. The dataloader outputs two files: entities.tsv for entity id mapping and relations.tsv for relation id mapping while loading data. The order of head, relation and tail entities are described in ``[h|r|t]``, for example, raw_udd_trh means the triplets are stored in the order of tail, relation and head. The directory contains three files:
40 |
41 | * *train* stores the triplets in the training set. The format of a triplet, e.g., ``[src_name, rel_name, dst_name]``, should follow the order specified in ``[h|r|t]``
42 | * *valid* stores the triplets in the validation set. The format of a triplet, e.g., ``[src_name, rel_name, dst_name]``, should follow the order specified in ``[h|r|t]``. This is optional.
43 | * *test* stores the triplets in the test set. The format of a triplet, e.g., ``[src_name, rel_name, dst_name]``, should follow the order specified in ``[h|r|t]``. This is optional.
44 |
45 | Using user-defined knowledge graph format
46 | """""""""""""""""""""""""""""""""""""""""
47 |
48 | ``udd_[h|r|t]``: In this format, users should provide the id mapping for entities and relations. The order of head, relation and tail entities is described in ``[h|r|t]``, for example, udd_trh means the triplets are stored in the order of tail, relation and head. The directory should contain five files:
49 |
50 | * *entities* stores the mapping between entity name and entity Id
51 | * *relations* stores the mapping between relation name and relation Id
52 | * *train* stores the triplets in the training set. The format of a triplet, e.g., ``[src_id, rel_id, dst_id]``, should follow the order specified in ``[h|r|t]``
53 | * *valid* stores the triplets in the validation set. The format of a triplet, e.g., ``[src_id, rel_id, dst_id]``, should follow the order specified in ``[h|r|t]``
54 | * *test* stores the triplets in the test set. The format of a triplet, e.g., ``[src_id, rel_id, dst_id]``, should follow the order specified in ``[h|r|t]``
55 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18/head.list:
--------------------------------------------------------------------------------
1 | 0
2 | 1
3 | 2
4 | 3
5 | 4
6 | 5
7 | 6
8 | 7
9 | 8
10 | 9
11 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18/raw_head.list:
--------------------------------------------------------------------------------
1 | 02553196
2 | 13068917
3 | 00083809
4 | 02757462
5 | 02321009
6 | 03976960
7 | 08847694
8 | 02537319
9 | 12927354
10 | 01685439
11 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18/raw_rel.list:
--------------------------------------------------------------------------------
1 | _derivationally_related_form
2 | _hyponym
3 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18/raw_tail.list:
--------------------------------------------------------------------------------
1 | 05940414
2 | 01999186
3 | 05494365
4 | 01490112
5 | 08221897
6 | 00719705
7 | 07747951
8 | 07362386
9 | 09440400
10 | 02482139
11 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18/rel.list:
--------------------------------------------------------------------------------
1 | 0
2 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18/tail.list:
--------------------------------------------------------------------------------
1 | 11
2 | 12
3 | 13
4 | 14
5 | 15
6 | 16
7 | 17
8 | 18
9 | 19
10 | 20
11 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/examples/wn18_weighted/README.md:
--------------------------------------------------------------------------------
1 | # Weighted WN18 Example
2 | This example shows how to train a knowledge graph with weighted edges (each edge has an importance score)
3 |
4 | ## How to get data
5 | ```
6 | >>> wget https://dgl-data.s3-us-west-2.amazonaws.com/dataset/KGE_Examples/wn18_weighted_edge/wn18_weighted.tgz
7 | >>> tar -zxf wn18_weighted.tgz
8 | >>> ls wn18_weighted
9 | README entities.dict relations.dict test_weight.txt train_weight.txt valid_weight.txt
10 | ```
11 |
12 | ## How to train
13 | ```
14 | dglke_train --model_name TransE_l1 --dataset wn18-weight --format raw_udd_hrt --data_files train_weight.txt valid_weight.txt test_weight.txt --data_path ./data/wn18_weighted/ --batch_size 2048 --log_interval 1000 --neg_sample_size 128 --regularization_coef 2e-07 --hidden_dim 512 --gamma 12.0 --lr 0.007 --batch_size_eval 16 --test -adv --gpu 0 --max_step 32000 --has_edge_importance
15 | ```
16 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/dgl_ke_arch.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/dgl_ke_arch.PNG
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/logo.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/vs-gv-fb15k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/vs-gv-fb15k.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/vs-gv-wn18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/vs-gv-wn18.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/vs-pbg-fb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/snap-stanford/ogb/61e9784ca76edeaa6e259ba0f836099608ff0586/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/img/vs-pbg-fb.png
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/VERSION.txt:
--------------------------------------------------------------------------------
1 | 0.1.0.dev
2 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # __init__.py
4 | #
5 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | #!/usr/bin/env python3
21 |
22 | import pkg_resources
23 |
24 | __version__ = pkg_resources.resource_string("dglke", "VERSION.txt").decode("utf-8").strip()
25 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/dataloader/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # __init__.py
4 | #
5 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | from .KGDataset import *
21 | from .sampler import *
22 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/models/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # __init__.py
4 | #
5 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | from .general_models import KEModel, InferModel
21 | from .ke_model import TransEModel, TransE_l2Model, TransE_l1Model, DistMultModel, TransRModel, ComplExModel, RESCALModel, RotatEModel, GNNModel
22 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/models/mxnet/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # __init__.py
4 | #
5 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/models/mxnet/loss.py:
--------------------------------------------------------------------------------
1 | from ..base_loss import *
2 | from .tensor_models import logsigmoid
3 |
class HingeLoss(BaseHingeLoss):
    """Placeholder for the hinge loss on the MXNet backend (not implemented)."""

    def __init__(self, margin):
        # Raise explicitly rather than `assert False`: assertions are removed
        # under `python -O`, which would let a non-functional loss be created.
        raise NotImplementedError('HingeLoss is not implemented')

    def __call__(self, score, label):
        raise NotImplementedError('HingeLoss is not implemented')
10 |
class LogisticLoss(BaseLogisticLoss):
    """Placeholder for the logistic loss on the MXNet backend (not implemented)."""

    def __init__(self):
        # Raise explicitly rather than `assert False`: assertions are removed
        # under `python -O`, which would let a non-functional loss be created.
        raise NotImplementedError('LogisticLoss is not implemented')

    def __call__(self, score, label):
        raise NotImplementedError('LogisticLoss is not implemented')
17 |
class BCELoss(BaseBCELoss):
    """Placeholder for the BCE loss on the MXNet backend (not implemented)."""

    def __init__(self):
        # Raise explicitly rather than `assert False`: assertions are removed
        # under `python -O`, which would let a non-functional loss be created.
        raise NotImplementedError('BCELoss is not implemented')

    def __call__(self, score, label):
        raise NotImplementedError('BCELoss is not implemented')
24 |
class LogsigmoidLoss(BaseLogsigmoidLoss):
    """Log-sigmoid loss: the negated log-sigmoid of ``label * score``."""

    def __init__(self):
        super().__init__()

    def __call__(self, score, label):
        signed_score = label * score
        return -logsigmoid(signed_score)
31 |
class LossGenerator(BaseLossGenerator):
    """Placeholder loss generator for the MXNet backend (not implemented)."""

    def __init__(self, args, loss_genre='Logsigmoid', neg_adversarial_sampling=False, adversarial_temperature=1.0,
                 pairwise=False):
        # Raise explicitly rather than `assert False`: assertions are removed
        # under `python -O`, which would let a generator whose methods all
        # return None be created silently.
        raise NotImplementedError('LossGenerator is not implemented')

    def _get_pos_loss(self, pos_score, edge_weight):
        raise NotImplementedError('LossGenerator is not implemented')

    def _get_neg_loss(self, neg_score, edge_weight):
        raise NotImplementedError('LossGenerator is not implemented')

    def get_total_loss(self, pos_score, neg_score, edge_weight):
        raise NotImplementedError('LossGenerator is not implemented')
46 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/models/pytorch/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # __init__.py
4 | #
5 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/dglke/models/pytorch/loss.py:
--------------------------------------------------------------------------------
1 | from ..base_loss import *
2 | from .tensor_models import *
3 | import torch as th
4 | import torch.nn.functional as functional
5 |
6 | logsigmoid = functional.logsigmoid
7 | softplus = functional.softplus
8 | sigmoid = functional.sigmoid
9 |
class HingeLoss(BaseHingeLoss):
    """Hinge loss: ``margin - label * score`` with negative values set to 0."""

    def __init__(self, margin):
        super().__init__(margin)

    def __call__(self, score: th.Tensor, label):
        raw = self.margin - label * score
        # Zero out every entry that already satisfies the margin.
        return raw.masked_fill(raw < 0, 0)
18 |
class LogisticLoss(BaseLogisticLoss):
    """Logistic loss: softplus applied to ``-label * score``."""

    def __init__(self):
        super().__init__()

    def __call__(self, score: th.Tensor, label):
        negated_margin = -label * score
        return softplus(negated_margin)
25 |
class BCELoss(BaseBCELoss):
    """Binary cross-entropy on raw scores (logits).

    Computes ``-(label * log(sigmoid(score)) + (1 - label) * log(1 - sigmoid(score)))``
    element-wise.
    """

    def __init__(self):
        super(BCELoss, self).__init__()

    def __call__(self, score: th.Tensor, label):
        # log(sigmoid(x)) == logsigmoid(x) and log(1 - sigmoid(x)) == logsigmoid(-x).
        # Using logsigmoid avoids log(0) = -inf when the sigmoid saturates, which
        # previously produced inf / NaN (0 * -inf) for confident scores.
        return -(label * logsigmoid(score) + (1 - label) * logsigmoid(-score))
32 |
class LogsigmoidLoss(BaseLogsigmoidLoss):
    """Log-sigmoid loss: the negated log-sigmoid of ``label * score``."""

    def __init__(self):
        super().__init__()

    def __call__(self, score: th.Tensor, label):
        signed_score = label * score
        return -logsigmoid(signed_score)
39 |
40 |
class LossGenerator(BaseLossGenerator):
    """Combine positive and negative scores into a single training loss.

    ``loss_genre`` selects the criterion ('Hinge', 'Logistic', 'Logsigmoid'
    or 'BCE'). Pairwise mode is only accepted with 'Logistic' or 'Hinge'.
    """

    def __init__(self, args, loss_genre='Logsigmoid', neg_adversarial_sampling=False, adversarial_temperature=1.0,
                 pairwise=False):
        super().__init__(neg_adversarial_sampling, adversarial_temperature, pairwise)
        # genre -> (label given to negative samples, criterion factory).
        # Factories are lazy so ``args.margin`` is only read for the hinge loss.
        builders = {
            'Hinge': (-1, lambda: HingeLoss(args.margin)),
            'Logistic': (-1, LogisticLoss),
            'Logsigmoid': (-1, LogsigmoidLoss),
            'BCE': (0, BCELoss),
        }
        if loss_genre not in builders:
            raise ValueError('loss genre %s is not support' % loss_genre)
        self.neg_label, build_criterion = builders[loss_genre]
        self.loss_criterion = build_criterion()

        if self.pairwise and loss_genre not in ['Logistic', 'Hinge']:
            raise ValueError('{} loss cannot be applied to pairwise loss function'.format(loss_genre))

    def _get_pos_loss(self, pos_score):
        """Apply the criterion to positive scores with label 1."""
        return self.loss_criterion(pos_score, 1)

    def _get_neg_loss(self, neg_score):
        """Apply the criterion to negative scores with the genre's negative label."""
        return self.loss_criterion(neg_score, self.neg_label)

    def get_total_loss(self, pos_score, neg_score, edge_weight=None):
        """Return ``(loss, log)`` where ``log`` holds scalar copies of the loss terms.

        ``edge_weight`` multiplies the per-sample losses; ``None`` means a
        weight of 1.
        """
        weight = 1 if edge_weight is None else edge_weight

        if self.pairwise:
            # Score difference between each positive and its negatives.
            score_gap = pos_score.unsqueeze(-1) - neg_score
            loss = th.mean(self.loss_criterion(score_gap, 1) * weight)
            return loss, {'loss': get_scalar(loss)}

        pos_loss = self._get_pos_loss(pos_score) * weight
        neg_loss = self._get_neg_loss(neg_score) * weight
        # Reduce over the last (negative-sample) dimension first.
        if self.neg_adversarial_sampling:
            # Softmax weights are detached so they act as constants in backprop.
            neg_weights = th.softmax(neg_score * self.adversarial_temperature, dim=-1).detach()
            neg_loss = th.sum(neg_weights * neg_loss, dim=-1)
        else:
            neg_loss = th.mean(neg_loss, dim=-1)
        # Then reduce over the remaining dimensions.
        neg_loss = th.mean(neg_loss)
        pos_loss = th.mean(pos_loss)
        loss = (neg_loss + pos_loss) / 2

        log = {
            'pos_loss': get_scalar(pos_loss),
            'neg_loss': get_scalar(neg_loss),
            'loss': get_scalar(loss),
        }
        return loss, log
95 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = dglke
3 | version = file: dglke/VERSION.txt
4 | url = https://github.com/awslabs/dgl-ke
5 | project_urls =
6 | Source = https://github.com/awslabs/dgl-ke
7 | Bug Reports = https://github.com/awslabs/dgl-ke/issues
8 | Documentation = https://aws-dglke.readthedocs.io/en/latest/index.html
9 | author = AWS AI
10 | classifiers =
11 | Development Status :: 5 - Production/Stable
12 | Environment :: Console
13 | Intended Audience :: Science/Research
14 | License :: OSI Approved :: Apache Software License
15 | Operating System :: OS Independent
16 | Programming Language :: Python
17 | Programming Language :: Python :: 3
18 | Programming Language :: Python :: 3.6
19 | Programming Language :: Python :: 3.7
20 | Programming Language :: Python :: 3 :: Only
21 | Topic :: Scientific/Engineering :: Artificial Intelligence
22 | # Already provided as a classifier.
23 | # license = Apache License
24 | license_files =
25 | LICENSE
26 | description = A distributed system to learn embeddings of large graphs
27 | long_description = file: README.md
28 | long_description_content_type = text/markdown
29 | keywords =
30 | machine-learning
31 | knowledge-base
32 | graph-embedding
33 | test_suite =
34 |
35 | [options]
36 | setup_requires =
37 | setuptools >= 39.2
38 | install_requires =
39 | numpy
40 | setuptools
41 | python_requires = >=3.5, <4
42 | packages = find:
43 |
44 | [options.extras_require]
45 | docs = Sphinx
46 |
47 | [options.entry_points]
48 | console_scripts =
49 | dglke_train = dglke.train:main
50 | dglke_eval = dglke.eval:main
51 | dglke_partition = dglke.partition:main
52 | dglke_convert = dglke.convert:main
53 | dglke_server = dglke.kvserver:main
54 | dglke_client = dglke.kvclient:main
55 | dglke_dist_train = dglke.dist_train:main
56 | dglke_emb_sim = dglke.infer_emb_sim:main
57 | dglke_predict = dglke.infer_score:main
58 |
59 | [options.packages.find]
60 | exclude =
61 | docs
62 | test
63 |
64 | [options.package_data]
65 | dglke=
66 | VERSION.txt
67 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/dgl-ke-ogb-lsc/python/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # -*- coding: utf-8 -*-
4 | #
5 | # setup.py
6 | #
7 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 |
22 | from setuptools import setup
23 |
24 | if __name__ == "__main__":
25 | setup()
26 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/run.sh:
--------------------------------------------------------------------------------
# Example dglke_train invocations for WikiKG90M.
# SAVE_PATH must name the output directory; fail early if it is missing and
# quote it everywhere so paths containing spaces are passed as one argument.
: "${SAVE_PATH:?SAVE_PATH must be set to the output directory}"

# TransE-shallow
dglke_train --model_name TransE_l2 \
  --hidden_dim 200 --gamma 10 --lr 0.1 --regularization_coef 1e-9 \
  --valid --test -adv --mix_cpu_gpu --num_proc 4 --num_thread 4 \
  --gpu 0 1 2 3 \
  --async_update --force_sync_interval 10000 --no_save_emb \
  --print_on_screen --encoder_model_name shallow --save_path "$SAVE_PATH"


# TransE-roberta
dglke_train --model_name TransE_l2 \
  --hidden_dim 200 --gamma 10 --lr 0.1 --regularization_coef 1e-9 \
  --valid --test -adv --mix_cpu_gpu --num_proc 4 --num_thread 4 \
  --gpu 0 1 2 3 \
  --async_update --force_sync_interval 10000 --no_save_emb \
  --print_on_screen --encoder_model_name roberta --save_path "$SAVE_PATH"


# TransE-concat
CUDA_VISIBLE_DEVICES=0,1,2,3 dglke_train --model_name TransE_l2 \
  --hidden_dim 200 --gamma 10 --lr 0.1 --regularization_coef 1e-9 \
  --valid --test -adv --mix_cpu_gpu --num_proc 4 --num_thread 4 \
  --gpu 0 1 2 3 \
  --async_update --force_sync_interval 50000 --no_save_emb \
  --print_on_screen --encoder_model_name concat --save_path "$SAVE_PATH"


# ComplEx-shallow
CUDA_VISIBLE_DEVICES=0,1,2,3 dglke_train --model_name ComplEx \
  --hidden_dim 100 --gamma 8 --lr 0.01 --regularization_coef 2e-6 \
  --valid --test -adv --mix_cpu_gpu --num_proc 4 --num_thread 4 \
  --gpu 0 1 2 3 \
  --async_update --force_sync_interval 50000 --no_save_emb \
  --print_on_screen --encoder_model_name shallow -de -dr --save_path "$SAVE_PATH"

# ComplEx-roberta
CUDA_VISIBLE_DEVICES=0,1,2,3 dglke_train --model_name ComplEx \
  --hidden_dim 100 --gamma 100 --lr 0.1 --regularization_coef 1e-9 \
  --valid --test -adv --mix_cpu_gpu --num_proc 4 --num_thread 4 \
  --gpu 0 1 2 3 \
  --async_update --force_sync_interval 10000 --no_save_emb \
  --print_on_screen --encoder_model_name roberta -de -dr --save_path "$SAVE_PATH"

# ComplEx-concat
CUDA_VISIBLE_DEVICES=0,1,2,3 dglke_train --model_name ComplEx \
  --hidden_dim 100 --gamma 3 --lr 0.1 --regularization_coef 1e-9 \
  --valid --test -adv --mix_cpu_gpu --num_proc 4 --num_thread 4 \
  --gpu 0 1 2 3 \
  --async_update --force_sync_interval 50000 --no_save_emb \
  --print_on_screen --encoder_model_name concat -de -dr --save_path "$SAVE_PATH"
51 |
--------------------------------------------------------------------------------
/examples/lsc/wikikg90m/save_test_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import json
4 | import numpy as np
5 | import sys
6 | from ogb.lsc import WikiKG90MDataset, WikiKG90MEvaluator
7 | import pdb
8 | from collections import defaultdict
9 | import torch.nn.functional as F
10 | import torch
11 |
12 | # python save_test_submission.py $SAVE_PATH $NUM_PROC
def _load_merged_results(path, split, step, num_proc, device):
    """Load the per-process prediction shards of one checkpoint and merge them.

    Reads ``{split}_{proc}_{step}.pkl`` for every ``proc`` in
    ``range(num_proc)`` and concatenates, along axis 0, the arrays stored
    under ``'h,r->t'`` for each key. Returns ``{'h,r->t': {key: ndarray}}``.
    """
    shards = defaultdict(list)
    for proc in range(num_proc):
        file_name = "{}_{}_{}.pkl".format(split, proc, step)
        # NOTE: torch.load unpickles the file, so only run this on output
        # produced by your own training job.
        shard = torch.load(os.path.join(path, file_name), map_location=device)
        for key, value in shard['h,r->t'].items():
            shards[key].append(value.numpy())
    return {'h,r->t': {key: np.concatenate(parts, 0) for key, parts in shards.items()}}


def main():
    """Pick the checkpoint with the best validation MRR and save its test predictions."""
    if len(sys.argv) != 3:
        sys.exit('usage: python save_test_submission.py $SAVE_PATH $NUM_PROC')
    path = sys.argv[1]
    num_proc = int(sys.argv[2])

    # Checkpoint steps are discovered from the shards written by process 0.
    steps = sorted(
        int(name.split('.')[0].split('_')[-1])
        for name in os.listdir(path)
        if name.startswith('valid_0_') and name.endswith('.pkl')
    )
    if not steps:
        sys.exit('no validation result files (valid_0_<step>.pkl) found in {}'.format(path))

    evaluator = WikiKG90MEvaluator()
    device = torch.device('cpu')
    metric = 'mrr'

    best_valid_mrr = -1
    best_step = None
    for step in steps:
        valid_result_dict = _load_merged_results(path, 'valid', step, num_proc, device)
        metrics = evaluator.eval(valid_result_dict)
        print("valid-{} at step {}: {}".format(metric, step, metrics[metric]))
        # Strict comparison keeps the earliest step when several tie.
        if metrics[metric] > best_valid_mrr:
            best_valid_mrr = metrics[metric]
            best_step = step

    if best_step is None:
        sys.exit('no checkpoint produced a valid {} score'.format(metric))

    # Only the winning checkpoint's test predictions are loaded, instead of
    # holding the test predictions of every checkpoint in memory.
    best_test_dict = _load_merged_results(path, 'test', best_step, num_proc, device)
    evaluator.save_test_submission(best_test_dict, path)


if __name__ == '__main__':
    main()
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/examples/nodeproppred/arxiv/README.md:
--------------------------------------------------------------------------------
1 | # ogbn-arxiv
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/arxiv/mlp.py)**: Full-batch MLP training based on paper features and optional Node2Vec features (`--use_node_embedding`). For training with Node2Vec features, this script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/arxiv/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`) [requires `torch-geometric>=1.6.0`].
7 |
8 | ## Training & Evaluation
9 |
10 | ```
11 | # Run with default config
12 | python gnn.py
13 |
14 | # Run with custom config
15 | python gnn.py --hidden_channels=128
16 | ```
17 |
18 | ## Getting Raw Texts
19 |
20 | The tsv file that maps paper IDs to their titles and abstracts is available [here](https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz).
21 | There are three columns: paperid \t title \t abstract.
22 | You can obtain the paper ID for each node at `mapping/nodeidx2paperid.csv.gz` of the downloaded dataset directory.
23 |
--------------------------------------------------------------------------------
/examples/nodeproppred/arxiv/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Collect (train, valid, test) result triples per run and print summaries."""

    def __init__(self, runs, info=None):
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one 3-element result to the history of run ``run``."""
        assert len(result) == 3
        assert 0 <= run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print a summary for one run, or mean ± std over all runs if ``run`` is None.

        Values are scaled by 100 before printing. "Final" numbers are taken at
        the entry with the highest validation score.
        """
        if run is not None:
            scores = 100 * torch.tensor(self.results[run])
            best_epoch = scores[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {scores[:, 0].max():.2f}')
            print(f'Highest Valid: {scores[:, 1].max():.2f}')
            print(f' Final Train: {scores[best_epoch, 0]:.2f}')
            print(f' Final Test: {scores[best_epoch, 2]:.2f}')
            return

        all_scores = 100 * torch.tensor(self.results)

        # One row per run: (best train, best valid, train at best valid, test at best valid).
        rows = []
        for run_scores in all_scores:
            best_epoch = run_scores[:, 1].argmax()
            rows.append((
                run_scores[:, 0].max().item(),
                run_scores[:, 1].max().item(),
                run_scores[best_epoch, 0].item(),
                run_scores[best_epoch, 2].item(),
            ))
        summary = torch.tensor(rows)

        print('All runs:')
        labels = ('Highest Train', 'Highest Valid', ' Final Train', ' Final Test')
        for column, label in enumerate(labels):
            values = summary[:, column]
            print(f'{label}: {values.mean():.2f} ± {values.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/nodeproppred/arxiv/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 | from torch_geometric.utils import to_undirected
6 |
7 | from ogb.nodeproppred import PygNodePropPredDataset
8 |
9 |
def save_embedding(model):
    """Write the model's embedding weights, moved to CPU, to ``embedding.pt``."""
    weights = model.embedding.weight.data.cpu()
    torch.save(weights, 'embedding.pt')
12 |
13 |
def main():
    """Train Node2Vec on ogbn-arxiv and save the embeddings to ``embedding.pt``."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Node2Vec)')
    options = (
        ('--device', int, 0),
        ('--embedding_dim', int, 128),
        ('--walk_length', int, 80),
        ('--context_size', int, 20),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 0.01),
        ('--epochs', int, 5),
        ('--log_steps', int, 1),
    )
    for flag, value_type, default in options:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()

    device_name = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv')
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            # Checkpoint the embeddings every 100 steps.
            if step % 100 == 0:
                save_embedding(model)
        # Also save once at the end of every epoch.
        save_embedding(model)
57 |
58 |
59 | if __name__ == "__main__":
60 | main()
61 |
--------------------------------------------------------------------------------
/examples/nodeproppred/mag/README.md:
--------------------------------------------------------------------------------
1 | # ogbn-mag
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/mlp.py)**: Full-batch MLP training based on paper features and optional MetaPath2Vec features (`--use_node_embedding`). For training with MetaPath2Vec features, this script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python metapath.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/gnn.py)**: Full-batch GNN training on the paper-paper relational graph using either the GCN or GraphSAGE operator (`--use_sage`) [requires `torch-geometric>=1.6.0`].
7 | * **[R-GCN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/rgcn.py)**: Full-batch R-GCN training on the complete heterogeneous graph. This script will consume about 14GB of GPU memory [requires `torch-geometric>=1.4.3`].
8 | * **[Cluster-GCN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/cluster_gcn.py)**: Mini-batch R-GCN training using the Cluster-GCN algorithm [requires `torch-geometric>=1.4.3`].
9 | * **[NeighborSampler](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/sampler.py)**: Mini-batch R-GCN training using neighbor sampling [requires `torch-geometric>=1.5.0`].
10 | * **[GraphSAINT](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/graph_saint.py)**: Mini-batch R-GCN training using the GraphSAINT algorithm [requires `torch-geometric>=1.5.0`].
11 |
12 | For the R-GCN implementation, we use distinct trainable node embeddings for all node types except for paper nodes.
13 |
14 | ## Training & Evaluation
15 |
16 | ```
17 | # Run with default config
18 | python graph_saint.py
19 | ```
20 |
21 |
--------------------------------------------------------------------------------
/examples/nodeproppred/mag/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run results and print summary statistics.

    Every result is a 3-element sequence. ``print_statistics`` reads
    index 0 as the train score, index 1 as the validation score and
    index 2 as the test score, and reports all of them multiplied by 100.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of the ``runs`` runs.

        ``info`` is stored as-is; this class does not read it afterwards.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one 3-element ``result`` to the zero-based run ``run``."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print statistics for a single run, or mean ± std over all runs.

        With ``run`` given, prints the highest train/valid scores of that
        run and the train/test scores at the entry with the best validation
        score. With ``run=None``, the same four numbers are computed for
        every run and their mean and standard deviation are printed.

        Runs are summarised one by one, so runs with different numbers of
        recorded results are supported; runs without any result are skipped.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            # Model selection: the entry with the highest validation score.
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        best_results = []
        for run_results in self.results:
            if not run_results:
                # Nothing recorded for this run yet; it cannot contribute.
                continue
            # Build one tensor per run: a single tensor over all runs would
            # require every run to have the same number of entries.
            r = 100 * torch.tensor(run_results)
            best = r[:, 1].argmax()
            train1 = r[:, 0].max().item()
            valid = r[:, 1].max().item()
            train2 = r[best, 0].item()
            test = r[best, 2].item()
            best_results.append((train1, valid, train2, test))

        print('All runs:')
        if not best_results:
            print('No results recorded.')
            return

        best_result = torch.tensor(best_results)
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/nodeproppred/mag/metapath.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_sparse import transpose
5 | from torch_geometric.nn import MetaPath2Vec
6 |
7 | from ogb.nodeproppred import PygNodePropPredDataset
8 |
9 |
@torch.no_grad()
def save_embedding(model):
    """Save the result of ``model('paper')`` to ``embedding.pt``.

    The model is called with the node type ``'paper'`` under
    ``torch.no_grad()``; the returned tensor is moved to the CPU before
    being written to the current working directory.
    """
    paper_embedding = model('paper')
    torch.save(paper_embedding.cpu(), 'embedding.pt')
14 |
15 |
def main():
    """Train MetaPath2Vec on ogbn-mag and save the paper embeddings.

    All hyperparameters are taken from the command line. The embedding is
    written via ``save_embedding`` every 1000 training steps and again at
    the end of each epoch.
    """
    parser = argparse.ArgumentParser(description='OGBN-MAG (MetaPath2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=64)
    parser.add_argument('--context_size', type=int, default=7)
    parser.add_argument('--walks_per_node', type=int, default=5)
    parser.add_argument('--num_negative_samples', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--log_steps', type=int, default=100)
    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset('ogbn-mag')
    data = dataset[0]

    # We need to add reverse edges to the heterogeneous graph.
    data.edge_index_dict[('institution', 'employs', 'author')] = transpose(
        data.edge_index_dict[('author', 'affiliated_with', 'institution')],
        None, m=data.num_nodes_dict['author'],
        n=data.num_nodes_dict['institution'])[0]
    data.edge_index_dict[('paper', 'written_by', 'author')] = transpose(
        data.edge_index_dict[('author', 'writes', 'paper')], None,
        m=data.num_nodes_dict['author'], n=data.num_nodes_dict['paper'])[0]
    data.edge_index_dict[('field_of_study', 'contains', 'paper')] = transpose(
        data.edge_index_dict[('paper', 'has_topic', 'field_of_study')], None,
        m=data.num_nodes_dict['paper'],
        n=data.num_nodes_dict['field_of_study'])[0]
    print(data)

    # The walk starts and ends at 'author' and relies on the reverse edge
    # types added above.
    metapath = [
        ('author', 'writes', 'paper'),
        ('paper', 'has_topic', 'field_of_study'),
        ('field_of_study', 'contains', 'paper'),
        ('paper', 'written_by', 'author'),
        ('author', 'affiliated_with', 'institution'),
        ('institution', 'employs', 'author'),
        ('author', 'writes', 'paper'),
        ('paper', 'cites', 'paper'),
        ('paper', 'written_by', 'author'),
    ]

    # Use the parsed command-line values; their defaults equal the literals
    # that were previously hard-coded here, so default runs are unchanged.
    model = MetaPath2Vec(data.edge_index_dict,
                         embedding_dim=args.embedding_dim,
                         metapath=metapath,
                         walk_length=args.walk_length,
                         context_size=args.context_size,
                         walks_per_node=args.walks_per_node,
                         num_negative_samples=args.num_negative_samples,
                         sparse=True).to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if (i + 1) % 1000 == 0:  # Save model every 1000 steps.
                save_embedding(model)
        save_embedding(model)


if __name__ == "__main__":
    main()
89 |
--------------------------------------------------------------------------------
/examples/nodeproppred/papers100M/README.md:
--------------------------------------------------------------------------------
1 | # ogbn-papers100M
2 |
3 | This repository includes the following example scripts.
4 |
5 | * **[sgc.py](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/papers100M/sgc.py)**: Simplified Graph Convolution (SGC) pre-processing on CPU. This will produce `sgc_dict.pt` that saves SGC features and labels for a subset of nodes that are associated with target labels. Requires more than 100GB CPU memory. Takes a few hours to complete.
6 | * **[node2vec.py](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/papers100M/node2vec.py)**: Node2vec training on CPU. This will produce `data_dict.pt` that saves features and labels for a subset of nodes that are associated with target labels. Requires more than 100GB CPU memory. Each epoch takes a few weeks. The pre-trained output node2vec embedding (only for labeled nodes) is available [here](https://snap.stanford.edu/ogb/data/misc/ogbn_papers100M/data_dict.pt).
7 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/papers100M/mlp.py)**: Mini-batch MLP training on GPU based on paper features and optional Node2Vec features (`--use_node_embedding`) or SGC features (`--use_sgc_embedding`).
8 |
9 | Note that the graph is huge: the size of the downloaded file is 57GB, and the size of the PyTorch Geometric graph object is 79GB (which takes a while to be generated and saved).
10 | Please ensure you have sufficient memory and storage before running the scripts.
11 |
12 |
13 | ## Training & Evaluation
14 |
15 | ```
16 | # Generate SGC features and save them as sgc_dict.pt
17 | python sgc.py
18 |
19 | # Generate node2vec features and save them as data_dict.pt
20 | # The features are saved every 1000 iterations.
21 | python node2vec.py
22 |
23 | ### Train MLPs based on the saved features and labels
24 | # Train based only on input node features (Requires data_dict.pt, the node2vec features do not need to be converged.)
25 | python mlp.py
26 |
27 | # Train based on SGC features (Requires sgc_dict.pt)
28 | python mlp.py --use_sgc_embedding
29 |
30 | # Train based on node2vec features (Requires data_dict.pt, the node2vec features need to be converged.)
31 | python mlp.py --use_node_embedding
32 | ```
33 |
34 | ## Getting Raw Texts
35 |
36 | The TSV files that map paper IDs to their titles and abstracts are available [here](https://snap.stanford.edu/ogb/data/misc/ogbn_papers100M/paperinfo.zip) (34GB).
37 | ```bash
38 | unzip paperinfo.zip
39 | cd paperinfo
40 | ```
41 |
42 | There are two files: `idx_title.tsv` and `idx_abs.tsv`. For `idx_title.tsv`, the format is nodeidx \t title. For `idx_abs.tsv`, the format is nodeidx \t abstract.
43 | You can obtain the mapping from node idx to MAG's paper ID at `mapping/nodeidx2paperid.csv.gz` of the downloaded dataset directory.
44 | Note that the titles and abstracts were created from a MAG dump that is different from the original dataset; hence, the titles and abstracts of some paper nodes are missing.
45 |
--------------------------------------------------------------------------------
/examples/nodeproppred/papers100M/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run results and print summary statistics.

    Every result is a 3-element sequence. ``print_statistics`` reads
    index 0 as the train score, index 1 as the validation score and
    index 2 as the test score, and reports all of them multiplied by 100.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of the ``runs`` runs.

        ``info`` is stored as-is; this class does not read it afterwards.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one 3-element ``result`` to the zero-based run ``run``."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print statistics for a single run, or mean ± std over all runs.

        With ``run`` given, prints the highest train/valid scores of that
        run and the train/test scores at the entry with the best validation
        score. With ``run=None``, the same four numbers are computed for
        every run and their mean and standard deviation are printed.

        Runs are summarised one by one, so runs with different numbers of
        recorded results are supported; runs without any result are skipped.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            # Model selection: the entry with the highest validation score.
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        best_results = []
        for run_results in self.results:
            if not run_results:
                # Nothing recorded for this run yet; it cannot contribute.
                continue
            # Build one tensor per run: a single tensor over all runs would
            # require every run to have the same number of entries.
            r = 100 * torch.tensor(run_results)
            best = r[:, 1].argmax()
            train1 = r[:, 0].max().item()
            valid = r[:, 1].max().item()
            train2 = r[best, 0].item()
            test = r[best, 2].item()
            best_results.append((train1, valid, train2, test))

        print('All runs:')
        if not best_results:
            print('No results recorded.')
            return

        best_result = torch.tensor(best_results)
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/nodeproppred/papers100M/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 | from torch_geometric.utils import to_undirected, dropout_edge
6 | from tqdm.auto import tqdm
7 |
8 | from ogb.nodeproppred import PygNodePropPredDataset
9 |
def save_data_dict(model, data, split_idx, save_file):
    """Save embeddings, features, labels and a remapped split to ``save_file``.

    Only the nodes listed in ``split_idx`` ('train', then 'valid', then
    'test') are kept, in that concatenated order. The saved 'split_idx'
    therefore holds consecutive positions into the saved tensors instead of
    the original node indices. The dictionary is printed before it is saved.
    """
    train_idx = split_idx['train']
    valid_idx = split_idx['valid']
    test_idx = split_idx['test']
    n_train, n_valid, n_test = len(train_idx), len(valid_idx), len(test_idx)

    # Nodes are stored as train | valid | test.
    all_idx = torch.cat([train_idx, valid_idx, test_idx])

    valid_end = n_train + n_valid
    data_dict = {
        'node2vec_embedding': model.embedding.weight.data[all_idx].cpu(),
        'node_feat': data.x.data[all_idx],
        'label': data.y.data[all_idx].to(torch.long),
        'split_idx': {
            'train': torch.arange(n_train),
            'valid': torch.arange(n_train, valid_end),
            'test': torch.arange(valid_end, valid_end + n_test),
        },
    }

    print(data_dict)

    torch.save(data_dict, save_file)
29 |
30 |
def main():
    """Train Node2Vec on ogbn-papers100M and periodically save ``data_dict.pt``.

    A fraction ``--dropedge_rate`` of the edges is dropped at random before
    the graph is made undirected. The data dictionary is written via
    ``save_data_dict`` once before training, every 1000 training steps, and
    at the end of each epoch.
    """
    parser = argparse.ArgumentParser(description='OGBN-Papers100M (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=20)
    parser.add_argument('--context_size', type=int, default=10)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=2)
    parser.add_argument('--log_steps', type=int, default=10)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-papers100M')

    split_idx = dataset.get_idx_split()

    data = dataset[0]

    print('Making the graph undirected.')
    ### Randomly drop some edges to avoid segmentation fault
    data.edge_index, _ = dropout_edge(data.edge_index, p=args.dropedge_rate)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    filename = 'data_dict.pt'

    print(data)

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node,
                     sparse=True).to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    print('Saving data_dict before training...')
    save_data_dict(model, data, split_idx, save_file=filename)

    model.train()
    for epoch in range(1, args.epochs + 1):
        # enumerate() hides the loader's length from tqdm, so pass the total
        # explicitly to get a percentage and an ETA in the progress bar.
        for i, (pos_rw, neg_rw) in tqdm(enumerate(loader),
                                        total=len(loader)):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if (i + 1) % 1000 == 0:  # Save model every 1000 steps.
                print('Saving data dict...')
                save_data_dict(model, data, split_idx, save_file=filename)

        print('Saving data dict...')
        save_data_dict(model, data, split_idx, save_file=filename)


if __name__ == "__main__":
    main()
95 |
--------------------------------------------------------------------------------
/examples/nodeproppred/papers100M/sgc.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from tqdm.auto import tqdm
3 |
4 | import torch
5 | import torch.nn.functional as F
6 | from torch_sparse import SparseTensor
7 | from torch_geometric.utils import to_undirected, dropout_edge
8 |
9 | from ogb.nodeproppred import PygNodePropPredDataset
10 |
11 |
def main():
    """Pre-compute SGC features for ogbn-papers100M and save ``sgc_dict.pt``.

    The graph is randomly thinned (``--dropedge_rate``), made undirected,
    given self-loops and symmetrically normalised by node degree. The node
    features are then multiplied by this adjacency ``--num_propagations``
    times. For the nodes listed in the dataset split (train, valid, test,
    in that order) the raw features and the features after every
    propagation step are stored, together with the labels and a split that
    indexes into the stored tensors.
    """
    # This script performs SGC pre-processing, so describe it as such in
    # the command-line help.
    parser = argparse.ArgumentParser(description='OGBN-papers100M (SGC)')
    parser.add_argument('--num_propagations', type=int, default=3)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    args = parser.parse_args()

    # SGC pre-processing ######################################################

    dataset = PygNodePropPredDataset('ogbn-papers100M')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x.numpy()
    N = data.num_nodes

    print('Making the graph undirected.')
    ### Randomly drop some edges to save computation
    data.edge_index, _ = dropout_edge(data.edge_index, p=args.dropedge_rate)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    print(data)

    row, col = data.edge_index

    print('Computing adj...')

    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    adj = adj.set_diag()
    deg = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    # A zero degree yields inf after pow(-0.5); zero it out instead.
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    # Scale entry (i, j) by deg_inv_sqrt[i] * deg_inv_sqrt[j].
    adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    adj = adj.to_scipy(layout='csr')

    train_idx, valid_idx, test_idx = split_idx['train'], split_idx['valid'], split_idx['test']
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    # Positions of each split inside the concatenated train | valid | test order.
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx), len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(len(train_idx) + len(valid_idx), len(train_idx) + len(valid_idx) + len(test_idx))

    sgc_dict = {}
    sgc_dict['label'] = data.y.data[all_idx].to(torch.long)
    sgc_dict['split_idx'] = {'train': mapped_train_idx, 'valid': mapped_valid_idx, 'test': mapped_test_idx}

    # Entry 0 holds the raw features; entry k the features after k propagations.
    sgc_dict['sgc_embedding'] = []
    sgc_dict['sgc_embedding'].append(torch.from_numpy(x[all_idx]).to(torch.float))

    print('Start SGC processing')

    for _ in tqdm(range(args.num_propagations)):
        x = adj @ x
        sgc_dict['sgc_embedding'].append(torch.from_numpy(x[all_idx]).to(torch.float))

    print(sgc_dict)

    torch.save(sgc_dict, 'sgc_dict.pt')


if __name__ == "__main__":
    main()
76 |
--------------------------------------------------------------------------------
/examples/nodeproppred/products/README.md:
--------------------------------------------------------------------------------
1 | # ogbn-products
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/products/mlp.py)**: Full-batch MLP training based on product features and optional Node2Vec features (`--use_node_embedding`). For training with Node2Vec features, this script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/products/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`). This script will consume large amounts of GPU memory [requires `torch-geometric>=1.6.0`].
7 | * **[Cluster-GCN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/products/cluster_gcn.py)**: Mini-batch GCN training using the Cluster-GCN algorithm [requires `torch-geometric>=1.4.3`].
8 | * **[NeighborSampler](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/ogbn_products_sage.py)**: Mini-batch GNN training using neighbor sampling [requires `torch-geometric>=1.5.0`].
9 | * **[GraphSAINT](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/products/graph_saint.py)**: Mini-batch GCN training using the GraphSAINT algorithm [requires `torch-geometric>=1.5.0`].
10 | * **[SIGN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/products/sign.py)**: Training based on pre-computed GNN representations using Scalable Inception Graph Neural Networks (SIGN) [requires `torch-geometric>=1.5.0`].
11 |
12 | ## Training & Evaluation
13 |
14 | ```
15 | # Run with default config
16 | python cluster_gcn.py
17 |
18 | # Run with custom config
19 | python cluster_gcn.py --hidden_channels=128
20 | ```
21 |
--------------------------------------------------------------------------------
/examples/nodeproppred/products/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run results and print summary statistics.

    Every result is a 3-element sequence. ``print_statistics`` reads
    index 0 as the train score, index 1 as the validation score and
    index 2 as the test score, and reports all of them multiplied by 100.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of the ``runs`` runs.

        ``info`` is stored as-is; this class does not read it afterwards.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one 3-element ``result`` to the zero-based run ``run``."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print statistics for a single run, or mean ± std over all runs.

        With ``run`` given, prints the highest train/valid scores of that
        run and the train/test scores at the entry with the best validation
        score. With ``run=None``, the same four numbers are computed for
        every run and their mean and standard deviation are printed.

        Runs are summarised one by one, so runs with different numbers of
        recorded results are supported; runs without any result are skipped.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            # Model selection: the entry with the highest validation score.
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        best_results = []
        for run_results in self.results:
            if not run_results:
                # Nothing recorded for this run yet; it cannot contribute.
                continue
            # Build one tensor per run: a single tensor over all runs would
            # require every run to have the same number of entries.
            r = 100 * torch.tensor(run_results)
            best = r[:, 1].argmax()
            train1 = r[:, 0].max().item()
            valid = r[:, 1].max().item()
            train2 = r[best, 0].item()
            test = r[best, 2].item()
            best_results.append((train1, valid, train2, test))

        print('All runs:')
        if not best_results:
            print('No results recorded.')
            return

        best_result = torch.tensor(best_results)
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/nodeproppred/products/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 |
6 | from ogb.nodeproppred import PygNodePropPredDataset
7 |
8 |
def save_embedding(model):
    """Save ``model.embedding.weight`` (moved to the CPU) to ``embedding.pt``."""
    weights = model.embedding.weight.data
    torch.save(weights.cpu(), 'embedding.pt')
11 |
12 |
def main():
    """Train Node2Vec on ogbn-products and save the embedding table.

    Hyperparameters come from the command line. The embedding is written
    via ``save_embedding`` every 100 training steps and again at the end of
    each epoch.
    """
    parser = argparse.ArgumentParser(description='OGBN-Products (Node2Vec)')
    # (flag, type, default), registered in exactly this order.
    options = [
        ('--device', int, 0),
        ('--embedding_dim', int, 128),
        ('--walk_length', int, 40),
        ('--context_size', int, 20),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 0.01),
        ('--epochs', int, 1),
        ('--log_steps', int, 1),
    ]
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    args = parser.parse_args()

    # Use the requested GPU when CUDA is available, otherwise the CPU.
    device_name = 'cpu'
    if torch.cuda.is_available():
        device_name = f'cuda:{args.device}'
    device = torch.device(device_name)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    graph = dataset[0]

    model = Node2Vec(graph.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        # ``step`` is one-based.
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if step % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        save_embedding(model)


if __name__ == "__main__":
    main()
59 |
--------------------------------------------------------------------------------
/examples/nodeproppred/proteins/README.md:
--------------------------------------------------------------------------------
1 | # ogbn-proteins
2 |
3 | This repository includes the following example scripts:
4 |
5 | * **[MLP](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/proteins/mlp.py)**: Full-batch MLP training based on average incoming edge features and optional Node2Vec features (`--use_node_embedding`). For training with Node2Vec features, this script requires node embeddings be saved in `embedding.pt`. To generate them, please run `python node2vec.py` [requires `torch-geometric>=1.5.0`].
6 | * **[GNN](https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/proteins/gnn.py)**: Full-batch GNN training using either the GCN or GraphSAGE operator (`--use_sage`) [requires `torch-geometric>=1.6.0`].
7 |
8 | ## Training & Evaluation
9 |
10 | ```
11 | # Run with default config
12 | python gnn.py
13 |
14 | # Run with custom config (adding dropout may improve performance)
15 | python gnn.py --dropout 0.5
16 | ```
17 |
--------------------------------------------------------------------------------
/examples/nodeproppred/proteins/logger.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
class Logger(object):
    """Accumulate per-run results and print summary statistics.

    Every result is a 3-element sequence. ``print_statistics`` reads
    index 0 as the train score, index 1 as the validation score and
    index 2 as the test score, and reports all of them multiplied by 100.
    """

    def __init__(self, runs, info=None):
        """Create one empty result list for each of the ``runs`` runs.

        ``info`` is stored as-is; this class does not read it afterwards.
        """
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one 3-element ``result`` to the zero-based run ``run``."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print statistics for a single run, or mean ± std over all runs.

        With ``run`` given, prints the highest train/valid scores of that
        run and the train/test scores at the entry with the best validation
        score. With ``run=None``, the same four numbers are computed for
        every run and their mean and standard deviation are printed.

        Runs are summarised one by one, so runs with different numbers of
        recorded results are supported; runs without any result are skipped.
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            # Model selection: the entry with the highest validation score.
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f' Final Train: {result[argmax, 0]:.2f}')
            print(f' Final Test: {result[argmax, 2]:.2f}')
            return

        best_results = []
        for run_results in self.results:
            if not run_results:
                # Nothing recorded for this run yet; it cannot contribute.
                continue
            # Build one tensor per run: a single tensor over all runs would
            # require every run to have the same number of entries.
            r = 100 * torch.tensor(run_results)
            best = r[:, 1].argmax()
            train1 = r[:, 0].max().item()
            valid = r[:, 1].max().item()
            train2 = r[best, 0].item()
            test = r[best, 2].item()
            best_results.append((train1, valid, train2, test))

        print('All runs:')
        if not best_results:
            print('No results recorded.')
            return

        best_result = torch.tensor(best_results)
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 1]
        print(f'Highest Valid: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 2]
        print(f' Final Train: {r.mean():.2f} ± {r.std():.2f}')
        r = best_result[:, 3]
        print(f' Final Test: {r.mean():.2f} ± {r.std():.2f}')
45 |
--------------------------------------------------------------------------------
/examples/nodeproppred/proteins/node2vec.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 | from torch_geometric.nn import Node2Vec
5 |
6 | from ogb.nodeproppred import PygNodePropPredDataset
7 |
8 |
def save_embedding(model):
    """Save ``model.embedding.weight`` (moved to the CPU) to ``embedding.pt``."""
    weights = model.embedding.weight.data
    torch.save(weights.cpu(), 'embedding.pt')
11 |
12 |
def main():
    """Train Node2Vec on ogbn-proteins and save the embedding table.

    Hyperparameters come from the command line. The embedding is written
    via ``save_embedding`` every 100 training steps and again at the end of
    each epoch.
    """
    parser = argparse.ArgumentParser(description='OGBN-Proteins (Node2Vec)')
    # (flag, type, default), registered in exactly this order.
    options = [
        ('--device', int, 0),
        ('--embedding_dim', int, 128),
        ('--walk_length', int, 80),
        ('--context_size', int, 20),
        ('--walks_per_node', int, 10),
        ('--batch_size', int, 256),
        ('--lr', float, 0.01),
        ('--epochs', int, 1),
        ('--log_steps', int, 1),
    ]
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    args = parser.parse_args()

    # Use the requested GPU when CUDA is available, otherwise the CPU.
    device_name = 'cpu'
    if torch.cuda.is_available():
        device_name = f'cuda:{args.device}'
    device = torch.device(device_name)

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    graph = dataset[0]

    model = Node2Vec(graph.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node, sparse=True)
    model = model.to(device)

    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        # ``step`` is one-based.
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

            if step % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        save_embedding(model)


if __name__ == "__main__":
    main()
59 |
--------------------------------------------------------------------------------
/ogb/__init__.py:
--------------------------------------------------------------------------------
1 | from .version import __version__
--------------------------------------------------------------------------------
/ogb/graphproppred/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluate import Evaluator
2 | from .dataset import GraphPropPredDataset
3 |
4 | try:
5 | from .dataset_pyg import PygGraphPropPredDataset
6 | except ImportError:
7 | pass
8 |
9 | try:
10 | from .dataset_dgl import DglGraphPropPredDataset
11 | from .dataset_dgl import collate_dgl
12 | except (ImportError, OSError):
13 | pass
14 |
--------------------------------------------------------------------------------
/ogb/graphproppred/make_master_file.py:
--------------------------------------------------------------------------------
1 | ### script for writing meta information of datasets into master.csv
2 | ### for graph property prediction datasets.
3 | import pandas as pd
4 |
5 | dataset_list = []
6 | dataset_dict = {}
7 |
8 | ### start molecule dataset
9 | dataset_dict['ogbg-molbace'] = {'num tasks': 1, 'eval metric': 'rocauc', 'download_name': 'bace'}
10 | dataset_dict['ogbg-molbbbp'] = {'num tasks': 1, 'eval metric': 'rocauc', 'download_name': 'bbbp'}
11 | dataset_dict['ogbg-molclintox'] = {'num tasks': 2, 'eval metric': 'rocauc', 'download_name': 'clintox'}
12 | dataset_dict['ogbg-molmuv'] = {'num tasks': 17, 'eval metric': 'ap', 'download_name': 'muv'}
13 | dataset_dict['ogbg-molpcba'] = {'num tasks': 128, 'eval metric': 'ap', 'download_name': 'pcba'}
14 | dataset_dict['ogbg-molsider'] = {'num tasks': 27, 'eval metric': 'rocauc', 'download_name': 'sider'}
15 | dataset_dict['ogbg-moltox21'] = {'num tasks': 12, 'eval metric': 'rocauc', 'download_name': 'tox21'}
16 | dataset_dict['ogbg-moltoxcast'] = {'num tasks': 617, 'eval metric': 'rocauc', 'download_name': 'toxcast'}
17 | dataset_dict['ogbg-molhiv'] = {'num tasks': 1, 'eval metric': 'rocauc', 'download_name': 'hiv'}
18 | dataset_dict['ogbg-molesol'] = {'num tasks': 1, 'eval metric': 'rmse', 'download_name': 'esol'}
19 | dataset_dict['ogbg-molfreesolv'] = {'num tasks': 1, 'eval metric': 'rmse', 'download_name': 'freesolv'}
20 | dataset_dict['ogbg-mollipo'] = {'num tasks': 1, 'eval metric': 'rmse', 'download_name': 'lipophilicity'}
21 | dataset_dict['ogbg-molchembl'] = {'num tasks': 1310, 'eval metric': 'rocauc', 'download_name': 'chembl'}
22 |
23 | mol_dataset_list = list(dataset_dict.keys())
24 |
25 | for nme in mol_dataset_list:
26 | download_folder_name = dataset_dict[nme]['download_name']
27 | dataset_dict[nme]['version'] = 1
28 | dataset_dict[nme]['url'] = 'http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/' + download_folder_name + '.zip'
29 | dataset_dict[nme]['add_inverse_edge'] = True
30 | dataset_dict[nme]['data type'] = 'mol'
31 | dataset_dict[nme]['has_node_attr'] = True
32 | dataset_dict[nme]['has_edge_attr'] = True
33 |
34 | if dataset_dict[nme]['eval metric'] == 'rmse':
35 | dataset_dict[nme]['task type'] = 'regression'
36 | dataset_dict[nme]['num classes'] = -1 # num classes is not defined for regression datasets.
37 | else:
38 | dataset_dict[nme]['task type'] = 'binary classification'
39 | dataset_dict[nme]['num classes'] = 2
40 |
41 | dataset_dict[nme]['split'] = 'scaffold'
42 |
43 | dataset_dict[nme]['additional node files'] = 'None'
44 | dataset_dict[nme]['additional edge files'] = 'None'
45 | dataset_dict[nme]['binary'] = False
46 |
47 | dataset_list.extend(mol_dataset_list)
48 |
49 | ### end molecule dataset
50 |
51 | ### add ppi dataset (medium)
52 | name = 'ogbg-ppa'
53 | dataset_dict[name] = {'eval metric': 'acc'}
54 | dataset_dict[name]['download_name'] = 'ogbg_ppi_medium'
55 | dataset_dict[name]['version'] = 1
56 | dataset_dict[name]['url'] = 'http://snap.stanford.edu/ogb/data/graphproppred/' + dataset_dict[name]['download_name'] + '.zip'
57 | ## For undirected graphs, we only store one direction of each edge. This flag allows us to add the inverse edges at pre-processing time.
58 | dataset_dict[name]['add_inverse_edge'] = True
59 | dataset_dict[name]['split'] = 'species'
60 | dataset_dict[name]['num tasks'] = 1
61 | dataset_dict[name]['has_node_attr'] = False
62 | dataset_dict[name]['has_edge_attr'] = True
63 | dataset_dict[name]['task type'] = 'multiclass classification'
64 | dataset_dict[name]['num classes'] = 37
65 | dataset_dict[name]['additional node files'] = 'None'
66 | dataset_dict[name]['additional edge files'] = 'None'
67 | dataset_dict[name]['binary'] = False
68 |
69 |
70 | ### add code2 dataset
71 | name = 'ogbg-code2'
72 | dataset_dict[name] = {'eval metric': 'F1'}
73 | dataset_dict[name]['download_name'] = 'code2'
74 | dataset_dict[name]['version'] = 1
75 | dataset_dict[name]['url'] = 'http://snap.stanford.edu/ogb/data/graphproppred/' + dataset_dict[name]['download_name'] + '.zip'
76 | dataset_dict[name]['add_inverse_edge'] = False
77 | dataset_dict[name]['split'] = 'project'
78 | dataset_dict[name]['num tasks'] = 1
79 | dataset_dict[name]['has_node_attr'] = True
80 | dataset_dict[name]['has_edge_attr'] = False
81 | dataset_dict[name]['task type'] = 'subtoken prediction'
82 | dataset_dict[name]['num classes'] = -1
83 | dataset_dict[name]['additional node files'] = 'node_is_attributed,node_dfs_order,node_depth'
84 | dataset_dict[name]['additional edge files'] = 'None'
85 | dataset_dict[name]['binary'] = False
86 |
87 |
88 | df = pd.DataFrame(dataset_dict)
89 | # saving the dataframe
90 | df.to_csv('master.csv')
--------------------------------------------------------------------------------
/ogb/graphproppred/master.csv:
--------------------------------------------------------------------------------
1 | ,ogbg-molbace,ogbg-molbbbp,ogbg-molclintox,ogbg-molmuv,ogbg-molpcba,ogbg-molsider,ogbg-moltox21,ogbg-moltoxcast,ogbg-molhiv,ogbg-molesol,ogbg-molfreesolv,ogbg-mollipo,ogbg-molchembl,ogbg-ppa,ogbg-code2
2 | num tasks,1,1,2,17,128,27,12,617,1,1,1,1,1310,1,1
3 | eval metric,rocauc,rocauc,rocauc,ap,ap,rocauc,rocauc,rocauc,rocauc,rmse,rmse,rmse,rocauc,acc,F1
4 | download_name,bace,bbbp,clintox,muv,pcba,sider,tox21,toxcast,hiv,esol,freesolv,lipophilicity,chembl,ogbg_ppi_medium,code2
5 | version,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
6 | url,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bace.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/bbbp.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/clintox.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/muv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/sider.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/tox21.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/toxcast.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/esol.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/freesolv.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/lipophilicity.zip,http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/chembl.zip,http://snap.stanford.edu/ogb/data/graphproppred/ogbg_ppi_medium.zip,http://snap.stanford.edu/ogb/data/graphproppred/code2.zip
7 | add_inverse_edge,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False
8 | data type,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,mol,,
9 | has_node_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True
10 | has_edge_attr,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False
11 | task type,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,binary classification,regression,regression,regression,binary classification,multiclass classification,subtoken prediction
12 | num classes,2,2,2,2,2,2,2,2,2,-1,-1,-1,2,37,-1
13 | split,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,scaffold,species,project
14 | additional node files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,"node_is_attributed,node_dfs_order,node_depth"
15 | additional edge files,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None
16 | binary,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
17 |
--------------------------------------------------------------------------------
/ogb/graphproppred/mol_encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims
3 |
class AtomEncoder(torch.nn.Module):
    """Embed integer-coded atom features and sum the per-feature embeddings.

    One ``torch.nn.Embedding`` table is created for every entry of the
    feature-dimension list, and each table is Xavier-uniform initialised.

    Args:
        emb_dim: Size of every embedding vector.
        optional_full_atom_features_dims: Optional iterable with the number
            of categories of each atom feature.  When ``None``, the list
            returned by ``get_atom_feature_dims()`` is used instead.
    """

    def __init__(self, emb_dim, optional_full_atom_features_dims=None):
        super().__init__()

        self.atom_embedding_list = torch.nn.ModuleList()

        # Caller-supplied cardinalities take precedence over the defaults.
        full_atom_feature_dims = (
            get_atom_feature_dims()
            if optional_full_atom_features_dims is None
            else optional_full_atom_features_dims
        )

        for num_categories in full_atom_feature_dims:
            table = torch.nn.Embedding(num_categories, emb_dim)
            torch.nn.init.xavier_uniform_(table.weight.data)
            self.atom_embedding_list.append(table)

    def forward(self, x):
        """Return the sum over columns ``i`` of ``table_i(x[:, i])``.

        Column ``i`` of ``x`` is looked up in the ``i``-th embedding table.
        The integer ``0`` is returned when ``x`` has no columns.
        """
        return sum(
            self.atom_embedding_list[col](x[:, col])
            for col in range(x.shape[1])
        )
27 |
28 |
class BondEncoder(torch.nn.Module):
    """Embed integer-coded bond features and sum the per-feature embeddings.

    One ``torch.nn.Embedding`` table is created for every entry of the
    feature-dimension list, and each table is Xavier-uniform initialised.

    Args:
        emb_dim: Size of every embedding vector.
        optional_full_bond_features_dims: Optional iterable with the number
            of categories of each bond feature (mirrors the equivalent
            ``AtomEncoder`` argument).  When ``None`` (the default), the list
            returned by ``get_bond_feature_dims()`` is used.
    """

    def __init__(self, emb_dim, optional_full_bond_features_dims=None):
        super(BondEncoder, self).__init__()

        if optional_full_bond_features_dims is not None:
            full_bond_feature_dims = optional_full_bond_features_dims
        else:
            full_bond_feature_dims = get_bond_feature_dims()

        self.bond_embedding_list = torch.nn.ModuleList()

        for dim in full_bond_feature_dims:
            emb = torch.nn.Embedding(dim, emb_dim)
            torch.nn.init.xavier_uniform_(emb.weight.data)
            self.bond_embedding_list.append(emb)

    def forward(self, edge_attr):
        """Return the sum over columns ``i`` of ``table_i(edge_attr[:, i])``.

        Column ``i`` of ``edge_attr`` is looked up in the ``i``-th embedding
        table.  The integer ``0`` is returned when ``edge_attr`` has no
        columns.
        """
        bond_embedding = 0
        for i in range(edge_attr.shape[1]):
            bond_embedding += self.bond_embedding_list[i](edge_attr[:, i])

        return bond_embedding
48 |
49 |
50 | if __name__ == '__main__':
51 | from loader import GraphClassificationPygDataset
52 | dataset = GraphClassificationPygDataset(name = 'tox21')
53 | atom_enc = AtomEncoder(100)
54 | bond_enc = BondEncoder(100)
55 |
56 | print(atom_enc(dataset[0].x))
57 | print(bond_enc(dataset[0].edge_attr))
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/ogb/io/__init__.py:
--------------------------------------------------------------------------------
1 | from .save_dataset import DatasetSaver
--------------------------------------------------------------------------------
/ogb/io/read_graph_dgl.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import torch
3 | import os.path as osp
4 | import numpy as np
5 | import dgl
6 | from ogb.io.read_graph_raw import read_csv_graph_raw, read_csv_heterograph_raw, read_binary_graph_raw, read_binary_heterograph_raw
7 | from tqdm.auto import tqdm
8 |
def read_graph_dgl(raw_dir, add_inverse_edge = False, additional_node_files = [], additional_edge_files = [], binary=False):
    """Read the raw graphs under ``raw_dir`` and convert each to a DGL graph.

    The raw graphs come from ``read_binary_graph_raw`` when ``binary`` is
    true and from ``read_csv_graph_raw`` otherwise; only the csv reader is
    given the additional node/edge file names.

    Edge and node features that are not ``None`` are stored under the
    ``'feat'`` key of ``edata`` / ``ndata``.  Every additional file is stored
    under its name with the first five characters removed
    (e.g. ``node_year`` -> ``year``).

    Returns:
        list: one DGL graph per raw graph, in reading order.
    """
    if binary:
        # npz
        raw_graphs = read_binary_graph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        raw_graphs = read_csv_graph_raw(
            raw_dir,
            add_inverse_edge,
            additional_node_files = additional_node_files,
            additional_edge_files = additional_edge_files,
        )

    dgl_graph_list = []

    print('Converting graphs into DGL objects...')

    for raw in tqdm(raw_graphs):
        endpoints = (raw['edge_index'][0], raw['edge_index'][1])
        dgl_graph = dgl.graph(endpoints, num_nodes = raw['num_nodes'])

        edge_feat = raw['edge_feat']
        if edge_feat is not None:
            dgl_graph.edata['feat'] = torch.from_numpy(edge_feat)

        node_feat = raw['node_feat']
        if node_feat is not None:
            dgl_graph.ndata['feat'] = torch.from_numpy(node_feat)

        # key[5:] drops the leading five characters of the file name.
        for node_key in additional_node_files:
            dgl_graph.ndata[node_key[5:]] = torch.from_numpy(raw[node_key])

        for edge_key in additional_edge_files:
            dgl_graph.edata[edge_key[5:]] = torch.from_numpy(raw[edge_key])

        dgl_graph_list.append(dgl_graph)

    return dgl_graph_list
40 |
41 |
def read_heterograph_dgl(raw_dir, add_inverse_edge = False, additional_node_files = [], additional_edge_files = [], binary=False):
    """Read the raw heterographs under ``raw_dir`` as DGL heterographs.

    The raw graphs come from ``read_binary_heterograph_raw`` when ``binary``
    is true and from ``read_csv_heterograph_raw`` otherwise; only the csv
    reader is given the additional node/edge file names.

    Per-type edge/node features that are present are stored under ``'feat'``.
    Every additional file is stored, per node type or edge triplet, under its
    name with the first five characters removed.

    Returns:
        list: one DGL heterograph per raw graph, in reading order.
    """
    if binary:
        # npz
        raw_graphs = read_binary_heterograph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        raw_graphs = read_csv_heterograph_raw(
            raw_dir,
            add_inverse_edge,
            additional_node_files = additional_node_files,
            additional_edge_files = additional_edge_files,
        )

    dgl_graph_list = []

    print('Converting graphs into DGL objects...')

    for raw in tqdm(raw_graphs):
        # add edge connectivity: one list of (source, target) pairs per triplet
        edges_by_triplet = {
            triplet: list(zip(edge_index[0], edge_index[1]))
            for triplet, edge_index in raw['edge_index_dict'].items()
        }

        hetero = dgl.heterograph(edges_by_triplet, num_nodes_dict = raw['num_nodes_dict'])

        edge_feats = raw['edge_feat_dict']
        if edge_feats is not None:
            for triplet, feat in edge_feats.items():
                hetero.edges[triplet].data['feat'] = torch.from_numpy(feat)

        node_feats = raw['node_feat_dict']
        if node_feats is not None:
            for nodetype, feat in node_feats.items():
                hetero.nodes[nodetype].data['feat'] = torch.from_numpy(feat)

        # key[5:] drops the leading five characters of the file name.
        for node_key in additional_node_files:
            for nodetype, values in raw[node_key].items():
                hetero.nodes[nodetype].data[node_key[5:]] = torch.from_numpy(values)

        for edge_key in additional_edge_files:
            for triplet, values in raw[edge_key].items():
                hetero.edges[triplet].data[edge_key[5:]] = torch.from_numpy(values)

        dgl_graph_list.append(hetero)

    return dgl_graph_list
85 |
86 |
87 | if __name__ == '__main__':
88 | pass
89 |
--------------------------------------------------------------------------------
/ogb/io/read_graph_pyg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import torch
3 | from torch_geometric.data import Data
4 | import os.path as osp
5 | import numpy as np
6 | from ogb.io.read_graph_raw import read_csv_graph_raw, read_csv_heterograph_raw, read_binary_graph_raw, read_binary_heterograph_raw
7 | from tqdm.auto import tqdm
8 |
def read_graph_pyg(raw_dir, add_inverse_edge = False, additional_node_files = [], additional_edge_files = [], binary = False):
    """Read the raw graphs under ``raw_dir`` and convert each to a PyG ``Data``.

    The raw graphs come from ``read_binary_graph_raw`` when ``binary`` is
    true and from ``read_csv_graph_raw`` otherwise; only the csv reader is
    given the additional node/edge file names.

    Each ``Data`` object receives ``num_nodes`` and ``edge_index``, plus
    ``edge_attr`` / ``x`` when the raw edge / node features are not ``None``.
    Additional files are stored under their full file name.  Entries are
    removed from the raw dict once they have been converted.

    Returns:
        list: one ``Data`` object per raw graph, in reading order.
    """
    if binary:
        # npz
        raw_graphs = read_binary_graph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        raw_graphs = read_csv_graph_raw(
            raw_dir,
            add_inverse_edge,
            additional_node_files = additional_node_files,
            additional_edge_files = additional_edge_files,
        )

    pyg_graph_list = []

    print('Converting graphs into PyG objects...')

    for raw in tqdm(raw_graphs):
        data = Data()
        # pop() hands over the value and drops it from the raw dict in one go.
        data.num_nodes = raw.pop('num_nodes')
        data.edge_index = torch.from_numpy(raw.pop('edge_index'))

        if raw['edge_feat'] is not None:
            data.edge_attr = torch.from_numpy(raw.pop('edge_feat'))

        if raw['node_feat'] is not None:
            data.x = torch.from_numpy(raw.pop('node_feat'))

        for node_key in additional_node_files:
            data[node_key] = torch.from_numpy(raw.pop(node_key))

        for edge_key in additional_edge_files:
            data[edge_key] = torch.from_numpy(raw.pop(edge_key))

        pyg_graph_list.append(data)

    return pyg_graph_list
49 |
50 |
def read_heterograph_pyg(raw_dir, add_inverse_edge = False, additional_node_files = [], additional_edge_files = [], binary = False):
    """Read the raw heterographs under ``raw_dir`` as PyG ``Data`` objects.

    The raw graphs come from ``read_binary_heterograph_raw`` when ``binary``
    is true and from ``read_csv_heterograph_raw`` otherwise; only the csv
    reader is given the additional node/edge file names.

    Each ``Data`` object receives ``__num_nodes__`` / ``num_nodes_dict``,
    ``edge_index_dict`` and, when the raw feature dicts are not ``None``,
    ``edge_attr_dict`` / ``x_dict``.  Additional files are stored under their
    full file name as dicts keyed by node type or edge triplet.

    Returns:
        list: one ``Data`` object per raw graph, in reading order.
    """
    if binary:
        # npz
        raw_graphs = read_binary_heterograph_raw(raw_dir, add_inverse_edge)
    else:
        # csv
        raw_graphs = read_csv_heterograph_raw(
            raw_dir,
            add_inverse_edge,
            additional_node_files = additional_node_files,
            additional_edge_files = additional_edge_files,
        )

    pyg_graph_list = []

    print('Converting graphs into PyG objects...')

    for raw in tqdm(raw_graphs):
        data = Data()

        num_nodes_dict = raw['num_nodes_dict']
        data.__num_nodes__ = num_nodes_dict
        data.num_nodes_dict = num_nodes_dict

        # add edge connectivity
        data.edge_index_dict = {
            triplet: torch.from_numpy(edge_index)
            for triplet, edge_index in raw['edge_index_dict'].items()
        }
        del raw['edge_index_dict']

        if raw['edge_feat_dict'] is not None:
            data.edge_attr_dict = {
                triplet: torch.from_numpy(feat)
                for triplet, feat in raw['edge_feat_dict'].items()
            }
            del raw['edge_feat_dict']

        if raw['node_feat_dict'] is not None:
            data.x_dict = {
                nodetype: torch.from_numpy(feat)
                for nodetype, feat in raw['node_feat_dict'].items()
            }
            del raw['node_feat_dict']

        for node_key in additional_node_files:
            data[node_key] = {
                nodetype: torch.from_numpy(values)
                for nodetype, values in raw[node_key].items()
            }
            del raw[node_key]

        for edge_key in additional_edge_files:
            data[edge_key] = {
                triplet: torch.from_numpy(values)
                for triplet, values in raw[edge_key].items()
            }
            del raw[edge_key]

        pyg_graph_list.append(data)

    return pyg_graph_list
108 |
109 | if __name__ == '__main__':
110 | pass
111 |
--------------------------------------------------------------------------------
/ogb/linkproppred/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluate import Evaluator
2 | from .dataset import LinkPropPredDataset
3 |
4 | try:
5 | from .dataset_pyg import PygLinkPropPredDataset
6 | except ImportError:
7 | pass
8 |
9 | try:
10 | from .dataset_dgl import DglLinkPropPredDataset
11 | except (ImportError, OSError):
12 | pass
13 |
--------------------------------------------------------------------------------
/ogb/linkproppred/master.csv:
--------------------------------------------------------------------------------
1 | ,ogbl-ppa,ogbl-collab,ogbl-citation2,ogbl-wikikg2,ogbl-ddi,ogbl-biokg,ogbl-vessel
2 | eval metric,hits@100,hits@50,mrr,mrr,hits@20,mrr,rocauc
3 | task type,link prediction,link prediction,link prediction,KG completion,link prediction,KG completion,link prediction
4 | download_name,ppassoc,collab,citation-v2,wikikg-v2,ddi,biokg,vessel
5 | version,1,1,1,1,1,1,1
6 | url,http://snap.stanford.edu/ogb/data/linkproppred/ppassoc.zip,http://snap.stanford.edu/ogb/data/linkproppred/collab.zip,http://snap.stanford.edu/ogb/data/linkproppred/citation-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/wikikg-v2.zip,http://snap.stanford.edu/ogb/data/linkproppred/ddi.zip,http://snap.stanford.edu/ogb/data/linkproppred/biokg.zip,http://snap.stanford.edu/ogb/data/linkproppred/vessel.zip
7 | add_inverse_edge,True,True,False,False,True,False,False
8 | has_node_attr,True,True,True,False,False,False,True
9 | has_edge_attr,False,False,False,False,False,False,True
10 | split,throughput,time,time,time,target,random,spatial
11 | additional node files,None,None,node_year,None,None,None,None
12 | additional edge files,None,"edge_weight,edge_year",None,edge_reltype,None,edge_reltype,None
13 | is hetero,False,False,False,False,False,True,False
14 | binary,False,False,False,False,False,False,True
15 |
--------------------------------------------------------------------------------
/ogb/lsc/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from .pcqm4m import PCQM4MDataset, PCQM4MEvaluator
3 | except ImportError:
4 | pass
5 |
6 | try:
7 | from .pcqm4m_pyg import PygPCQM4MDataset
8 | except ImportError:
9 | pass
10 |
11 | try:
12 | from .pcqm4m_dgl import DglPCQM4MDataset
13 | except (ImportError, OSError):
14 | pass
15 |
16 | try:
17 | from .pcqm4mv2 import PCQM4Mv2Dataset, PCQM4Mv2Evaluator
18 | except ImportError:
19 | pass
20 |
21 | try:
22 | from .pcqm4mv2_pyg import PygPCQM4Mv2Dataset
23 | except ImportError:
24 | pass
25 |
26 | try:
27 | from .pcqm4mv2_dgl import DglPCQM4Mv2Dataset
28 | except (ImportError, OSError):
29 | pass
30 |
31 | from .mag240m import MAG240MDataset, MAG240MEvaluator
32 | from .wikikg90m import WikiKG90MDataset, WikiKG90MEvaluator
33 | from .wikikg90mv2 import WikiKG90Mv2Dataset, WikiKG90Mv2Evaluator
34 |
--------------------------------------------------------------------------------
/ogb/lsc/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | # Assigning the whole test into test-dev and test-challenge
def split_test(split_dict):
    """Split ``split_dict['test']`` into test-dev and test-challenge, in place.

    Positions whose index modulo 5 is 0, 1 or 2 go to ``'test-dev'``; the
    remaining positions go to ``'test-challenge'``.  The full array is kept
    under ``'test-whole'`` and the ``'test'`` key is removed.

    When ``'test'`` is absent, the dict is only checked (via ``assert``) to
    already contain the three derived keys and is left untouched.

    Returns:
        None. ``split_dict`` is modified in place.
    """
    if 'test' not in split_dict:
        for required_key in ('test-whole', 'test-dev', 'test-challenge'):
            assert required_key in split_dict
        return

    test_idx = split_dict['test']
    num_test = len(test_idx)

    if isinstance(test_idx, torch.Tensor):
        goes_to_dev = torch.arange(num_test) % 5 < 3
        dev_idx = torch.nonzero(goes_to_dev, as_tuple=True)[0]
        challenge_idx = torch.nonzero(~goes_to_dev, as_tuple=True)[0]
    else:
        goes_to_dev = np.arange(num_test) % 5 < 3
        dev_idx = np.nonzero(goes_to_dev)[0]
        challenge_idx = np.nonzero(~goes_to_dev)[0]

    split_dict['test-whole'] = test_idx
    split_dict['test-dev'] = test_idx[dev_idx]
    split_dict['test-challenge'] = test_idx[challenge_idx]

    # The two parts must cover the whole test split.
    assert len(split_dict['test-dev']) + len(split_dict['test-challenge']) == num_test

    del split_dict['test']
--------------------------------------------------------------------------------
/ogb/nodeproppred/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluate import Evaluator
2 | from .dataset import NodePropPredDataset
3 |
4 | try:
5 | from .dataset_pyg import PygNodePropPredDataset
6 | except ImportError:
7 | pass
8 |
9 | try:
10 | from .dataset_dgl import DglNodePropPredDataset
11 | except (ImportError, OSError):
12 | pass
13 |
--------------------------------------------------------------------------------
/ogb/nodeproppred/make_master_file.py:
--------------------------------------------------------------------------------
1 | ### script for writing meta information of datasets into master.csv
2 | ### for node property prediction datasets.
3 | import pandas as pd
4 |
dataset_dict = {}
dataset_list = []

# Every node property prediction archive is downloaded from this location.
_DOWNLOAD_ROOT = 'http://snap.stanford.edu/ogb/data/nodeproppred/'


def _make_entry(num_tasks, num_classes, eval_metric, task_type, download_name,
                version, add_inverse_edge, has_node_attr, has_edge_attr, split,
                additional_node_files, additional_edge_files, is_hetero, binary):
    """Build the meta-information dict of one dataset.

    Keys are always inserted in the same order so that every dataset column
    of the resulting table lists its fields identically.
    """
    return {
        'num tasks': num_tasks,
        'num classes': num_classes,
        'eval metric': eval_metric,
        'task type': task_type,
        'download_name': download_name,
        'version': version,
        'url': _DOWNLOAD_ROOT + download_name + '.zip',
        # For undirected graphs only one direction of each edge is stored.
        # This flag allows the inverse edges to be added at pre-processing time.
        'add_inverse_edge': add_inverse_edge,
        'has_node_attr': has_node_attr,
        'has_edge_attr': has_edge_attr,
        'split': split,
        'additional node files': additional_node_files,
        'additional edge files': additional_edge_files,
        'is hetero': is_hetero,
        'binary': binary,
    }


### add meta-information about protein function prediction task
name = 'ogbn-proteins'
dataset_dict[name] = _make_entry(
    num_tasks=112, num_classes=2, eval_metric='rocauc',
    task_type='binary classification',
    download_name='proteins', version=1,
    add_inverse_edge=True, has_node_attr=False, has_edge_attr=True,
    split='species',
    additional_node_files='node_species', additional_edge_files='None',
    is_hetero=False, binary=False,
)

### add meta-information about product category prediction task
name = 'ogbn-products'
dataset_dict[name] = _make_entry(
    num_tasks=1, num_classes=47, eval_metric='acc',
    task_type='multiclass classification',
    download_name='products', version=1,
    add_inverse_edge=True, has_node_attr=True, has_edge_attr=False,
    split='sales_ranking',
    additional_node_files='None', additional_edge_files='None',
    is_hetero=False, binary=False,
)

### add meta-information about arxiv category prediction task
name = 'ogbn-arxiv'
dataset_dict[name] = _make_entry(
    num_tasks=1, num_classes=40, eval_metric='acc',
    task_type='multiclass classification',
    download_name='arxiv', version=1,
    add_inverse_edge=False, has_node_attr=True, has_edge_attr=False,
    split='time',
    additional_node_files='node_year', additional_edge_files='None',
    is_hetero=False, binary=False,
)

### add meta-information about paper venue prediction task
name = 'ogbn-mag'
dataset_dict[name] = _make_entry(
    num_tasks=1, num_classes=349, eval_metric='acc',
    task_type='multiclass classification',
    download_name='mag', version=2,
    add_inverse_edge=False, has_node_attr=True, has_edge_attr=False,
    split='time',
    additional_node_files='node_year', additional_edge_files='edge_reltype',
    is_hetero=True, binary=False,
)

### add meta-information about paper category prediction in huge paper citation network
name = 'ogbn-papers100M'
dataset_dict[name] = _make_entry(
    num_tasks=1, num_classes=172, eval_metric='acc',
    task_type='multiclass classification',
    download_name='papers100M-bin', version=1,
    add_inverse_edge=False, has_node_attr=True, has_edge_attr=False,
    split='time',
    additional_node_files='node_year', additional_edge_files='None',
    is_hetero=False, binary=True,
)

# One column per dataset, one row per meta-information field.
df = pd.DataFrame(dataset_dict)
# saving the dataframe
df.to_csv('master.csv')
--------------------------------------------------------------------------------
/ogb/nodeproppred/master.csv:
--------------------------------------------------------------------------------
1 | ,ogbn-proteins,ogbn-products,ogbn-arxiv,ogbn-mag,ogbn-papers100M
2 | num tasks,112,1,1,1,1
3 | num classes,2,47,40,349,172
4 | eval metric,rocauc,acc,acc,acc,acc
5 | task type,binary classification,multiclass classification,multiclass classification,multiclass classification,multiclass classification
6 | download_name,proteins,products,arxiv,mag,papers100M-bin
7 | version,1,1,1,2,1
8 | url,http://snap.stanford.edu/ogb/data/nodeproppred/proteins.zip,http://snap.stanford.edu/ogb/data/nodeproppred/products.zip,http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip,http://snap.stanford.edu/ogb/data/nodeproppred/mag.zip,http://snap.stanford.edu/ogb/data/nodeproppred/papers100M-bin.zip
9 | add_inverse_edge,True,True,False,False,False
10 | has_node_attr,False,True,True,True,True
11 | has_edge_attr,True,False,False,False,False
12 | split,species,sales_ranking,time,time,time
13 | additional node files,node_species,None,node_year,node_year,node_year
14 | additional edge files,None,None,None,edge_reltype,None
15 | is hetero,False,False,False,True,False
16 | binary,False,False,False,False,True
17 |
--------------------------------------------------------------------------------
/ogb/utils/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from .mol import smiles2graph
3 | except ImportError:
4 | pass
--------------------------------------------------------------------------------
/ogb/utils/mol.py:
--------------------------------------------------------------------------------
1 | from ogb.utils.features import (allowable_features, atom_to_feature_vector,
2 | bond_to_feature_vector, atom_feature_vector_to_dict, bond_feature_vector_to_dict)
3 | from rdkit import Chem
4 | import numpy as np
5 |
6 |
def ReorderCanonicalRankAtoms(mol):
    """Renumber the atoms of ``mol`` by ascending ``Chem.CanonicalRankAtoms`` rank.

    Returns:
        tuple: ``(mol_renum, order)`` where ``mol_renum`` is the result of
        ``Chem.RenumberAtoms(mol, order)`` and ``order`` is the tuple of atom
        indices sorted by their rank.
    """
    ranked = sorted(
        (rank, atom_idx)
        for atom_idx, rank in enumerate(Chem.CanonicalRankAtoms(mol))
    )
    # Transpose the (rank, index) pairs and keep only the index column.
    columns = tuple(zip(*ranked))
    order = columns[1]
    return Chem.RenumberAtoms(mol, order), order
11 |
def smiles2graph(smiles_string, removeHs=True, reorder_atoms=False):
    """
    Converts SMILES string to graph Data object
    :input: SMILES string (str)
    :param removeHs: when False, ``Chem.AddHs`` is applied to the parsed
        molecule; when True the molecule is used as parsed
    :param reorder_atoms: when True, atoms are renumbered with
        ``ReorderCanonicalRankAtoms`` before features are extracted
    :return: graph object (dict) with keys ``edge_index`` (int64 array of
        shape [2, num_edges], both directions of every bond), ``edge_feat``
        (int64 array of shape [num_edges, 3]), ``node_feat`` (int64 array of
        atom feature vectors) and ``num_nodes``
    :raises ValueError: if RDKit cannot parse ``smiles_string``
    """

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a clear message instead of an opaque error further down.
        raise ValueError('Could not parse SMILES string: {!r}'.format(smiles_string))
    mol = mol if removeHs else Chem.AddHs(mol)
    if reorder_atoms:
        mol, _ = ReorderCanonicalRankAtoms(mol)

    # atoms
    atom_features_list = [atom_to_feature_vector(atom) for atom in mol.GetAtoms()]
    x = np.array(atom_features_list, dtype = np.int64)

    # bonds
    num_bond_features = 3  # bond type, bond stereo, is_conjugated
    if len(mol.GetBonds()) > 0: # mol has bonds
        edges_list = []
        edge_features_list = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()

            edge_feature = bond_to_feature_vector(bond)

            # add edges in both directions
            edges_list.append((i, j))
            edge_features_list.append(edge_feature)
            edges_list.append((j, i))
            edge_features_list.append(edge_feature)

        # data.edge_index: Graph connectivity in COO format with shape [2, num_edges]
        edge_index = np.array(edges_list, dtype = np.int64).T

        # data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]
        edge_attr = np.array(edge_features_list, dtype = np.int64)

    else:   # mol has no bonds
        edge_index = np.empty((2, 0), dtype = np.int64)
        edge_attr = np.empty((0, num_bond_features), dtype = np.int64)

    graph = dict()
    graph['edge_index'] = edge_index
    graph['edge_feat'] = edge_attr
    graph['node_feat'] = x
    graph['num_nodes'] = len(x)

    return graph
64 |
65 |
66 | if __name__ == '__main__':
67 | graph = smiles2graph('O1C=C[C@H]([C@H]1O2)c3c2cc(OC)c4c3OC(=O)C5=C4CCC(=O)5')
68 | print(graph)
69 |
--------------------------------------------------------------------------------
/ogb/utils/torch_util.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
def replace_numpy_with_torchtensor(obj):
    """Replace every numpy array inside ``obj`` with a torch tensor.

    Dicts and lists are walked recursively and modified in place; any other
    container type (e.g. a tuple) is left untouched.  When ``obj`` itself is
    a numpy array, the converted tensor is returned; otherwise ``obj`` is
    returned.
    """
    if isinstance(obj, dict):
        slots = obj.items()
    elif isinstance(obj, list):
        slots = enumerate(obj)
    elif isinstance(obj, np.ndarray):
        # if the original input obj is numpy array
        return torch.from_numpy(obj)
    else:
        return obj

    # Only existing slots are reassigned, so the container size never changes
    # while it is being iterated.
    for slot, value in slots:
        if isinstance(value, np.ndarray):
            obj[slot] = torch.from_numpy(value)
        else:
            replace_numpy_with_torchtensor(value)

    return obj
26 |
27 |
def all_numpy(obj):
    """Return True iff ``obj`` contains only numpy arrays, ints and floats.

    Dicts (their values) and lists are checked recursively; every leaf must
    be an ``np.ndarray``, ``int`` or ``float`` (so e.g. torch tensors are
    rejected).  Empty dicts and lists are considered valid.

    The results of the recursive checks are propagated to the caller, so an
    invalid value nested inside a dict or list makes the whole check fail.
    """
    if isinstance(obj, dict):
        return all(all_numpy(value) for value in obj.values())
    if isinstance(obj, list):
        return all(all_numpy(item) for item in obj)
    return isinstance(obj, (np.ndarray, int, float))
--------------------------------------------------------------------------------
/ogb/utils/url.py:
--------------------------------------------------------------------------------
1 | import urllib.request as ur
2 | import zipfile
3 | import os
4 | import os.path as osp
5 | from six.moves import urllib
6 | import errno
7 | from tqdm.auto import tqdm
8 |
GBFACTOR = float(1 << 30)


def decide_download(url):
    """Decide whether the file at ``url`` should be downloaded.

    The size is taken from the ``Content-Length`` header.  When it exceeds
    1GB the user is asked for confirmation on stdin and the answer decides;
    otherwise the download is approved without asking.

    Returns:
        bool: True when the download should proceed.
    """
    # Only the headers are needed, so release the connection right away
    # instead of leaving the response open.
    with ur.urlopen(url) as response:
        size = int(response.info()["Content-Length"])/GBFACTOR

    ### confirm if larger than 1GB
    if size > 1:
        return input("This will download %.2fGB. Will you proceed? (y/N)\n" % (size)).lower() == "y"
    else:
        return True
20 |
def makedirs(path):
    """Create directory ``path`` (and missing parents) if it does not exist.

    ``path`` is normalised and ``~`` is expanded first.  An already existing
    directory is not an error; any other failure (permission denied, a
    regular file in the way, ...) is raised as ``OSError``.
    """
    # exist_ok=True tolerates only "directory already exists"; every other
    # failure propagates to the caller.
    os.makedirs(osp.expanduser(osp.normpath(path)), exist_ok=True)
27 |
def download_url(url, folder, log=True):
    r"""Downloads the content of an URL to a specific folder.

    The file is stored under the last path component of ``url``.  A
    non-empty file that already exists at that location is reused without
    downloading.  A partially written file is removed when the download is
    interrupted.

    Args:
        url (string): The url.
        folder (string): The folder.
        log (bool, optional): If :obj:`False`, will not print anything to the
            console. (default: :obj:`True`)

    Returns:
        string: Path of the downloaded (or reused) file.

    Raises:
        RuntimeError: If the download is interrupted; the original exception
            is attached as ``__cause__``.
    """

    filename = url.rpartition('/')[2]
    path = osp.join(folder, filename)

    if osp.exists(path) and osp.getsize(path) > 0:  # pragma: no cover
        if log:
            print('Using exist file', filename)
        return path

    if log:
        print('Downloading', url)

    makedirs(folder)

    chunk_size = 1024*1024
    downloaded_size = 0

    # The context manager closes the connection on success and on failure.
    with ur.urlopen(url) as data:
        size = int(data.info()["Content-Length"])
        # Two extra iterations beyond the full chunks cover the final
        # partial chunk.
        num_iter = int(size/chunk_size) + 2

        try:
            with open(path, 'wb') as f:
                pbar = tqdm(range(num_iter))
                for _ in pbar:
                    chunk = data.read(chunk_size)
                    downloaded_size += len(chunk)
                    pbar.set_description("Downloaded {:.2f} GB".format(float(downloaded_size)/GBFACTOR))
                    f.write(chunk)
        except BaseException as exc:
            # BaseException on purpose: a Ctrl-C (KeyboardInterrupt) must
            # also remove the partial file so it is not reused next time.
            if os.path.exists(path):
                os.remove(path)
            raise RuntimeError('Stopped downloading due to interruption.') from exc

    return path
73 |
def maybe_log(path, log=True):
    """Print an ``Extracting <path>`` notice unless ``log`` is false."""
    if not log:
        return
    print('Extracting', path)


def extract_zip(path, folder, log=True):
    r"""Extracts a zip archive to a specific folder.

    Args:
        path (string): The path to the zip archive.
        folder (string): The folder.
        log (bool, optional): If :obj:`False`, will not print anything to the
            console. (default: :obj:`True`)
    """
    maybe_log(path, log)
    # ZipFile opens in read mode by default.
    with zipfile.ZipFile(path) as archive:
        archive.extractall(folder)
89 |
# Executing this module as a script does nothing: the guard body is empty.
if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/ogb/version.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from threading import Thread
4 |
# Version string of the package; check() compares it against the latest
# release reported by `check_outdated`.
__version__ = '1.3.6'

# `outdated` is handled as an optional dependency: when it cannot be
# imported, `check_outdated` is set to None and the check below is skipped.
try:
    # Set in the process environment before `outdated` is imported.
    # NOTE(review): presumably read by `outdated` to silence its own
    # warnings -- confirm against that package's documentation.
    os.environ['OUTDATED_IGNORE'] = '1'
    from outdated import check_outdated  # noqa
except ImportError:
    check_outdated = None
12 |
13 |
def check():
    """Log a warning when the installed ``ogb`` version is out of date.

    This is a best-effort check: any exception raised while querying or
    reporting is swallowed, so it never propagates to the caller.
    """
    try:
        outdated, newest = check_outdated('ogb', __version__)
        if not outdated:
            return
        message = (
            f'The OGB package is out of date. Your version is '
            f'{__version__}, while the latest version is {newest}.')
        logging.warning(message)
    except Exception:
        # Deliberately ignored: a failed version check must not affect users.
        return
23 |
24 |
# Start the version check on its own thread when this module is executed, so
# the module body does not wait for check() to finish. Skipped entirely when
# `outdated` could not be imported (check_outdated is None).
# NOTE(review): `daemon` is not set, so the thread inherits the daemon flag of
# the importing thread -- consider whether interpreter exit should be allowed
# to wait on this check.
if check_outdated is not None:
    thread = Thread(target=check)
    thread.start()
28 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from os import path
3 | import sys
4 |
5 | from io import open
6 |
# Locate the directory containing this script and put its `ogb` subdirectory
# first on the import path, so that `version` resolves to ogb/version.py as a
# top-level module.
here = path.abspath(path.dirname(__file__))
sys.path.insert(0, path.join(here, 'ogb'))
from version import __version__

print('version')
print(__version__)

# README.md supplies the long description passed to setup().
readme_path = path.join(here, 'README.md')
with open(readme_path, encoding='utf-8') as readme:
    long_description = readme.read()

# NOTE(review): setuptools resolves `package_data` patterns relative to the
# package directory, while these entries carry an `ogb/` prefix -- confirm
# they match, or that MANIFEST.in + include_package_data ships the CSV files.
package_data_list = [
    'ogb/graphproppred/master.csv',
    'ogb/nodeproppred/master.csv',
    'ogb/linkproppred/master.csv',
]

# Search keywords attached to the distribution metadata.
keywords = [
    'pytorch',
    'graph machine learning',
    'graph representation learning',
    'graph neural networks',
]

# Runtime dependencies with their minimum supported versions.
install_requires = [
    'torch>=1.6.0',
    'numpy>=1.16.0',
    'tqdm>=4.29.0',
    'scikit-learn>=0.20.0',
    'pandas>=0.24.0',
    'six>=1.12.0',
    'urllib3>=1.24.0',
    'outdated>=0.2.0',
    'joblib>=1.3.2',
]

# Trove classifiers attached to the distribution metadata.
classifiers = [
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'Intended Audience :: Science/Research',
    'Programming Language :: Python :: 3.6',
    'Programming Language :: Python :: 3.7',
    'License :: OSI Approved :: MIT License',
]

setup(
    name='ogb',
    version=__version__,
    description='Open Graph Benchmark',
    url='https://github.com/snap-stanford/ogb',
    author='OGB Team',
    author_email='ogb@cs.stanford.edu',
    keywords=keywords,
    long_description=long_description,
    long_description_content_type='text/markdown',
    install_requires=install_requires,
    license='MIT',
    packages=find_packages(exclude=['dataset', 'examples', 'docs']),
    package_data={'ogb': package_data_list},
    include_package_data=True,
    classifiers=classifiers,
)
52 |
--------------------------------------------------------------------------------