├── .gitignore
├── Example.ipynb
├── LICENSE
├── README.md
├── data_loaders
│   └── torch_geometric
│       ├── torch_classification_example.py
│       └── torch_loader.py
├── datasets
│   ├── easy.npz
│   ├── easy_small.npz
│   ├── hard.npz
│   └── hard_small.npz
├── img
│   ├── sample_graph.png
│   └── sample_graph2.png
└── make_dataset.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/*
2 | compute_stats.py
3 | *.pyc
4 | *.pt
5 | data_loaders/torch_geometric/data/raw/hard.npz
6 | data_loaders/spektral_tensorflow/tf_classification_example.py
7 | data_loaders/spektral_tensorflow/spektral_loader.py
8 |
--------------------------------------------------------------------------------
/Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from time import time\n",
10 | "import numpy as np\n",
11 | "import networkx as nx\n",
12 | "\n",
13 | "from sklearn.metrics import accuracy_score\n",
14 | "from sklearn.model_selection import train_test_split\n",
15 | "from sklearn import svm\n",
16 | "\n",
17 | "from grakel import datasets, GraphKernel, graph_from_networkx"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "### Load datasets"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "dataset_name = \"hard_small\" # options: {easy_small, easy, hard_small, hard}\n",
34 | "\n",
35 | "loaded = np.load('datasets/'+dataset_name+'.npz', allow_pickle=True)\n",
36 | "A_train = list(loaded['tr_adj']) # list of adjacency matrices\n",
37 | "X_train = loaded['tr_feat'] # node features\n",
38 | "y_train = loaded['tr_class'] # class labels\n",
39 | "A_test = list(loaded['te_adj']) # list of adjacency matrices\n",
40 | "X_test = loaded['te_feat'] # node features\n",
41 | "y_test = loaded['te_class'] # class labels\n",
42 | "\n",
43 | "# Convert to networkx format (use nx.from_scipy_sparse_array in networkx >= 3.0)\n",
44 | "G_tr = []\n",
45 | "for a, x in zip(A_train, X_train):\n",
46 | " G = nx.from_scipy_sparse_matrix(a)\n",
47 | " x_tuple = tuple(map(tuple, x))\n",
48 | " nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')\n",
49 | " G_tr.append(G)\n",
50 | "G_te = []\n",
51 | "for a, x in zip(A_test, X_test):\n",
52 | " G = nx.from_scipy_sparse_matrix(a)\n",
53 | " x_tuple = tuple(map(tuple, x))\n",
54 | " nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')\n",
55 | " G_te.append(G)\n",
56 | "\n",
57 | "# Convert to GraKel format\n",
58 | "G_train = graph_from_networkx(G_tr, node_labels_tag='features')\n",
59 | "G_train = [g for g in G_train]\n",
60 | "y_train = np.argmax(y_train, axis=-1)\n",
61 | "G_test = graph_from_networkx(G_te, node_labels_tag='features')\n",
62 | "G_test = [g for g in G_test]\n",
63 | "y_test = np.argmax(y_test, axis=-1)"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### Train and evaluate graph kernels"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "shortest_path -- Accuracy: 69.23 % | Took: 7.85 s\n",
83 | "graphlet_sampling -- Accuracy: 38.46 % | Took: 37.84 s\n",
84 | "pyramid_match -- Accuracy: 23.08 % | Took: 2.86 s\n",
85 | "svm_theta -- Accuracy: 23.08 % | Took: 2.91 s\n",
86 | "neighborhood_hash -- Accuracy: 69.23 % | Took: 2.71 s\n",
87 | "subtree_wl -- Accuracy: 15.38 % | Took: 0.03 s\n",
88 | "odd_sth -- Accuracy: 42.31 % | Took: 24.48 s\n",
89 | "propagation -- Accuracy: 53.85 % | Took: 2.61 s\n",
90 | "pyramid_match -- Accuracy: 23.08 % | Took: 3.47 s\n",
91 | "vertex_histogram -- Accuracy: 15.38 % | Took: 0.01 s\n",
92 | "weisfeiler_lehman -- Accuracy: 73.08 % | Took: 58.92 s\n",
93 | "core_framework -- Accuracy: 69.23 % | Took: 18.62 s\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "# Create a list with the graph kernels to evaluate\n",
99 | "# For more kernels and information:\n",
100 | "# https://ysig.github.io/GraKeL/dev/generated/grakel.GraphKernel.html#grakel.GraphKernel\n",
101 | "kernel_names = [\n",
102 | " \"shortest_path\", \n",
103 | " \"graphlet_sampling\", \n",
104 | " \"pyramid_match\", \n",
105 | " \"svm_theta\",\n",
106 | " \"neighborhood_hash\",\n",
107 | " \"subtree_wl\",\n",
108 | " \"odd_sth\",\n",
109 | " \"propagation\",\n",
110 | " \"vertex_histogram\",\n",
111 | " \"weisfeiler_lehman\",\n",
112 | " \"core_framework\"\n",
113 | " ]\n",
114 | "\n",
115 | "for k_ in kernel_names:\n",
116 | " \n",
117 | " start = time()\n",
118 | " \n",
119 | " # General kernels\n",
120 | " if k_ in [\"weisfeiler_lehman\" , \"core_framework\"]:\n",
121 | " gk = GraphKernel(kernel=[{\"name\": k_}, {\"name\": \"shortest_path\"}], normalize=True)\n",
122 | " \n",
123 | " # Base kernels\n",
124 | " else:\n",
125 | " gk = GraphKernel(kernel=[{\"name\": k_}], normalize=True)\n",
126 | "\n",
127 | " # Calculate the kernel matrix\n",
128 | " K_train = gk.fit_transform(G_train)\n",
129 | " K_test = gk.transform(G_test)\n",
130 | " \n",
131 | " # Initialise an SVM and fit\n",
132 | " clf = svm.SVC(kernel='precomputed', C=1)\n",
133 | " clf.fit(K_train, y_train)\n",
134 | " \n",
135 | " # Compute predictions on test set\n",
136 | " y_pred = clf.predict(K_test)\n",
137 | " \n",
138 | " # Calculate accuracy of classification\n",
139 | " acc = accuracy_score(y_test, y_pred)\n",
140 | " \n",
141 | " end = time()\n",
142 | " print(k_, \"-- Accuracy:\", str(round(acc*100, 2)), \"% | Took:\",\n",
143 | " str(round(end - start, 2)), \"s\")"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": []
152 | }
153 | ],
154 | "metadata": {
155 | "kernelspec": {
156 | "display_name": "Python 3",
157 | "language": "python",
158 | "name": "python3"
159 | },
160 | "language_info": {
161 | "codemirror_mode": {
162 | "name": "ipython",
163 | "version": 3
164 | },
165 | "file_extension": ".py",
166 | "mimetype": "text/x-python",
167 | "name": "python",
168 | "nbconvert_exporter": "python",
169 | "pygments_lexer": "ipython3",
170 | "version": "3.6.7"
171 | }
172 | },
173 | "nbformat": 4,
174 | "nbformat_minor": 2
175 | }
176 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Filippo Bianchi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark dataset for graph classification
2 | This repository contains datasets to quickly test graph classification algorithms, such as Graph Kernels and Graph Neural Networks.
3 |
4 | The dataset is designed so that the node features and the adjacency matrix are completely uninformative when considered alone.
5 | Therefore, an algorithm that relies only on the node features or only on the graph structure will fail to achieve good classification results.
6 |
7 | ## Citation
8 |
9 | This dataset was formally introduced in the paper [Pyramidal Reservoir Graph Neural Network](https://arxiv.org/abs/2104.04710).
10 | If you are using this dataset in your research, please consider citing our work:
11 |
12 | ```bibtex
13 | @article{bianchi2022pyramidal,
14 | title={Pyramidal Reservoir Graph Neural Network},
15 | author={Bianchi, Filippo Maria and Gallicchio, Claudio and Micheli, Alessio},
16 | journal={Neurocomputing},
17 | volume={470},
18 | pages={389--404},
19 | year={2022},
20 | publisher={Elsevier}
21 | }
22 | ```
23 |
24 | ## Dataset details
25 |
26 | The dataset consists of graphs belonging to 3 different classes. The number of nodes in each graph is variable and the feature vector on each node is a one-hot vector of size 5, which encodes the color of the node. The class is determined by the relative position of the colors on the graph.
27 |
28 | 
29 | 
30 |
31 | There are 4 versions of the dataset:
32 | 
33 | - **easy_small:** 100 graphs per class, with the number of nodes varying between 40 and 80. Highly connected graphs.
34 | - **easy:** 600 graphs per class, with the number of nodes varying between 100 and 200. Highly connected graphs.
35 | - **hard_small:** 100 graphs per class, with the number of nodes varying between 40 and 80. Sparse graphs.
36 | - **hard:** 600 graphs per class, with the number of nodes varying between 100 and 200. Sparse graphs.
37 |
38 | In the hard versions, it is necessary to consider higher-order neighborhoods to infer the correct class, and the graphs might be disconnected.
39 |
40 | | Dataset | # classes | # graphs | TR size | VAL size | TEST size | avg nodes | avg edges | Node Attr. (Dim.) |
41 | |------------|-----------|----------|---------|----------|-----------|-----------|-----------|-------------------|
42 | | easy_small | 3 | 300 | 239 | 30 | 31 | 58.25 | 358.8 | 5 |
43 | | hard_small | 3 | 300 | 245 | 29 | 26 | 58.64 | 224.94 | 5 |
44 | | easy | 3 | 1800 | 1475 | 162 | 163 | 147.82 | 922.66 | 5 |
45 | | hard | 3 | 1800 | 1451 | 159 | 190 | 148.32 | 572.32 | 5 |
46 |
47 | ### Format
48 |
49 | The dataset is already split into training, validation and test sets.
50 | Each set contains (the archive keys are listed below):
51 | - the list of adjacency matrices in csr_matrix format,
52 | - the list of node features as numpy arrays,
53 | - the class labels contained in a numpy array.
54 |
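For reference, the keys stored in each `.npz` archive follow the `tr_`/`val_`/`te_` naming used by `make_dataset.py`; a quick way to list them:

````python
import numpy as np

# NpzFile exposes the stored array names via .files
loaded = np.load('datasets/easy_small.npz', allow_pickle=True)
print(loaded.files)
# -> ['tr_adj', 'tr_feat', 'tr_class', 'val_adj', 'val_feat', 'val_class', 'te_adj', 'te_feat', 'te_class']
````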
55 |
56 |
57 | ### Loader (Numpy)
58 |
59 | The following code snippet shows how to load the data:
60 |
61 | ````python
62 | import numpy as np
63 |
64 | loaded = np.load('datasets/hard.npz', allow_pickle=True)
65 |
66 | X_train = loaded['tr_feat'] # node features
67 | A_train = list(loaded['tr_adj']) # list of adjacency matrices
68 | y_train = loaded['tr_class'] # class labels
69 |
70 | X_val = loaded['val_feat'] # node features
71 | A_val = list(loaded['val_adj']) # list of adjacency matrices
72 | y_val = loaded['val_class'] # class labels
73 |
74 | X_test = loaded['te_feat'] # node features
75 | A_test = list(loaded['te_adj']) # list of adjacency matrices
76 | y_test = loaded['te_class'] # class labels
77 |
78 | # OPTIONAL - Convert to networkx format (use nx.from_scipy_sparse_array in networkx >= 3.0)
79 | import networkx as nx
80 |
81 | G_train = []
82 | for a, x in zip(A_train, X_train):
83 | G = nx.from_scipy_sparse_matrix(a)
84 | x_tuple = tuple(map(tuple, x))
85 | nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
86 | G_train.append(G)
87 |
88 | G_val = []
89 | for a, x in zip(A_val, X_val):
90 | G = nx.from_scipy_sparse_matrix(a)
91 | x_tuple = tuple(map(tuple, x))
92 | nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
93 | G_val.append(G)
94 |
95 | G_test = []
96 | for a, x in zip(A_test, X_test):
97 | G = nx.from_scipy_sparse_matrix(a)
98 | x_tuple = tuple(map(tuple, x))
99 | nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
100 | G_test.append(G)
101 | ````
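
A quick sanity check, assuming the snippet above has been run: node features are one-hot vectors of size 5, class labels are one-hot vectors of size 3, and graphs from the hard versions may be disconnected:

````python
print(X_train[0].shape)  # (n_nodes, 5): one-hot color per node
print(y_train[0])        # one-hot class label of size 3
print(nx.number_connected_components(G_train[0]))  # can be > 1 for the hard versions
````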
102 |
103 |
104 |
105 | ### Loader (Pytorch)
106 |
107 | The dataset can be processed by a GNN implemented in [Pytorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/index.html) using the dataset class defined in [torch_loader.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_loader.py).
108 |
109 | ````python
110 | from torch_geometric.loader import DataLoader
111 | from torch_loader import GraphClassificationBench
112 |
113 | # Load "hard"
114 | train_dataset = GraphClassificationBench("data/", split='train', easy=False, small=False)
115 | val_dataset = GraphClassificationBench("data/", split='val', easy=False, small=False)
116 | test_dataset = GraphClassificationBench("data/", split='test', easy=False, small=False)
117 |
118 | train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
119 | val_loader = DataLoader(val_dataset, batch_size=32)
120 | test_loader = DataLoader(test_dataset, batch_size=32)
121 | ````
122 |
123 | See [torch_classification_example.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_classification_example.py) for a complete working example.
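
To verify what the loader yields, one can peek at a mini-batch; each batch is a `torch_geometric.data.Batch` holding the node features `x`, the `edge_index`, the labels `y` and the `batch` assignment vector:

````python
data = next(iter(train_loader))
print(data)             # e.g. DataBatch(x=[N, 5], edge_index=[2, E], y=[32], batch=[N])
print(data.num_graphs)  # 32 graphs per batch
````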
124 |
125 | ## Results
126 | Classification results obtained by using Graph Kernels and other techniques are reported below.
127 |
128 | Feel free to send a pull request if you have results you'd like to share!
129 |
130 | #### Graph Kernels
131 | The Graph Kernels are computed with the [GraKeL](https://ysig.github.io/GraKeL/dev/index.html) library. After each kernel matrix is computed, an SVM that uses the Graph Kernel as a precomputed kernel is trained and then evaluated on the test data.
132 | The SVM implementation comes from the [sklearn.svm](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm) module.
133 | The code used to generate the results can be found in the [notebook](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/Example.ipynb) of this repository.
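
In condensed form, the pipeline from the notebook looks as follows (a sketch for a single kernel; the notebook loops over all of them):

````python
from grakel import GraphKernel
from sklearn import svm
from sklearn.metrics import accuracy_score

# Precompute the kernel matrices, then train an SVM on them
gk = GraphKernel(kernel=[{"name": "weisfeiler_lehman"}, {"name": "shortest_path"}], normalize=True)
K_train = gk.fit_transform(G_train)  # G_train: list of grakel graphs
K_test = gk.transform(G_test)

clf = svm.SVC(kernel='precomputed', C=1)
clf.fit(K_train, y_train)
print(accuracy_score(y_test, clf.predict(K_test)))
````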
134 |
135 | Dependencies to run the notebook:
136 | - scikit-learn ````pip install scikit-learn````
137 | - networkx ````pip install networkx````
138 | - grakel ````pip install grakel````
139 |
140 | | | easy_small | | hard_small | |
141 | |--------------------|------------------|----------------|------------------|----------------|
142 | | Shortest Path | Accuracy: 100 | Time: 20.67 s | Accuracy: 69.23 | Time: 7.85 s |
143 | | Graphlet Sampling | Accuracy: 41.94 | Time: 281.35 s | Accuracy: 38.46 | Time: 37.84 s |
144 | | Pyramid Match | Accuracy: 51.61 | Time: 2.91 s | Accuracy: 23.08 | Time: 2.86 s |
145 | | SVM Theta | Accuracy: 32.26 | Time: 3.34 s | Accuracy: 23.08 | Time: 2.91 s |
146 | | Neighborhood Hash | Accuracy: 90.32 | Time: 2.73 s | Accuracy: 69.23 | Time: 2.71 s |
147 | | Subtree WL | Accuracy: 29.03 | Time: 0.01 s | Accuracy: 15.38 | Time: 0.03 s |
148 | | ODD STH | Accuracy: 77.42 | Time: 58.75 s | Accuracy: 42.31 | Time: 24.48 s |
149 | | Propagation | Accuracy: 87.1 | Time: 3.35 s | Accuracy: 53.85 | Time: 2.61 s |
150 | | Vertex Histogram | Accuracy: 29.03 | Time: 0.02 s | Accuracy: 15.38 | Time: 0.01 s |
151 | | Weisfeiler Lehman | Accuracy: 100 | Time: 151.81 s | Accuracy: 73.08 | Time: 58.92 s |
152 | | Core Framework | Accuracy: 100 | Time: 62.18 s | Accuracy: 69.23 | Time: 18.62 s |
153 |
154 |
155 | #### Graph Neural Networks
156 |
157 | Results obtained with the following GNN architecture: MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense(Softmax). MP denotes a message-passing layer; here, a Chebyshev convolutional layer \[1\] with K=1 and 32 hidden units was used. Results refer to different graph pooling layers: Graclus \[2\], Node Decimation Pooling (NDP) \[3\], DiffPool \[4\], Top-K pooling \[5\], SAGPool \[6\] and MinCutPool \[7\]. A rough sketch of this architecture is given after the table.
158 |
159 |
160 | | | easy | hard |
161 | |------------|----------------------|-----------------------|
162 | | Graclus | Accuracy: 97.5 ± 0.5 | Accuracy: 69.0 ± 1.5 |
163 | | NDP | Accuracy: 97.9 ± 0.5 | Accuracy: 72.6 ± 0.9 |
164 | | DiffPool | Accuracy: 98.6 ± 0.4 | Accuracy: 69.9 ± 1.9 |
165 | | Top-K | Accuracy: 82.4 ± 8.9 | Accuracy: 42.7 ± 15.2 |
166 | | SAGPool | Accuracy: 84.2 ± 2.3 | Accuracy: 37.7 ± 14.5 |
167 | | MinCutPool | Accuracy: 99.0 ± 0.0 | Accuracy: 73.8 ± 1.9 |
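
As a rough illustration (not the exact code used for these results), a PyTorch Geometric sketch of the MP-Pool-MP-Pool-MP-GlobalPool-Dense skeleton, with `graclus` standing in for the pooling layer; `graclus` requires the `torch-cluster` package, and the K convention of PyG's `ChebConv` may differ from the one intended in the paper:

````python
import torch
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, global_mean_pool, graclus, max_pool


class PoolGNN(torch.nn.Module):
    def __init__(self, in_channels, num_classes, hidden=32):
        super().__init__()
        self.conv1 = ChebConv(in_channels, hidden, K=1)
        self.conv2 = ChebConv(hidden, hidden, K=1)
        self.conv3 = ChebConv(hidden, hidden, K=1)
        self.lin = torch.nn.Linear(hidden, num_classes)  # Dense(Softmax) head

    def forward(self, data):
        # MP(32) -> Pool: coarsen the graph with a Graclus clustering
        data.x = self.conv1(data.x, data.edge_index).relu()
        data = max_pool(graclus(data.edge_index, num_nodes=data.x.size(0)), data)
        # MP(32) -> Pool
        data.x = self.conv2(data.x, data.edge_index).relu()
        data = max_pool(graclus(data.edge_index, num_nodes=data.x.size(0)), data)
        # MP(32) -> GlobalPool -> Dense(Softmax)
        x = self.conv3(data.x, data.edge_index).relu()
        x = global_mean_pool(x, data.batch)
        return F.log_softmax(self.lin(x), dim=-1)
````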
168 |
169 |
170 |
171 | #### Embedding Simplicial Complexes (ESC)
172 | Techniques proposed in \[8\].
173 |
174 | | | easy_small | | hard_small | |
175 | |--------------------|------------------|----------------|------------------|----------------|
176 | | ESC + RBF-SVM | Accuracy: 74.19 ± 6.84 | Time: 0.68 s| Accuracy: 48.46 ± 8.43| Time: 0.48 s|
177 | | ESC + L1-SVM | Accuracy: 94.19 ± 2.70 | Time: 0.68 s| Accuracy: 70.77 ± 5.83| Time: 0.48 s|
178 | | ESC + L2-SVM | Accuracy: 92.26 ± 2.89 | Time: 0.68 s| Accuracy: 69.23 ± 5.44| Time: 0.48 s|
179 |
180 | | | easy | | hard | |
181 | |--------------------|------------------|----------------|------------------|----------------|
182 | | ESC + RBF-SVM | Accuracy: 80.37 ± 7.04 | Time: 10.94 s| Accuracy: 62.53 ± 4.58| Time: 16.65 s|
183 | | ESC + L1-SVM | Accuracy: 96.07 ± 0.93 | Time: 10.94 s| Accuracy: 72.21 ± 1.01| Time: 16.65 s|
184 | | ESC + L2-SVM | Accuracy: 93.37 ± 1.96 | Time: 10.94 s| Accuracy: 69.26 ± 1.85| Time: 16.65 s|
185 |
186 | #### Hypergraph kernels
187 | Techniques proposed in \[9\].
188 |
189 | | | easy_small | | hard_small | |
190 | |--------------------|------------------|----------------|------------------|----------------|
191 | | Hist Kernel | Accuracy: 94.0 ± 0.02 | Time: 0.72 s| Accuracy: 77.0 ± 0.02 | Time: 0.46 s|
192 | | Jaccard Kernel | Accuracy: 94.0 ± 0.0 | Time: 0.86 s| Accuracy: 68.0 ± 0.02 | Time: 0.54 s|
193 | | Edit Kernel | Accuracy: 94.0 ± 0.01 | Time: 9.97 s| Accuracy: 60.0 ± 0.02 | Time: 7.70 s|
194 | | Stratedit Kernel | Accuracy: 94.0 ± 0.0 | Time: 5.14 s| Accuracy: 58.0 ± 0.02 | Time: 4.79 s|
195 |
196 | | | easy | | hard | |
197 | |--------------------|-----------------------|----------------|-----------------------|----------------|
198 | | Hist Kernel | Accuracy: 94.0 ± 0.01 | Time: 10.39 s | Accuracy: 72.0 ± 0.01 | Time: 6.93 s |
199 | | Jaccard Kernel | Accuracy: 94.0 ± 0.01 | Time: 14.15 s | Accuracy: 63.0 ± 0.00 | Time: 8.11 s |
200 | | Edit Kernel | Accuracy: 93.0 ± 0.00 | Time: 2784.47 s | Accuracy: 60.0 ± 0.00 | Time: 2183.41 s |
201 | | Stratedit Kernel | Accuracy: 93.0 ± 0.00 | Time: 932.96 s | Accuracy: 60.0 ± 0.01 | Time: 954.87 s |
202 |
203 |
204 | ## References
205 | \[1\] Defferrard, M., Bresson, X., & Vandergheynst, P. (2016). Convolutional neural networks on graphs with fast localized spectral filtering. In Advances in neural information processing systems
206 |
207 | \[2\] Dhillon, I. S., Guan, Y., & Kulis, B. (2007). Weighted graph cuts without eigenvectors a multilevel approach. IEEE transactions on pattern analysis and machine intelligence
208 |
209 | \[3\] Bianchi, F. M., Grattarola, D., Livi, L., & Alippi, C. (2019). Hierarchical Representation Learning in Graph Neural Networks with Node Decimation Pooling
210 |
211 | \[4\] Ying, Z., You, J., Morris, C., Ren, X., Hamilton, W., & Leskovec, J. (2018). Hierarchical graph representation learning with differentiable pooling. In Advances in neural information processing systems
212 |
213 | \[5\] Gao, H., & Ji, S., Graph u-nets, ICML 2019
214 |
215 | \[6\] Lee, J., Lee, I., & Kang, J., Self-attention graph pooling, ICML 2019
216 |
217 | \[7\] F. M. Bianchi, D. Grattarola, C. Alippi, Spectral Clustering with Graph Neural Networks for Graph Pooling, ICML 2020
218 |
219 | \[8\] Martino A, Giuliani A, Rizzi A., (Hyper) Graph Embedding and Classification via Simplicial Complexes. Algorithms. 2019 Nov; 12(11):223
220 |
221 | \[9\] Martino A. and Rizzi A., (Hyper)graph kernels over simplicial complexes. 2020. Pattern Recognition
222 |
223 | ## License
224 | The dataset and the code are released under the MIT License. See the attached LICENSE file.
225 |
--------------------------------------------------------------------------------
/data_loaders/torch_geometric/torch_classification_example.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import torch.nn.functional as F
4 | from torch_geometric.loader import DataLoader
5 | from torch_geometric.logging import log
6 | from torch_geometric.nn import MLP, GINConv, global_add_pool
7 |
8 | from torch_loader import GraphClassificationBench
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--batch_size', type=int, default=32)
12 | parser.add_argument('--hidden_channels', type=int, default=32)
13 | parser.add_argument('--num_layers', type=int, default=3)
14 | parser.add_argument('--lr', type=float, default=5e-4)
15 | parser.add_argument('--epochs', type=int, default=100)
16 | args = parser.parse_args()
17 |
18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
19 |
20 |
21 | file_path = "data/"
22 | train_dataset = GraphClassificationBench(file_path, split='train', easy=False, small=False)
23 | train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True)
24 | val_dataset = GraphClassificationBench(file_path, split='val', easy=False, small=False)
25 | val_loader = DataLoader(val_dataset, args.batch_size)
26 | test_dataset = GraphClassificationBench(file_path, split='test', easy=False, small=False)
27 | test_loader = DataLoader(test_dataset, args.batch_size)
28 |
29 |
30 | class Net(torch.nn.Module):
31 | def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
32 | super().__init__()
33 |
34 | self.convs = torch.nn.ModuleList()
35 | for _ in range(num_layers):
36 | mlp = MLP([in_channels, hidden_channels, hidden_channels])
37 | self.convs.append(GINConv(nn=mlp, train_eps=False))
38 | in_channels = hidden_channels
39 |
40 | self.mlp = MLP([hidden_channels, hidden_channels, out_channels],
41 | norm=None, dropout=0.5)
42 |
43 | def forward(self, x, edge_index, batch):
44 | for conv in self.convs:
45 | x = conv(x, edge_index).relu()
46 | x = global_add_pool(x, batch)
47 | return self.mlp(x)
48 |
49 |
50 | model = Net(train_dataset.num_features, args.hidden_channels, train_dataset.num_classes,
51 | args.num_layers).to(device)
52 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
53 |
54 |
55 | def train():
56 | model.train()
57 |
58 | total_loss = 0
59 | for data in train_loader:
60 | data = data.to(device)
61 | optimizer.zero_grad()
62 | out = model(data.x, data.edge_index, data.batch)
63 | loss = F.cross_entropy(out, data.y)
64 | loss.backward()
65 | optimizer.step()
66 | total_loss += float(loss) * data.num_graphs
67 | return total_loss / len(train_loader.dataset)
68 |
69 |
70 | @torch.no_grad()
71 | def test(loader):
72 | model.eval()
73 |
74 | total_correct = 0
75 | for data in loader:
76 | data = data.to(device)
77 | pred = model(data.x, data.edge_index, data.batch).argmax(dim=-1)
78 | total_correct += int((pred == data.y).sum())
79 | return total_correct / len(loader.dataset)
80 |
81 |
82 | for epoch in range(1, args.epochs + 1):
83 | loss = train()
84 | train_acc = test(train_loader)
85 | val_acc = test(val_loader)
86 | test_acc = test(test_loader)
87 | log(Epoch=epoch, Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
88 |
--------------------------------------------------------------------------------
/data_loaders/torch_geometric/torch_loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch_geometric.data import InMemoryDataset, Data, download_url
3 | from os import path
4 | import numpy as np
5 |
6 |
7 | class GraphClassificationBench(InMemoryDataset):
8 | """The synthetic dataset from `"Pyramidal Reservoir Graph Neural Network"
9 | `_ paper.
10 |
11 | Args:
12 | root (string): Root directory where the dataset should be saved.
13 | split (string): If `"train"`, loads the training dataset.
14 | If `"val"`, loads the validation dataset.
15 | If `"test"`, loads the test dataset. Defaults to `"train"`.
16 | easy (bool, optional): If `True`, use the easy version of the dataset.
17 | Defaults to `True`.
18 | small (bool, optional): If `True`, use the small version of the
19 | dataset. Defaults to `True`.
20 | transform (callable, optional): A function/transform that takes in an
21 | `torch_geometric.data.Data` object and returns a transformed
22 | version. The data object will be transformed before every access.
23 | Defaults to `None`.
24 | pre_transform (callable, optional): A function/transform that takes in
25 | an `torch_geometric.data.Data` object and returns a
26 | transformed version. The data object will be transformed before
27 | being saved to disk. Defaults to `None`.
28 | pre_filter (callable, optional): A function that takes in an
29 | `torch_geometric.data.Data` object and returns a boolean
30 | value, indicating whether the data object should be included in the
31 | final dataset. Defaults to `None`.
32 | """
33 | base_url = ('http://github.com/FilippoMB/'
34 | 'Benchmark_dataset_for_graph_classification/'
35 | 'raw/master/datasets/')
36 |
37 | def __init__(self, root, split='train', easy=True, small=True, transform=None, pre_transform=None, pre_filter=None):
38 | self.split = split.lower()
39 | assert self.split in {'train', 'val', 'test'}
40 | if self.split != 'val':
41 | self.split = self.split[:2]  # 'train' -> 'tr', 'test' -> 'te', matching the npz key prefixes
42 |
43 | self.file_name = ('easy' if easy else 'hard') + ('_small' if small else '')
44 |
45 | super(GraphClassificationBench, self).__init__(root, transform, pre_transform, pre_filter)
46 | self.data, self.slices = torch.load(self.processed_paths[0])
47 |
48 | @property
49 | def raw_file_names(self):
50 | return '{}.npz'.format(self.file_name)
51 |
52 | @property
53 | def processed_file_names(self):
54 | return '{}.pt'.format(self.file_name + '_' + self.split)
55 |
56 | def download(self):
57 | download_url('{}{}.npz'.format(self.base_url, self.file_name), self.raw_dir)
58 |
59 | def process(self):
60 | npz = np.load(path.join(self.raw_dir, self.raw_file_names), allow_pickle=True)
61 | raw_data = (npz['{}_{}'.format(self.split, key)] for key in ['feat', 'adj', 'class'])
62 | data_list = [Data(x=torch.FloatTensor(x),
63 | edge_index=torch.LongTensor(np.stack(adj.nonzero())),
64 | y=torch.LongTensor(y.nonzero()[0])) for x, adj, y in zip(*raw_data)]
65 |
66 | if self.pre_filter is not None:
67 | data_list = [data for data in data_list if self.pre_filter(data)]
68 |
69 | if self.pre_transform is not None:
70 | data_list = [self.pre_transform(data) for data in data_list]
71 |
72 | self.data, self.slices = self.collate(data_list)
73 | torch.save((self.data, self.slices), self.processed_paths[0])
74 |
75 |
76 | if __name__ == "__main__":
77 | file_path = "data/"
78 | tr_dataset = GraphClassificationBench(file_path, split='train', easy=False, small=False)
79 | print(tr_dataset[5])
--------------------------------------------------------------------------------
/datasets/easy.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/easy.npz
--------------------------------------------------------------------------------
/datasets/easy_small.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/easy_small.npz
--------------------------------------------------------------------------------
/datasets/hard.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/hard.npz
--------------------------------------------------------------------------------
/datasets/hard_small.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/hard_small.npz
--------------------------------------------------------------------------------
/img/sample_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/img/sample_graph.png
--------------------------------------------------------------------------------
/img/sample_graph2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/img/sample_graph2.png
--------------------------------------------------------------------------------
/make_dataset.py:
--------------------------------------------------------------------------------
1 | from sklearn import datasets
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from sklearn.neighbors import kneighbors_graph
5 | import scipy.sparse as sp
6 | import networkx as nx
7 | from sklearn.preprocessing import OneHotEncoder
8 |
9 |
10 | def make_instance(
11 | moon_c=[0,0],
12 | gaussq_c=[2,-2],
13 | blob_c=[[4,0]],
14 | n_nodes=90,
15 | random_n_nodes=False,
16 | cov=.2,
17 | k_neigh=5,
18 | conn_mode='connectivity',
19 | noisy_coords=False,
20 | noise_level=.1,
21 | plot_on=False):
22 |
23 | if random_n_nodes:
24 | n_nodes = np.random.randint(low=n_nodes//2, high=n_nodes)
25 |
26 | n_samples = n_nodes//3
27 |
28 | # initial features
29 | Xm, Ym = datasets.make_moons(n_samples=n_samples, noise=0.1)
30 | Xm[:,0] += moon_c[0]
31 | Xm[:,1] += moon_c[1]
32 | Xq, Yq = datasets.make_gaussian_quantiles(n_samples=n_samples, mean=gaussq_c, n_classes=2, cov=cov)
33 | Yq += 2
34 | Xb, Yb = datasets.make_blobs(n_samples=n_samples, centers=blob_c, cluster_std=cov*2)
35 | Yb += 4
36 | X = np.concatenate((Xm, Xq, Xb))
37 | X /= np.max(X,axis=0)
38 | Y = np.concatenate((Ym, Yq, Yb))
39 |
40 | if plot_on:
41 | plt.scatter(X[:,0], X[:,1], c=Y)
42 | plt.title('initial features')
43 | plt.show()
44 |
45 | # build graph
46 | A = kneighbors_graph(X, n_neighbors=k_neigh, mode=conn_mode).todense()
47 | A = np.asarray(A)
48 | A = np.maximum(A, A.T)
49 | A /= A.max() # normalize in [0,1]
50 | A = sp.csr_matrix(A, dtype=np.float32)
51 | G = nx.from_scipy_sparse_matrix(A)
52 |
53 | if plot_on:
54 | nx.draw_networkx(G, pos=nx.fruchterman_reingold_layout(G), with_labels=False, node_size=20, edge_color='lightgray', node_color=Y,
55 | linewidths=1)
56 | plt.title('graph')
57 | plt.show()
58 |
59 | # node features
60 | F = OneHotEncoder(sparse=False, categories='auto').fit_transform(Y[...,None])  # note: 'sparse' was renamed 'sparse_output' in scikit-learn >= 1.2
61 |
62 | if noisy_coords:
63 | X = np.tanh(X*1.1)
64 | X = X * np.random.randn(X.shape[0], 1) * noise_level  # per-node random scaling of the coordinates
65 |
66 | if plot_on:
67 | plt.scatter(X[:,0], X[:,1], c=Y)
68 | plt.title('noisy coords features')
69 | plt.show()
70 |
71 | F = np.concatenate((F, X), axis=-1)
72 |
73 | F_tuple = tuple(map(tuple, F))
74 | nx.set_node_attributes(G, dict(enumerate(F_tuple)), 'features')
75 |
76 | return F.astype(np.float32), A, G
77 |
78 |
79 | def make_dataset(
80 | moon = [ [[4,0]], [[2,-2]], [[0,0]] ], #[ [[4,0], [0,0]], [[2,-2],[4,0]], [[2,-2], [0,0]] ],
81 | gaussq = [ [[0,0]], [[4,0]], [[2,-2]] ], #[ [[0,0], [4,0]], [[0,0], [2,-2]], [[4,0], [2,-2]] ],
82 | blob = [ [[[2,-2]]], [[[0,0]]], [[[4,0]]] ], #[ [[[2,-2]],[[2,-2]]], [[[4,0]], [[0,0]]], [[[0,0]], [[4,0]]] ],
83 | n_nodes=90,
84 | random_n_nodes=False,
85 | cov=.2,
86 | k_neigh=5,
87 | conn_mode='connectivity',
88 | noisy_coords=False,
89 | noise_level=.5,
90 | plot_on=False,
91 | tr_size = 0.9,
92 | samples_per_subclass=150
93 | ):
94 | n_classes = len(moon)
95 | n_subclass = len(moon[0])
96 | print('n_classes:',n_classes,', n_subclasses:',n_subclass)
97 |
98 | tr_F = []
99 | tr_A = []
100 | tr_G = []
101 | tr_C = []
102 |
103 | val_F = []
104 | val_A = []
105 | val_G = []
106 | val_C = []
107 |
108 | te_F = []
109 | te_A = []
110 | te_G = []
111 | te_C = []
112 |
113 | for c in range(n_classes):
114 | for s in range(n_subclass):
115 |
116 | for _ in range(samples_per_subclass):
117 | F, A, G = make_instance(moon_c=moon[c][s],
118 | gaussq_c=gaussq[c][s],
119 | blob_c=blob[c][s],
120 | n_nodes=n_nodes,
121 | random_n_nodes=random_n_nodes,
122 | cov=cov,
123 | k_neigh=k_neigh,
124 | conn_mode=conn_mode,
125 | noisy_coords=noisy_coords,
126 | noise_level=noise_level,
127 | plot_on=plot_on)
128 |
129 | if np.random.rand() < tr_size:  # first draw: keep ~tr_size of the samples out of the test set
130 | if np.random.rand() < tr_size:  # second draw: split the kept samples into train and validation
131 | tr_F.append(F)
132 | tr_A.append(A)
133 | tr_G.append(G)
134 | tr_C.append(c)
135 | else:
136 | val_F.append(F)
137 | val_A.append(A)
138 | val_G.append(G)
139 | val_C.append(c)
140 | else:
141 | te_F.append(F)
142 | te_A.append(A)
143 | te_G.append(G)
144 | te_C.append(c)
145 |
146 | # one-hot class labels
147 | tr_C = np.asarray(tr_C)
148 | tr_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(tr_C[...,None])
149 | val_C = np.asarray(val_C)
150 | val_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(val_C[...,None])
151 | te_C = np.asarray(te_C)
152 | te_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(te_C[...,None])
153 |
154 | return tr_F, tr_A, tr_G, tr_C.astype(np.float32), \
155 | val_F, val_A, val_G, val_C.astype(np.float32), \
156 | te_F, te_A, te_G, te_C.astype(np.float32)
157 |
158 |
159 | if __name__=='__main__':
160 |
161 | ds = 'hard_small'
162 |
163 | ds_kwargs = {
164 | 'hard_small': {'n_nodes':80, 'cov':.2, 'k_neigh':3, 'samples_per_subclass':100},
165 | 'hard_normal': {'n_nodes':200, 'cov':.2, 'k_neigh':3, 'samples_per_subclass':600},
166 | 'easy_small': {'n_nodes':80, 'cov':.4, 'k_neigh':5, 'samples_per_subclass':100},
167 | 'easy_normal': {'n_nodes':200, 'cov':.4, 'k_neigh':5, 'samples_per_subclass':600},
168 | }
169 |
170 | F, A, G = make_instance(
171 | n_nodes=80,
172 | cov=.4,
173 | k_neigh=5,
174 | conn_mode='connectivity',
175 | plot_on=True)
176 |
177 | tr_F, tr_A, tr_G, tr_C, val_F, val_A, val_G, val_C, te_F, te_A, te_G, te_C = make_dataset(
178 | n_nodes=ds_kwargs[ds]['n_nodes'],
179 | random_n_nodes=True,
180 | cov=ds_kwargs[ds]['cov'],
181 | k_neigh=ds_kwargs[ds]['k_neigh'],
182 | noisy_coords=False,
183 | conn_mode='connectivity',
184 | tr_size = 0.9,
185 | samples_per_subclass=ds_kwargs[ds]['samples_per_subclass']
186 | )
187 |
188 | np.savez(ds,
189 | tr_adj=tr_A,
190 | tr_feat=tr_F,
191 | tr_class=tr_C,
192 | val_adj=val_A,
193 | val_feat=val_F,
194 | val_class=val_C,
195 | te_adj=te_A,
196 | te_feat=te_F,
197 | te_class=te_C)
--------------------------------------------------------------------------------