├── .gitignore ├── Example.ipynb ├── LICENSE ├── README.md ├── data_loaders └── torch_geometric │ ├── torch_classification_example.py │ └── torch_loader.py ├── datasets ├── easy.npz ├── easy_small.npz ├── hard.npz └── hard_small.npz ├── img ├── sample_graph.png └── sample_graph2.png └── make_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/* 2 | compute_stats.py 3 | *.pyc 4 | *.pt 5 | data_loaders/torch_geometric/data/raw/hard.npz 6 | data_loaders/spektral_tensorflow/tf_classification_example.py 7 | data_loaders/spektral_tensorflow/spektral_loader.py 8 | -------------------------------------------------------------------------------- /Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from time import time\n", 10 | "import numpy as np\n", 11 | "import networkx as nx\n", 12 | "\n", 13 | "from sklearn.metrics import accuracy_score\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn import svm\n", 16 | "\n", 17 | "from grakel import datasets, GraphKernel, graph_from_networkx" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Load datasets" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "dataset_name = \"hard_small\" # options: {easy_small, easy, hard_small, hard}\n", 34 | "\n", 35 | "loaded = np.load('datasets/'+dataset_name+'.npz', allow_pickle=True)\n", 36 | "A_train = list(loaded['tr_adj']) # list of adjacency matrices\n", 37 | "X_train = loaded['tr_feat'] # node features\n", 38 | "y_train = loaded['tr_class'] # class labels\n", 39 | "A_test = list(loaded['te_adj']) # list of adjacency matrices\n", 40 | "X_test = loaded['te_feat'] # node features\n", 41 | "y_test = loaded['te_class'] # class labels\n", 42 | "\n", 43 | "# Convert to networkx format\n", 44 | "G_tr = []\n", 45 | "for a, x in zip(A_train, X_train):\n", 46 | " G = nx.from_scipy_sparse_matrix(a)\n", 47 | " x_tuple = tuple(map(tuple, x))\n", 48 | " nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')\n", 49 | " G_tr.append(G)\n", 50 | "G_te = []\n", 51 | "for a, x in zip(A_test, X_test):\n", 52 | " G = nx.from_scipy_sparse_matrix(a)\n", 53 | " x_tuple = tuple(map(tuple, x))\n", 54 | " nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')\n", 55 | " G_te.append(G)\n", 56 | "\n", 57 | "# Convert to GraKel format\n", 58 | "G_train = graph_from_networkx(G_tr, node_labels_tag='features')\n", 59 | "G_train = [g for g in G_train]\n", 60 | "y_train = np.argmax(y_train, axis=-1)\n", 61 | "G_test = graph_from_networkx(G_te, node_labels_tag='features')\n", 62 | "G_test = [g for g in G_test]\n", 63 | "y_test = np.argmax(y_test, axis=-1)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Train and evaluate graph kernels" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "shortest_path -- Accuracy: 69.23 % | Took: 7.85 s\n", 83 | "graphlet_sampling -- Accuracy: 38.46 % | Took: 37.84 s\n", 84 | "pyramid_match -- Accuracy: 23.08 % | Took: 2.86 s\n", 85 | "svm_theta -- Accuracy: 23.08 % | Took: 2.91 s\n", 
86 | "neighborhood_hash -- Accuracy: 69.23 % | Took: 2.71 s\n", 87 | "subtree_wl -- Accuracy: 15.38 % | Took: 0.03 s\n", 88 | "odd_sth -- Accuracy: 42.31 % | Took: 24.48 s\n", 89 | "propagation -- Accuracy: 53.85 % | Took: 2.61 s\n", 90 | "pyramid_match -- Accuracy: 23.08 % | Took: 3.47 s\n", 91 | "vertex_histogram -- Accuracy: 15.38 % | Took: 0.01 s\n", 92 | "weisfeiler_lehman -- Accuracy: 73.08 % | Took: 58.92 s\n", 93 | "core_framework -- Accuracy: 69.23 % | Took: 18.62 s\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# Create a list with the graph kernels to evaluate\n", 99 | "# For more kernels and information:\n", 100 | "# https://ysig.github.io/GraKeL/dev/generated/grakel.GraphKernel.html#grakel.GraphKernel\n", 101 | "kernel_names = [\n", 102 | " \"shortest_path\", \n", 103 | " \"graphlet_sampling\", \n", 104 | " \"pyramid_match\", \n", 105 | " \"svm_theta\",\n", 106 | " \"neighborhood_hash\",\n", 107 | " \"subtree_wl\",\n", 108 | " \"odd_sth\",\n", 109 | " \"propagation\",\n", 110 | " \"vertex_histogram\",\n", 111 | " \"weisfeiler_lehman\",\n", 112 | " \"core_framework\"\n", 113 | " ]\n", 114 | "\n", 115 | "for k_ in kernel_names:\n", 116 | " \n", 117 | " start = time()\n", 118 | " \n", 119 | " # General kernels\n", 120 | " if k_ in [\"weisfeiler_lehman\" , \"core_framework\"]:\n", 121 | " gk = GraphKernel(kernel=[{\"name\": k_}, {\"name\": \"shortest_path\"}], normalize=True)\n", 122 | " \n", 123 | " # Base kernels\n", 124 | " else:\n", 125 | " gk = GraphKernel(kernel=[{\"name\": k_}], normalize=True)\n", 126 | "\n", 127 | " # Calculate the kernel matrix\n", 128 | " K_train = gk.fit_transform(G_train)\n", 129 | " K_test = gk.transform(G_test)\n", 130 | " \n", 131 | " # Initialise an SVM and fit\n", 132 | " clf = svm.SVC(kernel='precomputed', C=1)\n", 133 | " clf.fit(K_train, y_train)\n", 134 | " \n", 135 | " # Compute predictions on test set\n", 136 | " y_pred = clf.predict(K_test)\n", 137 | " \n", 138 | " # Calculate accuracy of classification\n", 139 | " acc = accuracy_score(y_test, y_pred)\n", 140 | " \n", 141 | " end = time()\n", 142 | " print(k_, \"-- Accuracy:\", str(round(acc*100, 2)), \"% | Took:\",\n", 143 | " str(round(end - start, 2)), \"s\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.6.7" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 2 175 | } 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Filippo Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Benchmark dataset for graph classification
This repository contains datasets to quickly test graph classification algorithms, such as Graph Kernels and Graph Neural Networks.

The datasets are designed so that the node features and the graph structure are completely uninformative when considered alone.
Therefore, an algorithm that relies only on the node features or only on the graph structure will fail to achieve good classification results.

## Citation

This dataset was formally introduced in the paper [Pyramidal Reservoir Graph Neural Network](https://arxiv.org/abs/2104.04710).
If you are using this dataset in your research, please consider citing our work:

```bibtex
@article{bianchi2022pyramidal,
  title={Pyramidal Reservoir Graph Neural Network},
  author={Bianchi, Filippo Maria and Gallicchio, Claudio and Micheli, Alessio},
  journal={Neurocomputing},
  volume={470},
  pages={389--404},
  year={2022},
  publisher={Elsevier}
}
```

## Dataset details

The dataset consists of graphs belonging to 3 different classes. The number of nodes in each graph is variable and the feature vector on each node is a one-hot vector of size 5, which encodes the color of the node. The class is determined by the relative position of the colors on the graph.

![](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/img/sample_graph.png)
![](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/img/sample_graph2.png)

There are 4 versions of the dataset:

- **easy_small:** 100 graphs per class, number of nodes varying between 40 and 80. Highly connected graphs.
- **easy:** 600 graphs per class, number of nodes varying between 100 and 200. Highly connected graphs.
- **hard_small:** 100 graphs per class, number of nodes varying between 40 and 80. Sparse graphs.
- **hard:** 600 graphs per class, number of nodes varying between 100 and 200. Sparse graphs.

In the hard versions, it is necessary to consider higher-order neighborhoods to identify the correct class, and the graphs might be disconnected.
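The table below summarizes the statistics of each version. As a sanity check, similar statistics can be recomputed directly from the `.npz` files; the following is an illustrative sketch (the `split_stats` helper is hypothetical and not part of the repository, and counts edges as non-zero entries of the symmetric adjacency matrix):

````python
import numpy as np

def split_stats(name):
    loaded = np.load('datasets/' + name + '.npz', allow_pickle=True)
    # the adjacency matrices are stored as scipy CSR matrices (see the Format section)
    adj = [a for key in ('tr_adj', 'val_adj', 'te_adj') for a in loaded[key]]
    n_nodes = [a.shape[0] for a in adj]
    n_edges = [a.nnz for a in adj]  # non-zeros of the symmetric adjacency matrix
    print(name, '-- graphs:', len(adj),
          '| avg nodes:', round(float(np.mean(n_nodes)), 2),
          '| avg edges:', round(float(np.mean(n_edges)), 2))

split_stats('easy_small')
````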

| Dataset    | # classes | # graphs | TR size | VAL size | TEST size | avg nodes | avg edges | Node Attr. (Dim.) |
|------------|-----------|----------|---------|----------|-----------|-----------|-----------|-------------------|
| easy_small | 3         | 300      | 239     | 30       | 31        | 58.25     | 358.8     | 5                 |
| hard_small | 3         | 300      | 245     | 29       | 26        | 58.64     | 224.94    | 5                 |
| easy       | 3         | 1800     | 1475    | 162      | 163       | 147.82    | 922.66    | 5                 |
| hard       | 3         | 1800     | 1451    | 159      | 190       | 148.32    | 572.32    | 5                 |

### Format

Each version of the dataset is already split into training, validation and test sets.
Each set contains:
- the list of adjacency matrices in `csr_matrix` format,
- the list of node features as numpy arrays,
- the class labels, stored as one-hot vectors in a numpy array.

### Loader (Numpy)

The following code snippet shows how to load the data:

````python
import numpy as np

loaded = np.load('datasets/hard.npz', allow_pickle=True)

X_train = loaded['tr_feat'] # node features
A_train = list(loaded['tr_adj']) # list of adjacency matrices
y_train = loaded['tr_class'] # one-hot class labels

X_val = loaded['val_feat'] # node features
A_val = list(loaded['val_adj']) # list of adjacency matrices
y_val = loaded['val_class'] # one-hot class labels

X_test = loaded['te_feat'] # node features
A_test = list(loaded['te_adj']) # list of adjacency matrices
y_test = loaded['te_class'] # one-hot class labels

# OPTIONAL - Convert to networkx format
# NOTE: in networkx >= 3.0, nx.from_scipy_sparse_matrix was replaced
# by nx.from_scipy_sparse_array
import networkx as nx

G_train = []
for a, x in zip(A_train, X_train):
    G = nx.from_scipy_sparse_matrix(a)
    x_tuple = tuple(map(tuple, x))
    nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
    G_train.append(G)

G_val = []
for a, x in zip(A_val, X_val):
    G = nx.from_scipy_sparse_matrix(a)
    x_tuple = tuple(map(tuple, x))
    nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
    G_val.append(G)

G_test = []
for a, x in zip(A_test, X_test):
    G = nx.from_scipy_sparse_matrix(a)
    x_tuple = tuple(map(tuple, x))
    nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
    G_test.append(G)
````

### Loader (PyTorch)

The dataset can be processed by a GNN implemented in [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/index.html) using the class defined in [torch_loader.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_loader.py).

````python
from torch_geometric.loader import DataLoader
from torch_loader import GraphClassificationBench

# Load "hard"
train_dataset = GraphClassificationBench("data/", split='train', easy=False, small=False)
val_dataset = GraphClassificationBench("data/", split='val', easy=False, small=False)
test_dataset = GraphClassificationBench("data/", split='test', easy=False, small=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)
````

See [torch_classification_example.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_classification_example.py) for a complete working example.
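To verify that the loader works as expected, one can inspect a single mini-batch. The attributes read below (`num_graphs`, `x`, `y`) are standard PyTorch Geometric batch fields:

````python
batch = next(iter(train_loader))

print(batch.num_graphs)  # number of graphs in the mini-batch (32)
print(batch.x.shape)     # stacked node features of the whole batch: [num_nodes, 5]
print(batch.y[:5])       # integer class labels (the loader decodes the one-hot labels)
````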

## Results
Classification results obtained using Graph Kernels and other techniques are reported below.

Feel free to send a pull request if you have results you'd like to share!

#### Graph Kernels
The Graph Kernels are computed with the [GraKeL](https://ysig.github.io/GraKeL/dev/index.html) library. After each kernel matrix is computed, an SVM with a precomputed kernel is trained and then evaluated on the test data.
The SVM implementation comes from the [sklearn.svm](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm) module.
The code used to generate the results can be found in the [notebook](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/Example.ipynb) of this repository.

Dependencies to run the notebook:
- scikit-learn ````pip install scikit-learn````
- networkx ````pip install networkx````
- grakel ````pip install grakel-dev````

| Kernel            | easy_small acc. (%) | time (s) | hard_small acc. (%) | time (s) |
|-------------------|---------------------|----------|---------------------|----------|
| Shortest Path     | 100                 | 20.67    | 69.23               | 7.85     |
| Graphlet Sampling | 41.94               | 281.35   | 38.46               | 37.84    |
| Pyramid Match     | 51.61               | 2.91     | 23.08               | 2.86     |
| SVM Theta         | 32.26               | 3.34     | 23.08               | 2.91     |
| Neighborhood Hash | 90.32               | 2.73     | 69.23               | 2.71     |
| Subtree WL        | 29.03               | 0.01     | 15.38               | 0.03     |
| ODD STH           | 77.42               | 58.75    | 42.31               | 24.48    |
| Propagation       | 87.1                | 3.35     | 53.85               | 2.61     |
| Vertex Histogram  | 29.03               | 0.02     | 15.38               | 0.01     |
| Weisfeiler Lehman | 100                 | 151.81   | 73.08               | 58.92    |
| Core Framework    | 100                 | 62.18    | 69.23               | 18.62    |


#### Graph Neural Networks

Results obtained with the following GNN architecture: MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense(Softmax). MP denotes a message-passing layer; here, a Chebyshev convolutional layer \[1\] with K=1 and 32 hidden units was used. Results refer to different graph pooling layers: Graclus \[2\], Node Decimation Pooling (NDP) \[3\], DiffPool \[4\], Top-K pooling \[5\], SAGPool \[6\] and MinCutPool \[7\].

| Pooling    | easy acc. (%) | hard acc. (%) |
|------------|---------------|---------------|
| Graclus    | 97.5 ± 0.5    | 69.0 ± 1.5    |
| NDP        | 97.9 ± 0.5    | 72.6 ± 0.9    |
| DiffPool   | 98.6 ± 0.4    | 69.9 ± 1.9    |
| Top-K      | 82.4 ± 8.9    | 42.7 ± 15.2   |
| SAGPool    | 84.2 ± 2.3    | 37.7 ± 14.5   |
| MinCutPool | 99.0 ± 0.0    | 73.8 ± 1.9    |
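For reference, the sketch below shows one possible PyTorch Geometric implementation of the MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense template described above. It is a minimal sketch and not the code used to produce the table: `GCNConv` (a first-order approximation of the Chebyshev filter) stands in for the Chebyshev convolution, Top-K pooling \[5\] is used as the pooling stage, and the pooling ratio of 0.5 is an assumption.

````python
import torch
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool


class PoolNet(torch.nn.Module):
    """MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense template."""
    def __init__(self, in_channels, num_classes, hidden=32):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden)
        self.pool1 = TopKPooling(hidden, ratio=0.5)  # hypothetical pooling ratio
        self.conv2 = GCNConv(hidden, hidden)
        self.pool2 = TopKPooling(hidden, ratio=0.5)
        self.conv3 = GCNConv(hidden, hidden)
        self.lin = torch.nn.Linear(hidden, num_classes)

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index).relu()
        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, batch=batch)
        x = self.conv2(x, edge_index).relu()
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, batch=batch)
        x = self.conv3(x, edge_index).relu()
        x = global_mean_pool(x, batch)  # pool node embeddings into one vector per graph
        return self.lin(x)  # class logits; the softmax is subsumed by cross-entropy
````

Training then proceeds as in [torch_classification_example.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_classification_example.py). Note that dense pooling methods such as DiffPool \[4\] and MinCutPool \[7\] operate on dense adjacency matrices and add auxiliary losses, so they do not fit this exact interface.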

#### Embedding Simplicial Complexes (ESC)
Techniques proposed in \[8\].

| Method        | easy_small acc. (%) | time (s) | hard_small acc. (%) | time (s) |
|---------------|---------------------|----------|---------------------|----------|
| ESC + RBF-SVM | 74.19 ± 6.84        | 0.68     | 48.46 ± 8.43        | 0.48     |
| ESC + L1-SVM  | 94.19 ± 2.70        | 0.68     | 70.77 ± 5.83        | 0.48     |
| ESC + L2-SVM  | 92.26 ± 2.89        | 0.68     | 69.23 ± 5.44        | 0.48     |

| Method        | easy acc. (%) | time (s) | hard acc. (%) | time (s) |
|---------------|---------------|----------|---------------|----------|
| ESC + RBF-SVM | 80.37 ± 7.04  | 10.94    | 62.53 ± 4.58  | 16.65    |
| ESC + L1-SVM  | 96.07 ± 0.93  | 10.94    | 72.21 ± 1.01  | 16.65    |
| ESC + L2-SVM  | 93.37 ± 1.96  | 10.94    | 69.26 ± 1.85  | 16.65    |

#### Hypergraph kernels
Techniques proposed in \[9\].

| Kernel           | easy_small acc. (%) | time (s) | hard_small acc. (%) | time (s) |
|------------------|---------------------|----------|---------------------|----------|
| Hist Kernel      | 94.0 ± 0.02         | 0.72     | 77.0 ± 0.02         | 0.46     |
| Jaccard Kernel   | 94.0 ± 0.0          | 0.86     | 68.0 ± 0.02         | 0.54     |
| Edit Kernel      | 94.0 ± 0.01         | 9.97     | 60.0 ± 0.02         | 7.70     |
| Stratedit Kernel | 94.0 ± 0.0          | 5.14     | 58.0 ± 0.02         | 4.79     |

| Kernel           | easy acc. (%) | time (s) | hard acc. (%) | time (s) |
|------------------|---------------|----------|---------------|----------|
| Hist Kernel      | 94.0 ± 0.01   | 10.39    | 72.0 ± 0.01   | 6.93     |
| Jaccard Kernel   | 94.0 ± 0.01   | 14.15    | 63.0 ± 0.00   | 8.11     |
| Edit Kernel      | 93.0 ± 0.00   | 2784.47  | 60.0 ± 0.00   | 2183.41  |
| Stratedit Kernel | 93.0 ± 0.00   | 932.96   | 60.0 ± 0.01   | 954.87   |


## References
\[1\] Defferrard, M., Bresson, X., & Vandergheynst, P. (2016). Convolutional neural networks on graphs with fast localized spectral filtering. Advances in Neural Information Processing Systems.

\[2\] Dhillon, I. S., Guan, Y., & Kulis, B. (2007). Weighted graph cuts without eigenvectors: a multilevel approach. IEEE Transactions on Pattern Analysis and Machine Intelligence.

\[3\] Bianchi, F. M., Grattarola, D., Livi, L., & Alippi, C. (2019). Hierarchical representation learning in graph neural networks with node decimation pooling.

\[4\] Ying, Z., You, J., Morris, C., Ren, X., Hamilton, W., & Leskovec, J. (2018). Hierarchical graph representation learning with differentiable pooling. Advances in Neural Information Processing Systems.

\[5\] Gao, H., & Ji, S. (2019). Graph U-Nets. ICML.

\[6\] Lee, J., Lee, I., & Kang, J. (2019). Self-attention graph pooling. ICML.

\[7\] Bianchi, F. M., Grattarola, D., & Alippi, C. (2020). Spectral clustering with graph neural networks for graph pooling. ICML.

\[8\] Martino, A., Giuliani, A., & Rizzi, A. (2019). (Hyper)graph embedding and classification via simplicial complexes. Algorithms, 12(11), 223.

\[9\] Martino, A., & Rizzi, A. (2020). (Hyper)graph kernels over simplicial complexes. Pattern Recognition.

## License
The dataset and the code are released under the MIT License.
See the attached LICENSE file.
--------------------------------------------------------------------------------
/data_loaders/torch_geometric/torch_classification_example.py:
--------------------------------------------------------------------------------
import argparse
import torch
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
from torch_geometric.logging import log
from torch_geometric.nn import MLP, GINConv, global_add_pool

from torch_loader import GraphClassificationBench

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--hidden_channels', type=int, default=32)
parser.add_argument('--num_layers', type=int, default=3)
parser.add_argument('--lr', type=float, default=5e-4)
parser.add_argument('--epochs', type=int, default=100)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load the "hard" version of the dataset (easy=False, small=False).
file_path = "data/"
train_dataset = GraphClassificationBench(file_path, split='train', easy=False, small=False)
train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True)
val_dataset = GraphClassificationBench(file_path, split='val', easy=False, small=False)
val_loader = DataLoader(val_dataset, args.batch_size)
test_dataset = GraphClassificationBench(file_path, split='test', easy=False, small=False)
test_loader = DataLoader(test_dataset, args.batch_size)


class Net(torch.nn.Module):
    """GIN with `num_layers` message-passing blocks, followed by global add
    pooling and an MLP readout."""
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super().__init__()

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            mlp = MLP([in_channels, hidden_channels, hidden_channels])
            self.convs.append(GINConv(nn=mlp, train_eps=False))
            in_channels = hidden_channels

        self.mlp = MLP([hidden_channels, hidden_channels, out_channels],
                       norm=None, dropout=0.5)

    def forward(self, x, edge_index, batch):
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        x = global_add_pool(x, batch)  # one embedding per graph
        return self.mlp(x)


model = Net(train_dataset.num_features, args.hidden_channels, train_dataset.num_classes,
            args.num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)


def train():
    model.train()

    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = F.cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * data.num_graphs
    return total_loss / len(train_loader.dataset)  # average loss per graph


@torch.no_grad()
def test(loader):
    model.eval()

    total_correct = 0
    for data in loader:
        data = data.to(device)
        pred = model(data.x, data.edge_index, data.batch).argmax(dim=-1)
        total_correct += int((pred == data.y).sum())
    return total_correct / len(loader.dataset)  # classification accuracy


for epoch in range(1, args.epochs + 1):
    loss = train()
    train_acc = test(train_loader)
    val_acc = test(val_loader)
    test_acc = test(test_loader)
    log(Epoch=epoch, Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
--------------------------------------------------------------------------------
/data_loaders/torch_geometric/torch_loader.py:
--------------------------------------------------------------------------------
import torch
from torch_geometric.data import InMemoryDataset, Data, download_url
from os import path
import numpy as np


class GraphClassificationBench(InMemoryDataset):
    """The synthetic dataset from the `"Pyramidal Reservoir Graph Neural
    Network" <https://arxiv.org/abs/2104.04710>`_ paper.

    Args:
        root (string): Root directory where the dataset should be saved.
        split (string): If `"train"`, loads the training dataset.
            If `"val"`, loads the validation dataset.
            If `"test"`, loads the test dataset. Defaults to `"train"`.
        easy (bool, optional): If `True`, use the easy version of the dataset.
            Defaults to `True`.
        small (bool, optional): If `True`, use the small version of the
            dataset. Defaults to `True`.
        transform (callable, optional): A function/transform that takes in a
            `torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            Defaults to `None`.
        pre_transform (callable, optional): A function/transform that takes in
            a `torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. Defaults to `None`.
        pre_filter (callable, optional): A function that takes in a
            `torch_geometric.data.Data` object and returns a boolean
            value, indicating whether the data object should be included in the
            final dataset. Defaults to `None`.
    """
    base_url = ('http://github.com/FilippoMB/'
                'Benchmark_dataset_for_graph_classification/'
                'raw/master/datasets/')

    def __init__(self, root, split='train', easy=True, small=True, transform=None, pre_transform=None, pre_filter=None):
        self.split = split.lower()
        assert self.split in {'train', 'val', 'test'}
        # the npz archives use the split prefixes 'tr', 'val' and 'te'
        if self.split != 'val':
            self.split = self.split[:2]

        self.file_name = ('easy' if easy else 'hard') + ('_small' if small else '')

        super(GraphClassificationBench, self).__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return '{}.npz'.format(self.file_name)

    @property
    def processed_file_names(self):
        return '{}_{}.pt'.format(self.file_name, self.split)

    def download(self):
        download_url('{}{}.npz'.format(self.base_url, self.file_name), self.raw_dir)

    def process(self):
        npz = np.load(path.join(self.raw_dir, self.raw_file_names), allow_pickle=True)
        raw_data = (npz['{}_{}'.format(self.split, key)] for key in ['feat', 'adj', 'class'])
        # build one Data object per graph: edge_index from the non-zero entries
        # of the sparse adjacency matrix, integer labels from the one-hot vectors
        data_list = [Data(x=torch.FloatTensor(x),
                          edge_index=torch.LongTensor(np.stack(adj.nonzero())),
                          y=torch.LongTensor(y.nonzero()[0])) for x, adj, y in zip(*raw_data)]

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.data, self.slices = self.collate(data_list)
        torch.save((self.data, self.slices), self.processed_paths[0])


if __name__ == "__main__":
    file_path = "data/"
    tr_dataset = GraphClassificationBench(file_path, split='train', easy=False, small=False)
    print(tr_dataset[5])
--------------------------------------------------------------------------------
/datasets/easy.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/easy.npz
--------------------------------------------------------------------------------
/datasets/easy_small.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/easy_small.npz
--------------------------------------------------------------------------------
/datasets/hard.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/hard.npz
--------------------------------------------------------------------------------
/datasets/hard_small.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/hard_small.npz
--------------------------------------------------------------------------------
/img/sample_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/img/sample_graph.png
--------------------------------------------------------------------------------
/img/sample_graph2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/img/sample_graph2.png
--------------------------------------------------------------------------------
/make_dataset.py:
--------------------------------------------------------------------------------
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import kneighbors_graph
import scipy.sparse as sp
import networkx as nx
from sklearn.preprocessing import OneHotEncoder


def make_instance(
        moon_c=[0,0],
        gaussq_c=[2,-2],
        blob_c=[[4,0]],
        n_nodes=90,
        random_n_nodes=False,
        cov=.2,
        k_neigh=5,
        conn_mode='connectivity',
        noisy_coords=False,
        noise_level=.1,
        plot_on=False):

    if random_n_nodes:
        n_nodes = np.random.randint(low=n_nodes//2, high=n_nodes)

    n_samples = n_nodes//3

    # initial features: a moon, a gaussian-quantiles cloud and a blob,
    # placed at the given centers
    Xm, Ym = datasets.make_moons(n_samples=n_samples, noise=0.1)
    Xm[:,0] += moon_c[0]
    Xm[:,1] += moon_c[1]
    Xq, Yq = datasets.make_gaussian_quantiles(n_samples=n_samples, mean=gaussq_c, n_classes=2, cov=cov)
    Yq += 2
    Xb, Yb = datasets.make_blobs(n_samples=n_samples, centers=blob_c, cluster_std=cov*2)
    Yb += 4
    X = np.concatenate((Xm, Xq, Xb))
    X /= np.max(X, axis=0)
    Y = np.concatenate((Ym, Yq, Yb))

    if plot_on:
        plt.scatter(X[:,0], X[:,1], c=Y)
        plt.title('initial features')
        plt.show()

    # build graph
    A = kneighbors_graph(X, n_neighbors=k_neigh, mode=conn_mode).todense()
    A = np.asarray(A)
    A = np.maximum(A, A.T)  # symmetrize the kNN graph
    A /= A.max()  # normalize in [0,1]
    A = sp.csr_matrix(A, dtype=np.float32)
    G = nx.from_scipy_sparse_matrix(A)  # renamed to nx.from_scipy_sparse_array in networkx >= 3.0

    if plot_on:
        nx.draw_networkx(G, pos=nx.fruchterman_reingold_layout(G), with_labels=False, node_size=20, edge_color='lightgray', node_color=Y,
                         linewidths=1)
        plt.title('graph')
        plt.show()

    # node features: one-hot encoding of the "color" Y
    # (the 'sparse' keyword was renamed to 'sparse_output' in scikit-learn >= 1.2)
    F = OneHotEncoder(sparse=False, categories='auto').fit_transform(Y[...,None])

    if noisy_coords:
        X = np.tanh(X*1.1)
        # note: np.diag of an (n, 1) array returns its length-1 diagonal, so this
        # rescales all coordinates by a single random factor
        X = np.multiply(X, np.diag(np.random.randn(X.shape[0],1)*noise_level))

        if plot_on:
            plt.scatter(X[:,0], X[:,1], c=Y)
            plt.title('noisy coords features')
            plt.show()

        F = np.concatenate((F, X), axis=-1)

    F_tuple = tuple(map(tuple, F))
    nx.set_node_attributes(G, dict(enumerate(F_tuple)), 'features')

    return F.astype(np.float32), A, G


def make_dataset(
        moon = [ [[4,0]], [[2,-2]], [[0,0]] ], #[ [[4,0], [0,0]], [[2,-2],[4,0]], [[2,-2], [0,0]] ],
        gaussq = [ [[0,0]], [[4,0]], [[2,-2]] ], #[ [[0,0], [4,0]], [[0,0], [2,-2]], [[4,0], [2,-2]] ],
        blob = [ [[[2,-2]]], [[[0,0]]], [[[4,0]]] ], #[ [[[2,-2]],[[2,-2]]], [[[4,0]], [[0,0]]], [[[0,0]], [[4,0]]] ],
        n_nodes=90,
        random_n_nodes=False,
        cov=.2,
        k_neigh=5,
        conn_mode='connectivity',
        noisy_coords=False,
        noise_level=.5,
        plot_on=False,
        tr_size=0.9,
        samples_per_subclass=150
        ):
    n_classes = len(moon)
    n_subclass = len(moon[0])
    print('n_classes:', n_classes, ', n_subclasses:', n_subclass)

    tr_F = []
    tr_A = []
    tr_G = []
    tr_C = []

    val_F = []
    val_A = []
    val_G = []
    val_C = []

    te_F = []
    te_A = []
    te_G = []
    te_C = []

    for c in range(n_classes):
        for s in range(n_subclass):

            for _ in range(samples_per_subclass):
                F, A, G = make_instance(moon_c=moon[c][s],
                                        gaussq_c=gaussq[c][s],
                                        blob_c=blob[c][s],
                                        n_nodes=n_nodes,
                                        random_n_nodes=random_n_nodes,
                                        cov=cov,
                                        k_neigh=k_neigh,
                                        conn_mode=conn_mode,
                                        noisy_coords=noisy_coords,
                                        noise_level=noise_level,
                                        plot_on=plot_on)

                # nested split: ~tr_size^2 of the samples go to training,
                # ~tr_size*(1-tr_size) to validation and ~(1-tr_size) to test
                if np.random.rand() < tr_size:
                    if np.random.rand() < tr_size:
                        tr_F.append(F)
                        tr_A.append(A)
                        tr_G.append(G)
                        tr_C.append(c)
                    else:
                        val_F.append(F)
                        val_A.append(A)
                        val_G.append(G)
                        val_C.append(c)
                else:
                    te_F.append(F)
                    te_A.append(A)
                    te_G.append(G)
                    te_C.append(c)

    # one-hot class labels
    tr_C = np.asarray(tr_C)
    tr_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(tr_C[...,None])
    val_C = np.asarray(val_C)
    val_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(val_C[...,None])
    te_C = np.asarray(te_C)
    te_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(te_C[...,None])

    return tr_F, tr_A, tr_G, tr_C.astype(np.float32), \
           val_F, val_A, val_G, val_C.astype(np.float32), \
           te_F, te_A, te_G, te_C.astype(np.float32)


if __name__=='__main__':

    ds = 'hard_small'  # options: {easy_small, easy_normal, hard_small, hard_normal}

    ds_kwargs = {
        'hard_small': {'n_nodes':80, 'cov':.2, 'k_neigh':3, 'samples_per_subclass':100},
        'hard_normal': {'n_nodes':200, 'cov':.2, 'k_neigh':3, 'samples_per_subclass':600},
        'easy_small': {'n_nodes':80, 'cov':.4, 'k_neigh':5, 'samples_per_subclass':100},
        'easy_normal': {'n_nodes':200, 'cov':.4, 'k_neigh':5, 'samples_per_subclass':600},
    }

    # generate and plot one sample instance
    F, A, G = make_instance(
        n_nodes=80,
        cov=.4,
        k_neigh=5,
        conn_mode='connectivity',
        plot_on=True)

    # generate the full dataset
    tr_F, tr_A, tr_G, tr_C, val_F, val_A, val_G, val_C, te_F, te_A, te_G, te_C = make_dataset(
        n_nodes=ds_kwargs[ds]['n_nodes'],
        random_n_nodes=True,
        cov=ds_kwargs[ds]['cov'],
        k_neigh=ds_kwargs[ds]['k_neigh'],
        noisy_coords=False,
        conn_mode='connectivity',
        tr_size=0.9,
        samples_per_subclass=ds_kwargs[ds]['samples_per_subclass']
        )

    # save the splits to '<ds>.npz' in the current directory
    np.savez(ds,
             tr_adj=tr_A,
             tr_feat=tr_F,
             tr_class=tr_C,
             val_adj=val_A,
             val_feat=val_F,
             val_class=val_C,
             te_adj=te_A,
             te_feat=te_F,
             te_class=te_C)
--------------------------------------------------------------------------------