├── .gitignore ├── Example.ipynb ├── LICENSE ├── README.md ├── data_loaders └── torch_geometric │ ├── torch_classification_example.py │ └── torch_loader.py ├── datasets ├── easy.npz ├── easy_small.npz ├── hard.npz └── hard_small.npz ├── img ├── sample_graph.png └── sample_graph2.png └── make_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/* 2 | compute_stats.py 3 | *.pyc 4 | *.pt 5 | data_loaders/torch_geometric/data/raw/hard.npz 6 | data_loaders/spektral_tensorflow/tf_classification_example.py 7 | data_loaders/spektral_tensorflow/spektral_loader.py 8 | -------------------------------------------------------------------------------- /Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from time import time\n", 10 | "import numpy as np\n", 11 | "import networkx as nx\n", 12 | "\n", 13 | "from sklearn.metrics import accuracy_score\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "from sklearn import svm\n", 16 | "\n", 17 | "from grakel import datasets, GraphKernel, graph_from_networkx" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "### Load datasets" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "dataset_name = \"hard_small\" # options: {easy_small, easy, hard_small, hard}\n", 34 | "\n", 35 | "loaded = np.load('datasets/'+dataset_name+'.npz', allow_pickle=True)\n", 36 | "A_train = list(loaded['tr_adj']) # list of adjacency matrices\n", 37 | "X_train = loaded['tr_feat'] # node features\n", 38 | "y_train = loaded['tr_class'] # class labels\n", 39 | "A_test = list(loaded['te_adj']) # list of adjacency matrices\n", 40 | "X_test = loaded['te_feat'] # node features\n", 41 | "y_test = loaded['te_class'] # class labels\n", 42 | "\n", 43 | "# Convert to networkx format\n", 44 | "G_tr = []\n", 45 | "for a, x in zip(A_train, X_train):\n", 46 | " G = nx.from_scipy_sparse_matrix(a)\n", 47 | " x_tuple = tuple(map(tuple, x))\n", 48 | " nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')\n", 49 | " G_tr.append(G)\n", 50 | "G_te = []\n", 51 | "for a, x in zip(A_test, X_test):\n", 52 | " G = nx.from_scipy_sparse_matrix(a)\n", 53 | " x_tuple = tuple(map(tuple, x))\n", 54 | " nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')\n", 55 | " G_te.append(G)\n", 56 | "\n", 57 | "# Convert to GraKel format\n", 58 | "G_train = graph_from_networkx(G_tr, node_labels_tag='features')\n", 59 | "G_train = [g for g in G_train]\n", 60 | "y_train = np.argmax(y_train, axis=-1)\n", 61 | "G_test = graph_from_networkx(G_te, node_labels_tag='features')\n", 62 | "G_test = [g for g in G_test]\n", 63 | "y_test = np.argmax(y_test, axis=-1)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Train and evaluate graph kernels" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "shortest_path -- Accuracy: 69.23 % | Took: 7.85 s\n", 83 | "graphlet_sampling -- Accuracy: 38.46 % | Took: 37.84 s\n", 84 | "pyramid_match -- Accuracy: 23.08 % | Took: 2.86 s\n", 85 | "svm_theta -- Accuracy: 23.08 % | Took: 2.91 s\n", 
86 | "neighborhood_hash -- Accuracy: 69.23 % | Took: 2.71 s\n", 87 | "subtree_wl -- Accuracy: 15.38 % | Took: 0.03 s\n", 88 | "odd_sth -- Accuracy: 42.31 % | Took: 24.48 s\n", 89 | "propagation -- Accuracy: 53.85 % | Took: 2.61 s\n", 90 | "pyramid_match -- Accuracy: 23.08 % | Took: 3.47 s\n", 91 | "vertex_histogram -- Accuracy: 15.38 % | Took: 0.01 s\n", 92 | "weisfeiler_lehman -- Accuracy: 73.08 % | Took: 58.92 s\n", 93 | "core_framework -- Accuracy: 69.23 % | Took: 18.62 s\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# Create a list with the graph kernels to evaluate\n", 99 | "# For more kernels and information:\n", 100 | "# https://ysig.github.io/GraKeL/dev/generated/grakel.GraphKernel.html#grakel.GraphKernel\n", 101 | "kernel_names = [\n", 102 | " \"shortest_path\", \n", 103 | " \"graphlet_sampling\", \n", 104 | " \"pyramid_match\", \n", 105 | " \"svm_theta\",\n", 106 | " \"neighborhood_hash\",\n", 107 | " \"subtree_wl\",\n", 108 | " \"odd_sth\",\n", 109 | " \"propagation\",\n", 110 | " \"vertex_histogram\",\n", 111 | " \"weisfeiler_lehman\",\n", 112 | " \"core_framework\"\n", 113 | " ]\n", 114 | "\n", 115 | "for k_ in kernel_names:\n", 116 | " \n", 117 | " start = time()\n", 118 | " \n", 119 | " # General kernels\n", 120 | " if k_ in [\"weisfeiler_lehman\" , \"core_framework\"]:\n", 121 | " gk = GraphKernel(kernel=[{\"name\": k_}, {\"name\": \"shortest_path\"}], normalize=True)\n", 122 | " \n", 123 | " # Base kernels\n", 124 | " else:\n", 125 | " gk = GraphKernel(kernel=[{\"name\": k_}], normalize=True)\n", 126 | "\n", 127 | " # Calculate the kernel matrix\n", 128 | " K_train = gk.fit_transform(G_train)\n", 129 | " K_test = gk.transform(G_test)\n", 130 | " \n", 131 | " # Initialise an SVM and fit\n", 132 | " clf = svm.SVC(kernel='precomputed', C=1)\n", 133 | " clf.fit(K_train, y_train)\n", 134 | " \n", 135 | " # Compute predictions on test set\n", 136 | " y_pred = clf.predict(K_test)\n", 137 | " \n", 138 | " # Calculate accuracy of classification\n", 139 | " acc = accuracy_score(y_test, y_pred)\n", 140 | " \n", 141 | " end = time()\n", 142 | " print(k_, \"-- Accuracy:\", str(round(acc*100, 2)), \"% | Took:\",\n", 143 | " str(round(end - start, 2)), \"s\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.6.7" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 2 175 | } 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Filippo Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Benchmark dataset for graph classification
This repository contains datasets to quickly test graph classification algorithms, such as Graph Kernels and Graph Neural Networks.

The datasets are designed so that the node features and the graph structure are completely uninformative when considered alone.
Therefore, an algorithm that relies only on the node features or only on the graph structure will fail to achieve good classification results.

## Citation

This dataset was formally introduced in the paper [Pyramidal Reservoir Graph Neural Network](https://arxiv.org/abs/2104.04710).
If you are using this dataset in your research, please consider citing our work:

```bibtex
@article{bianchi2022pyramidal,
  title={Pyramidal Reservoir Graph Neural Network},
  author={Bianchi, Filippo Maria and Gallicchio, Claudio and Micheli, Alessio},
  journal={Neurocomputing},
  volume={470},
  pages={389--404},
  year={2022},
  publisher={Elsevier}
}
```

## Dataset details

The dataset consists of graphs belonging to 3 different classes. The number of nodes in each graph is variable and the feature vector on each node is a one-hot vector of size 5, which encodes the color of the node. The class is determined by the relative position of the colors on the graph.

![](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/img/sample_graph.png)
![](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/img/sample_graph2.png)

There are 4 versions of the dataset:

- **easy_small:** 100 graphs per class, number of nodes varying between 40 and 80. Highly connected graphs.
- **easy:** 600 graphs per class, number of nodes varying between 100 and 200. Highly connected graphs.
- **hard_small:** 100 graphs per class, number of nodes varying between 40 and 80. Sparse graphs.
- **hard:** 600 graphs per class, number of nodes varying between 100 and 200. Sparse graphs.

In the hard versions, it is necessary to consider higher-order neighborhoods to identify the correct class, and the graphs might be disconnected.
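The table below summarizes the statistics of each version. As a sanity check, similar statistics can be recomputed directly from the `.npz` files; the following is an illustrative sketch (the `split_stats` helper is hypothetical and not part of the repository, and counts edges as non-zero entries of the symmetric adjacency matrix):

````python
import numpy as np

def split_stats(name):
    loaded = np.load('datasets/' + name + '.npz', allow_pickle=True)
    # the adjacency matrices are stored as scipy CSR matrices (see the Format section)
    adj = [a for key in ('tr_adj', 'val_adj', 'te_adj') for a in loaded[key]]
    n_nodes = [a.shape[0] for a in adj]
    n_edges = [a.nnz for a in adj]  # non-zeros of the symmetric adjacency matrix
    print(name, '-- graphs:', len(adj),
          '| avg nodes:', round(float(np.mean(n_nodes)), 2),
          '| avg edges:', round(float(np.mean(n_edges)), 2))

split_stats('easy_small')
````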

| Dataset    | # classes | # graphs | TR size | VAL size | TEST size | avg nodes | avg edges | Node Attr. (Dim.) |
|------------|-----------|----------|---------|----------|-----------|-----------|-----------|-------------------|
| easy_small | 3         | 300      | 239     | 30       | 31        | 58.25     | 358.8     | 5                 |
| hard_small | 3         | 300      | 245     | 29       | 26        | 58.64     | 224.94    | 5                 |
| easy       | 3         | 1800     | 1475    | 162      | 163       | 147.82    | 922.66    | 5                 |
| hard       | 3         | 1800     | 1451    | 159      | 190       | 148.32    | 572.32    | 5                 |

### Format

Each version of the dataset is already split into training, validation and test sets.
Each set contains:
- the list of adjacency matrices in `csr_matrix` format,
- the list of node features as numpy arrays,
- the class labels, stored as one-hot vectors in a numpy array.

### Loader (Numpy)

The following code snippet shows how to load the data:

````python
import numpy as np

loaded = np.load('datasets/hard.npz', allow_pickle=True)

X_train = loaded['tr_feat'] # node features
A_train = list(loaded['tr_adj']) # list of adjacency matrices
y_train = loaded['tr_class'] # one-hot class labels

X_val = loaded['val_feat'] # node features
A_val = list(loaded['val_adj']) # list of adjacency matrices
y_val = loaded['val_class'] # one-hot class labels

X_test = loaded['te_feat'] # node features
A_test = list(loaded['te_adj']) # list of adjacency matrices
y_test = loaded['te_class'] # one-hot class labels

# OPTIONAL - Convert to networkx format
# NOTE: in networkx >= 3.0, nx.from_scipy_sparse_matrix was replaced
# by nx.from_scipy_sparse_array
import networkx as nx

G_train = []
for a, x in zip(A_train, X_train):
    G = nx.from_scipy_sparse_matrix(a)
    x_tuple = tuple(map(tuple, x))
    nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
    G_train.append(G)

G_val = []
for a, x in zip(A_val, X_val):
    G = nx.from_scipy_sparse_matrix(a)
    x_tuple = tuple(map(tuple, x))
    nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
    G_val.append(G)

G_test = []
for a, x in zip(A_test, X_test):
    G = nx.from_scipy_sparse_matrix(a)
    x_tuple = tuple(map(tuple, x))
    nx.set_node_attributes(G, dict(enumerate(x_tuple)), 'features')
    G_test.append(G)
````

### Loader (PyTorch)

The dataset can be processed by a GNN implemented in [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/index.html) using the class defined in [torch_loader.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_loader.py).

````python
from torch_geometric.loader import DataLoader
from torch_loader import GraphClassificationBench

# Load "hard"
train_dataset = GraphClassificationBench("data/", split='train', easy=False, small=False)
val_dataset = GraphClassificationBench("data/", split='val', easy=False, small=False)
test_dataset = GraphClassificationBench("data/", split='test', easy=False, small=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)
````

See [torch_classification_example.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_classification_example.py) for a complete working example.
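To verify that the loader works as expected, one can inspect a single mini-batch. The attributes read below (`num_graphs`, `x`, `y`) are standard PyTorch Geometric batch fields:

````python
batch = next(iter(train_loader))

print(batch.num_graphs)  # number of graphs in the mini-batch (32)
print(batch.x.shape)     # stacked node features of the whole batch: [num_nodes, 5]
print(batch.y[:5])       # integer class labels (the loader decodes the one-hot labels)
````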

## Results
Classification results obtained using Graph Kernels and other techniques are reported below.

Feel free to send a pull request if you have results you'd like to share!

#### Graph Kernels
The Graph Kernels are computed with the [GraKeL](https://ysig.github.io/GraKeL/dev/index.html) library. After each kernel matrix is computed, an SVM with a precomputed kernel is trained and then evaluated on the test data.
The SVM implementation comes from the [sklearn.svm](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm) module.
The code used to generate the results can be found in the [notebook](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/Example.ipynb) of this repository.

Dependencies to run the notebook:
- scikit-learn ````pip install scikit-learn````
- networkx ````pip install networkx````
- grakel ````pip install grakel-dev````

| Kernel            | easy_small acc. (%) | time (s) | hard_small acc. (%) | time (s) |
|-------------------|---------------------|----------|---------------------|----------|
| Shortest Path     | 100                 | 20.67    | 69.23               | 7.85     |
| Graphlet Sampling | 41.94               | 281.35   | 38.46               | 37.84    |
| Pyramid Match     | 51.61               | 2.91     | 23.08               | 2.86     |
| SVM Theta         | 32.26               | 3.34     | 23.08               | 2.91     |
| Neighborhood Hash | 90.32               | 2.73     | 69.23               | 2.71     |
| Subtree WL        | 29.03               | 0.01     | 15.38               | 0.03     |
| ODD STH           | 77.42               | 58.75    | 42.31               | 24.48    |
| Propagation       | 87.1                | 3.35     | 53.85               | 2.61     |
| Vertex Histogram  | 29.03               | 0.02     | 15.38               | 0.01     |
| Weisfeiler Lehman | 100                 | 151.81   | 73.08               | 58.92    |
| Core Framework    | 100                 | 62.18    | 69.23               | 18.62    |


#### Graph Neural Networks

Results obtained with the following GNN architecture: MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense(Softmax). MP denotes a message-passing layer; here, a Chebyshev convolutional layer \[1\] with K=1 and 32 hidden units was used. Results refer to different graph pooling layers: Graclus \[2\], Node Decimation Pooling (NDP) \[3\], DiffPool \[4\], Top-K pooling \[5\], SAGPool \[6\] and MinCutPool \[7\].

| Pooling    | easy acc. (%) | hard acc. (%) |
|------------|---------------|---------------|
| Graclus    | 97.5 ± 0.5    | 69.0 ± 1.5    |
| NDP        | 97.9 ± 0.5    | 72.6 ± 0.9    |
| DiffPool   | 98.6 ± 0.4    | 69.9 ± 1.9    |
| Top-K      | 82.4 ± 8.9    | 42.7 ± 15.2   |
| SAGPool    | 84.2 ± 2.3    | 37.7 ± 14.5   |
| MinCutPool | 99.0 ± 0.0    | 73.8 ± 1.9    |
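For reference, the sketch below shows one possible PyTorch Geometric implementation of the MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense template described above. It is a minimal sketch and not the code used to produce the table: `GCNConv` (a first-order approximation of the Chebyshev filter) stands in for the Chebyshev convolution, Top-K pooling \[5\] is used as the pooling stage, and the pooling ratio of 0.5 is an assumption.

````python
import torch
from torch_geometric.nn import GCNConv, TopKPooling, global_mean_pool


class PoolNet(torch.nn.Module):
    """MP(32)-Pool-MP(32)-Pool-MP(32)-GlobalPool-Dense template."""
    def __init__(self, in_channels, num_classes, hidden=32):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden)
        self.pool1 = TopKPooling(hidden, ratio=0.5)  # hypothetical pooling ratio
        self.conv2 = GCNConv(hidden, hidden)
        self.pool2 = TopKPooling(hidden, ratio=0.5)
        self.conv3 = GCNConv(hidden, hidden)
        self.lin = torch.nn.Linear(hidden, num_classes)

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index).relu()
        x, edge_index, _, batch, _, _ = self.pool1(x, edge_index, batch=batch)
        x = self.conv2(x, edge_index).relu()
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, batch=batch)
        x = self.conv3(x, edge_index).relu()
        x = global_mean_pool(x, batch)  # pool node embeddings into one vector per graph
        return self.lin(x)  # class logits; the softmax is subsumed by cross-entropy
````

Training then proceeds as in [torch_classification_example.py](https://github.com/FilippoMB/Benchmark_dataset_for_graph_classification/blob/master/data_loaders/torch_geometric/torch_classification_example.py). Note that dense pooling methods such as DiffPool \[4\] and MinCutPool \[7\] operate on dense adjacency matrices and add auxiliary losses, so they do not fit this exact interface.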

#### Embedding Simplicial Complexes (ESC)
Techniques proposed in \[8\].

| Method        | easy_small acc. (%) | time (s) | hard_small acc. (%) | time (s) |
|---------------|---------------------|----------|---------------------|----------|
| ESC + RBF-SVM | 74.19 ± 6.84        | 0.68     | 48.46 ± 8.43        | 0.48     |
| ESC + L1-SVM  | 94.19 ± 2.70        | 0.68     | 70.77 ± 5.83        | 0.48     |
| ESC + L2-SVM  | 92.26 ± 2.89        | 0.68     | 69.23 ± 5.44        | 0.48     |

| Method        | easy acc. (%) | time (s) | hard acc. (%) | time (s) |
|---------------|---------------|----------|---------------|----------|
| ESC + RBF-SVM | 80.37 ± 7.04  | 10.94    | 62.53 ± 4.58  | 16.65    |
| ESC + L1-SVM  | 96.07 ± 0.93  | 10.94    | 72.21 ± 1.01  | 16.65    |
| ESC + L2-SVM  | 93.37 ± 1.96  | 10.94    | 69.26 ± 1.85  | 16.65    |

#### Hypergraph kernels
Techniques proposed in \[9\].

| Kernel           | easy_small acc. (%) | time (s) | hard_small acc. (%) | time (s) |
|------------------|---------------------|----------|---------------------|----------|
| Hist Kernel      | 94.0 ± 0.02         | 0.72     | 77.0 ± 0.02         | 0.46     |
| Jaccard Kernel   | 94.0 ± 0.0          | 0.86     | 68.0 ± 0.02         | 0.54     |
| Edit Kernel      | 94.0 ± 0.01         | 9.97     | 60.0 ± 0.02         | 7.70     |
| Stratedit Kernel | 94.0 ± 0.0          | 5.14     | 58.0 ± 0.02         | 4.79     |

| Kernel           | easy acc. (%) | time (s) | hard acc. (%) | time (s) |
|------------------|---------------|----------|---------------|----------|
| Hist Kernel      | 94.0 ± 0.01   | 10.39    | 72.0 ± 0.01   | 6.93     |
| Jaccard Kernel   | 94.0 ± 0.01   | 14.15    | 63.0 ± 0.00   | 8.11     |
| Edit Kernel      | 93.0 ± 0.00   | 2784.47  | 60.0 ± 0.00   | 2183.41  |
| Stratedit Kernel | 93.0 ± 0.00   | 932.96   | 60.0 ± 0.01   | 954.87   |


## References
\[1\] Defferrard, M., Bresson, X., & Vandergheynst, P. (2016). Convolutional neural networks on graphs with fast localized spectral filtering. Advances in Neural Information Processing Systems.

\[2\] Dhillon, I. S., Guan, Y., & Kulis, B. (2007). Weighted graph cuts without eigenvectors: a multilevel approach. IEEE Transactions on Pattern Analysis and Machine Intelligence.

\[3\] Bianchi, F. M., Grattarola, D., Livi, L., & Alippi, C. (2019). Hierarchical representation learning in graph neural networks with node decimation pooling.

\[4\] Ying, Z., You, J., Morris, C., Ren, X., Hamilton, W., & Leskovec, J. (2018). Hierarchical graph representation learning with differentiable pooling. Advances in Neural Information Processing Systems.

\[5\] Gao, H., & Ji, S. (2019). Graph U-Nets. ICML.

\[6\] Lee, J., Lee, I., & Kang, J. (2019). Self-attention graph pooling. ICML.

\[7\] Bianchi, F. M., Grattarola, D., & Alippi, C. (2020). Spectral clustering with graph neural networks for graph pooling. ICML.

\[8\] Martino, A., Giuliani, A., & Rizzi, A. (2019). (Hyper)graph embedding and classification via simplicial complexes. Algorithms, 12(11), 223.

\[9\] Martino, A., & Rizzi, A. (2020). (Hyper)graph kernels over simplicial complexes. Pattern Recognition.

## License
The dataset and the code are released under the MIT License.
See the attached LICENSE file.
--------------------------------------------------------------------------------
/data_loaders/torch_geometric/torch_classification_example.py:
--------------------------------------------------------------------------------
import argparse
import torch
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
from torch_geometric.logging import log
from torch_geometric.nn import MLP, GINConv, global_add_pool

from torch_loader import GraphClassificationBench

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--hidden_channels', type=int, default=32)
parser.add_argument('--num_layers', type=int, default=3)
parser.add_argument('--lr', type=float, default=5e-4)
parser.add_argument('--epochs', type=int, default=100)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Load the "hard" version of the dataset (easy=False, small=False).
file_path = "data/"
train_dataset = GraphClassificationBench(file_path, split='train', easy=False, small=False)
train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True)
val_dataset = GraphClassificationBench(file_path, split='val', easy=False, small=False)
val_loader = DataLoader(val_dataset, args.batch_size)
test_dataset = GraphClassificationBench(file_path, split='test', easy=False, small=False)
test_loader = DataLoader(test_dataset, args.batch_size)


class Net(torch.nn.Module):
    """GIN with `num_layers` message-passing blocks, followed by global add
    pooling and an MLP readout."""
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super().__init__()

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            mlp = MLP([in_channels, hidden_channels, hidden_channels])
            self.convs.append(GINConv(nn=mlp, train_eps=False))
            in_channels = hidden_channels

        self.mlp = MLP([hidden_channels, hidden_channels, out_channels],
                       norm=None, dropout=0.5)

    def forward(self, x, edge_index, batch):
        for conv in self.convs:
            x = conv(x, edge_index).relu()
        x = global_add_pool(x, batch)  # one embedding per graph
        return self.mlp(x)


model = Net(train_dataset.num_features, args.hidden_channels, train_dataset.num_classes,
            args.num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)


def train():
    model.train()

    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = F.cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * data.num_graphs
    return total_loss / len(train_loader.dataset)  # average loss per graph


@torch.no_grad()
def test(loader):
    model.eval()

    total_correct = 0
    for data in loader:
        data = data.to(device)
        pred = model(data.x, data.edge_index, data.batch).argmax(dim=-1)
        total_correct += int((pred == data.y).sum())
    return total_correct / len(loader.dataset)  # classification accuracy


for epoch in range(1, args.epochs + 1):
    loss = train()
    train_acc = test(train_loader)
    val_acc = test(val_loader)
    test_acc = test(test_loader)
    log(Epoch=epoch, Loss=loss, Train=train_acc, Val=val_acc, Test=test_acc)
--------------------------------------------------------------------------------
/data_loaders/torch_geometric/torch_loader.py:
--------------------------------------------------------------------------------
import torch
from torch_geometric.data import InMemoryDataset, Data, download_url
from os import path
import numpy as np


class GraphClassificationBench(InMemoryDataset):
    """The synthetic dataset from the `"Pyramidal Reservoir Graph Neural
    Network" <https://arxiv.org/abs/2104.04710>`_ paper.

    Args:
        root (string): Root directory where the dataset should be saved.
        split (string): If `"train"`, loads the training dataset.
            If `"val"`, loads the validation dataset.
            If `"test"`, loads the test dataset. Defaults to `"train"`.
        easy (bool, optional): If `True`, use the easy version of the dataset.
            Defaults to `True`.
        small (bool, optional): If `True`, use the small version of the
            dataset. Defaults to `True`.
        transform (callable, optional): A function/transform that takes in a
            `torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            Defaults to `None`.
        pre_transform (callable, optional): A function/transform that takes in
            a `torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. Defaults to `None`.
        pre_filter (callable, optional): A function that takes in a
            `torch_geometric.data.Data` object and returns a boolean
            value, indicating whether the data object should be included in the
            final dataset. Defaults to `None`.
    """
    base_url = ('http://github.com/FilippoMB/'
                'Benchmark_dataset_for_graph_classification/'
                'raw/master/datasets/')

    def __init__(self, root, split='train', easy=True, small=True, transform=None, pre_transform=None, pre_filter=None):
        self.split = split.lower()
        assert self.split in {'train', 'val', 'test'}
        # the npz archives use the split prefixes 'tr', 'val' and 'te'
        if self.split != 'val':
            self.split = self.split[:2]

        self.file_name = ('easy' if easy else 'hard') + ('_small' if small else '')

        super(GraphClassificationBench, self).__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return '{}.npz'.format(self.file_name)

    @property
    def processed_file_names(self):
        return '{}_{}.pt'.format(self.file_name, self.split)

    def download(self):
        download_url('{}{}.npz'.format(self.base_url, self.file_name), self.raw_dir)

    def process(self):
        npz = np.load(path.join(self.raw_dir, self.raw_file_names), allow_pickle=True)
        raw_data = (npz['{}_{}'.format(self.split, key)] for key in ['feat', 'adj', 'class'])
        # build one Data object per graph: edge_index from the non-zero entries
        # of the sparse adjacency matrix, integer labels from the one-hot vectors
        data_list = [Data(x=torch.FloatTensor(x),
                          edge_index=torch.LongTensor(np.stack(adj.nonzero())),
                          y=torch.LongTensor(y.nonzero()[0])) for x, adj, y in zip(*raw_data)]

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        self.data, self.slices = self.collate(data_list)
        torch.save((self.data, self.slices), self.processed_paths[0])


if __name__ == "__main__":
    file_path = "data/"
    tr_dataset = GraphClassificationBench(file_path, split='train', easy=False, small=False)
    print(tr_dataset[5])
--------------------------------------------------------------------------------
/datasets/easy.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/easy.npz
--------------------------------------------------------------------------------
/datasets/easy_small.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/easy_small.npz
--------------------------------------------------------------------------------
/datasets/hard.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/hard.npz
--------------------------------------------------------------------------------
/datasets/hard_small.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/datasets/hard_small.npz
--------------------------------------------------------------------------------
/img/sample_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/img/sample_graph.png
--------------------------------------------------------------------------------
/img/sample_graph2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FilippoMB/Benchmark_dataset_for_graph_classification/d834ecfc92c4d27fd2b5c746110e50ded15c1329/img/sample_graph2.png
--------------------------------------------------------------------------------
/make_dataset.py:
--------------------------------------------------------------------------------
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import kneighbors_graph
import scipy.sparse as sp
import networkx as nx
from sklearn.preprocessing import OneHotEncoder


def make_instance(
        moon_c=[0,0],
        gaussq_c=[2,-2],
        blob_c=[[4,0]],
        n_nodes=90,
        random_n_nodes=False,
        cov=.2,
        k_neigh=5,
        conn_mode='connectivity',
        noisy_coords=False,
        noise_level=.1,
        plot_on=False):

    if random_n_nodes:
        n_nodes = np.random.randint(low=n_nodes//2, high=n_nodes)

    n_samples = n_nodes//3

    # initial features: a moon, a gaussian-quantiles cloud and a blob,
    # placed at the given centers
    Xm, Ym = datasets.make_moons(n_samples=n_samples, noise=0.1)
    Xm[:,0] += moon_c[0]
    Xm[:,1] += moon_c[1]
    Xq, Yq = datasets.make_gaussian_quantiles(n_samples=n_samples, mean=gaussq_c, n_classes=2, cov=cov)
    Yq += 2
    Xb, Yb = datasets.make_blobs(n_samples=n_samples, centers=blob_c, cluster_std=cov*2)
    Yb += 4
    X = np.concatenate((Xm, Xq, Xb))
    X /= np.max(X, axis=0)
    Y = np.concatenate((Ym, Yq, Yb))

    if plot_on:
        plt.scatter(X[:,0], X[:,1], c=Y)
        plt.title('initial features')
        plt.show()

    # build graph
    A = kneighbors_graph(X, n_neighbors=k_neigh, mode=conn_mode).todense()
    A = np.asarray(A)
    A = np.maximum(A, A.T)  # symmetrize the kNN graph
    A /= A.max()  # normalize in [0,1]
    A = sp.csr_matrix(A, dtype=np.float32)
    G = nx.from_scipy_sparse_matrix(A)  # renamed to nx.from_scipy_sparse_array in networkx >= 3.0

    if plot_on:
        nx.draw_networkx(G, pos=nx.fruchterman_reingold_layout(G), with_labels=False, node_size=20, edge_color='lightgray', node_color=Y,
                         linewidths=1)
        plt.title('graph')
        plt.show()

    # node features: one-hot encoding of the "color" Y
    # (the 'sparse' keyword was renamed to 'sparse_output' in scikit-learn >= 1.2)
    F = OneHotEncoder(sparse=False, categories='auto').fit_transform(Y[...,None])

    if noisy_coords:
        X = np.tanh(X*1.1)
        # note: np.diag of an (n, 1) array returns its length-1 diagonal, so this
        # rescales all coordinates by a single random factor
        X = np.multiply(X, np.diag(np.random.randn(X.shape[0],1)*noise_level))

        if plot_on:
            plt.scatter(X[:,0], X[:,1], c=Y)
            plt.title('noisy coords features')
            plt.show()

        F = np.concatenate((F, X), axis=-1)

    F_tuple = tuple(map(tuple, F))
    nx.set_node_attributes(G, dict(enumerate(F_tuple)), 'features')

    return F.astype(np.float32), A, G


def make_dataset(
        moon = [ [[4,0]], [[2,-2]], [[0,0]] ], #[ [[4,0], [0,0]], [[2,-2],[4,0]], [[2,-2], [0,0]] ],
        gaussq = [ [[0,0]], [[4,0]], [[2,-2]] ], #[ [[0,0], [4,0]], [[0,0], [2,-2]], [[4,0], [2,-2]] ],
        blob = [ [[[2,-2]]], [[[0,0]]], [[[4,0]]] ], #[ [[[2,-2]],[[2,-2]]], [[[4,0]], [[0,0]]], [[[0,0]], [[4,0]]] ],
        n_nodes=90,
        random_n_nodes=False,
        cov=.2,
        k_neigh=5,
        conn_mode='connectivity',
        noisy_coords=False,
        noise_level=.5,
        plot_on=False,
        tr_size=0.9,
        samples_per_subclass=150
        ):
    n_classes = len(moon)
    n_subclass = len(moon[0])
    print('n_classes:', n_classes, ', n_subclasses:', n_subclass)

    tr_F = []
    tr_A = []
    tr_G = []
    tr_C = []

    val_F = []
    val_A = []
    val_G = []
    val_C = []

    te_F = []
    te_A = []
    te_G = []
    te_C = []

    for c in range(n_classes):
        for s in range(n_subclass):

            for _ in range(samples_per_subclass):
                F, A, G = make_instance(moon_c=moon[c][s],
                                        gaussq_c=gaussq[c][s],
                                        blob_c=blob[c][s],
                                        n_nodes=n_nodes,
                                        random_n_nodes=random_n_nodes,
                                        cov=cov,
                                        k_neigh=k_neigh,
                                        conn_mode=conn_mode,
                                        noisy_coords=noisy_coords,
                                        noise_level=noise_level,
                                        plot_on=plot_on)

                # nested split: ~tr_size^2 of the samples go to training,
                # ~tr_size*(1-tr_size) to validation and ~(1-tr_size) to test
                if np.random.rand() < tr_size:
                    if np.random.rand() < tr_size:
                        tr_F.append(F)
                        tr_A.append(A)
                        tr_G.append(G)
                        tr_C.append(c)
                    else:
                        val_F.append(F)
                        val_A.append(A)
                        val_G.append(G)
                        val_C.append(c)
                else:
                    te_F.append(F)
                    te_A.append(A)
                    te_G.append(G)
                    te_C.append(c)

    # one-hot class labels
    tr_C = np.asarray(tr_C)
    tr_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(tr_C[...,None])
    val_C = np.asarray(val_C)
    val_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(val_C[...,None])
    te_C = np.asarray(te_C)
    te_C = OneHotEncoder(sparse=False, categories='auto').fit_transform(te_C[...,None])

    return tr_F, tr_A, tr_G, tr_C.astype(np.float32), \
           val_F, val_A, val_G, val_C.astype(np.float32), \
           te_F, te_A, te_G, te_C.astype(np.float32)


if __name__=='__main__':

    ds = 'hard_small'  # options: {easy_small, easy_normal, hard_small, hard_normal}

    ds_kwargs = {
        'hard_small': {'n_nodes':80, 'cov':.2, 'k_neigh':3, 'samples_per_subclass':100},
        'hard_normal': {'n_nodes':200, 'cov':.2, 'k_neigh':3, 'samples_per_subclass':600},
        'easy_small': {'n_nodes':80, 'cov':.4, 'k_neigh':5, 'samples_per_subclass':100},
        'easy_normal': {'n_nodes':200, 'cov':.4, 'k_neigh':5, 'samples_per_subclass':600},
    }

    # generate and plot one sample instance
    F, A, G = make_instance(
        n_nodes=80,
        cov=.4,
        k_neigh=5,
        conn_mode='connectivity',
        plot_on=True)

    # generate the full dataset
    tr_F, tr_A, tr_G, tr_C, val_F, val_A, val_G, val_C, te_F, te_A, te_G, te_C = make_dataset(
        n_nodes=ds_kwargs[ds]['n_nodes'],
        random_n_nodes=True,
        cov=ds_kwargs[ds]['cov'],
        k_neigh=ds_kwargs[ds]['k_neigh'],
        noisy_coords=False,
        conn_mode='connectivity',
        tr_size=0.9,
        samples_per_subclass=ds_kwargs[ds]['samples_per_subclass']
        )

    # save the splits to '<ds>.npz' in the current directory
    np.savez(ds,
             tr_adj=tr_A,
             tr_feat=tr_F,
             tr_class=tr_C,
             val_adj=val_A,
             val_feat=val_F,
             val_class=val_C,
             te_adj=te_A,
             te_feat=te_F,
             te_class=te_C)
--------------------------------------------------------------------------------