├── README.md ├── data ├── citeseer.mat ├── cora.mat ├── pubmed.mat └── wiki.mat ├── gcc ├── __init__.py ├── metrics.py ├── optimizer.py ├── run.py ├── tune_power.py └── utils.py ├── run_example.ipynb ├── schematic.png └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # Graph Convolutional Clustering 2 | 3 | This repository provides Python (Tensorflow) code to reproduce experiments from the WSDM '22 paper [*Efficient Graph Convolution for Joint Node Representation Learning and Clustering*](https://dl.acm.org/doi/10.1145/3488560.3498533). 4 | 5 | 6 | 7 | 8 | 9 | ## Installation 10 | 11 | ```bash 12 | python setup.py install 13 | ``` 14 | 15 | ## Run Experiments 16 | #### Parameter list 17 | For `run.py` 18 | 19 | | Parameter | Type | Default | Description | 20 | | :-------------: |:-------------:| :----:|:-------------------------------- | 21 | | `dataset` | string| `cora`| Name of the graph dataset (`cora`, `citeseer`, `pubmed` or `wiki`). | 22 | | `power` | integer| `5`| First power to test. | 23 | | `runs` | integer| `20`| Number of runs per power. | 24 | | `n_clusters` | integer| `0`| Number of clusters (`0` for ground truth). | 25 | | `max_iter` | integer| `30`| Number of iterations of the algorithm. | 26 | | `tol` | float| `10e-7`| Tolerance threshold of convergence. | 27 | 28 | For `tune_power.py` parameters are the same except for `power` which is replaced by 29 | 30 | | Parameter | Type | Default | Description | 31 | | :-------------: |:-------------:| :----:|:-------------------------------- | 32 | | `min_power` | integer| `1`| Smallest propagation order to test. | 33 | | `max_power` | integer| `150`| Largest propagation order to test. | 34 | 35 | 36 | #### Best Propagation Orders 37 | 38 | 39 | | Dataset | Propagation order | 40 | | :-------------: |:-------------:| 41 | | `citeseer` | `5`| 42 | | `cora` | `12`| 43 | | `pubmed` | `150`| 44 | | `wiki` | `4`| 45 | 46 | #### Example 47 | To adaptively tune the power on Cora use 48 | ```bash 49 | python gcc/tune_power.py --dataset=cora 50 | ``` 51 | 52 | To run the model on Cora for power `p=12` and have the average execution time 53 | ```bash 54 | python gcc/run.py --dataset=cora --power 12 55 | ``` 56 | ## Citation 57 | 58 | Please cite the following paper if you used GCC in your research 59 | 60 | ```BibTeX 61 | @inproceedings{fettal2022efficient, 62 | author = {Fettal, Chakib and Labiod, Lazhar and Nadif, Mohamed}, 63 | title = {Efficient Graph Convolution for Joint Node Representation Learning and Clustering}, 64 | year = {2022}, 65 | publisher = {Association for Computing Machinery}, 66 | doi = {10.1145/3488560.3498533}, 67 | booktitle = {Proceedings of the Fifteenth ACM International Conference on Web Search and Data Mining}, 68 | pages = {289–297}, 69 | } 70 | ``` 71 | -------------------------------------------------------------------------------- /data/citeseer.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chakib401/graph_convolutional_clustering/e13e0d5a8b87659986ff128c23a6750fef8ef6a0/data/citeseer.mat -------------------------------------------------------------------------------- /data/cora.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chakib401/graph_convolutional_clustering/e13e0d5a8b87659986ff128c23a6750fef8ef6a0/data/cora.mat -------------------------------------------------------------------------------- 
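The four `.mat` files in `data/` each store a graph dataset under three keys: the adjacency matrix `W`, the node-feature matrix `fea`, and the ground-truth labels `gnd`; `gcc/utils.read_dataset` reads exactly these keys. A minimal stand-alone sketch of loading one of them (it simply mirrors `read_dataset` and is not part of the repository):

```python
# Minimal sketch (not part of the repository): it mirrors gcc/utils.read_dataset.
import numpy as np
import scipy.io as sio
import scipy.sparse as sp

data = sio.loadmat('data/cora.mat')
adj = data['W'].astype(float)            # adjacency matrix (n x n)
features = data['fea'].astype(float)     # node features (n x d)
labels = data['gnd'].reshape(-1) - 1     # ground-truth labels, shifted to start at 0

if not sp.issparse(adj):                 # read_dataset keeps the adjacency sparse
    adj = sp.csc_matrix(adj)
if sp.issparse(features):                # ...and the features dense
    features = features.toarray()

print(adj.shape, features.shape, len(np.unique(labels)))
```

`read_dataset` additionally returns the number of distinct labels, which `run.py` and `tune_power.py` use as the number of clusters when `--n_clusters=0`.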
/data/pubmed.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chakib401/graph_convolutional_clustering/e13e0d5a8b87659986ff128c23a6750fef8ef6a0/data/pubmed.mat -------------------------------------------------------------------------------- /data/wiki.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chakib401/graph_convolutional_clustering/e13e0d5a8b87659986ff128c23a6750fef8ef6a0/data/wiki.mat -------------------------------------------------------------------------------- /gcc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chakib401/graph_convolutional_clustering/e13e0d5a8b87659986ff128c23a6750fef8ef6a0/gcc/__init__.py -------------------------------------------------------------------------------- /gcc/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | from sklearn.metrics import confusion_matrix, silhouette_score, davies_bouldin_score 3 | from scipy.optimize import linear_sum_assignment 4 | from sklearn.metrics.cluster import normalized_mutual_info_score as nmi 5 | import numpy as np 6 | from sklearn.metrics import adjusted_rand_score as ari 7 | 8 | 9 | def ordered_confusion_matrix(y_true, y_pred): 10 | conf_mat = confusion_matrix(y_true, y_pred) 11 | w = np.max(conf_mat) - conf_mat 12 | row_ind, col_ind = linear_sum_assignment(w) 13 | conf_mat = conf_mat[row_ind, :] 14 | conf_mat = conf_mat[:, col_ind] 15 | return conf_mat 16 | 17 | 18 | def clustering_accuracy(y_true, y_pred): 19 | conf_mat = ordered_confusion_matrix(y_true, y_pred) 20 | return np.trace(conf_mat) / np.sum(conf_mat) 21 | 22 | 23 | def clustering_f1_score(y_true, y_pred, **kwargs): 24 | def cmat_to_psuedo_y_true_and_y_pred(cmat): 25 | y_true = [] 26 | y_pred = [] 27 | for true_class, row in enumerate(cmat): 28 | for pred_class, elm in enumerate(row): 29 | y_true.extend([true_class] * elm) 30 | y_pred.extend([pred_class] * elm) 31 | return y_true, y_pred 32 | 33 | conf_mat = ordered_confusion_matrix(y_true, y_pred) 34 | pseudo_y_true, pseudo_y_pred = cmat_to_psuedo_y_true_and_y_pred(conf_mat) 35 | return metrics.f1_score(pseudo_y_true, pseudo_y_pred, **kwargs) 36 | 37 | 38 | def output_metrics(X, y_true, y_pred): 39 | return [ 40 | clustering_accuracy(y_true, y_pred), 41 | nmi(y_true, y_pred), 42 | ari(y_true, y_pred), 43 | clustering_f1_score(y_true, y_pred, average='macro'), 44 | davies_bouldin_score(X, y_pred), 45 | silhouette_score(X, y_pred) 46 | ] 47 | 48 | 49 | def print_metrics(metrics_means, metrics_stds, time_mean=None, time_std=None): 50 | if time_mean is not None: print(f'time_mean:{time_mean} ', end='') 51 | print(f'loss_mean:{metrics_means[6]} ' 52 | f'acc_mean:{metrics_means[0]} ' 53 | f'ari_mean:{metrics_means[2]} ' 54 | f'nmi_mean:{metrics_means[1]} ' 55 | f'db_mean:{metrics_means[4]} ' 56 | f'sil_mean:{metrics_means[5]} ' 57 | f'f1_mean:{metrics_means[3]} ', end=' ') 58 | 59 | if time_std is not None: print(f'time_std:{time_std} ', end='') 60 | print(f'loss_std:{metrics_stds[6]} ' 61 | f'acc_std:{metrics_stds[0]} ' 62 | f'ari_std:{metrics_stds[2]} ' 63 | f'nmi_std:{metrics_stds[1]} ' 64 | f'f1_std:{metrics_stds[3]} ' 65 | f'db_std:{metrics_stds[4]} ' 66 | f'sil_std:{metrics_stds[5]} ') 67 | -------------------------------------------------------------------------------- /gcc/optimizer.py: 
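`gcc/optimizer.py` below implements the joint node representation learning and clustering step as an alternating scheme: the projection `W` is initialized with randomized PCA and then updated through an SVD (orthogonal Procrustes-style) step, the assignments `G` are updated by nearest-centroid search in the projected space `X @ W`, and the centroids `F` are recomputed as per-cluster means, until the reconstruction error ||X - F_G W^T|| stops improving by more than the tolerance. A minimal usage sketch on synthetic data (the call matches `optimize` below; the sizes are made up, and note that `run.py` sets the embedding dimension `f` equal to the number of clusters):

```python
# Minimal usage sketch with synthetic data (not from the repository).
import numpy as np
from gcc.optimizer import optimize

X = np.random.rand(500, 50)          # (n, d) float64 feature matrix
k = 7                                # number of clusters
G, F, W, losses = optimize(X, k=k, f=k, max_iter=30, tolerance=10e-7)

print(G.shape)             # (500,)  cluster assignment of each node
print(F.shape)             # (7, 7)  cluster centroids in the embedded space
print(W.shape)             # (50, 7) orthonormal projection; X @ W is the embedding
print(float(losses[-1]))   # final reconstruction error ||X - F_G W^T||
```

In `run.py` and `tune_power.py` the matrix passed to `optimize` is the propagated feature matrix, and the clustering metrics are then computed on `features @ W`.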
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from sklearn.cluster import KMeans 3 | from sklearn.decomposition import PCA 4 | 5 | 6 | def update_rule_F(XW, G, k): 7 | F = tf.math.unsorted_segment_mean(XW, G, k) 8 | return F 9 | 10 | 11 | def update_rule_W(X, F, G): 12 | _, U, V = tf.linalg.svd(tf.transpose(X) @ tf.gather(F, G), full_matrices=False) 13 | W = U @ tf.transpose(V) 14 | return W 15 | 16 | 17 | def update_rule_G(XW, F): 18 | distances = tf.reduce_sum(XW**2, axis=1, keepdims=True) + tf.reduce_sum(F**2, axis=1) - 2 * XW @ tf.transpose(F) 19 | G = tf.math.argmin(distances, 1, output_type=tf.dtypes.int32) 20 | return G 21 | 22 | 23 | def init_G_F(XW, k): 24 | km = KMeans(k).fit(XW) 25 | G = km.labels_ 26 | F = km.cluster_centers_ 27 | return G, F 28 | 29 | 30 | def init_W(X, f): 31 | pca = PCA(f, svd_solver='randomized').fit(X) 32 | W = pca.components_.T 33 | return W 34 | 35 | 36 | @tf.function 37 | def train_loop(X, F, G, W, k, max_iter, tolerance): 38 | losses = tf.TensorArray(tf.float64, size=0, dynamic_size=True) 39 | prev_loss = tf.float64.max 40 | 41 | for i in tf.range(max_iter): 42 | 43 | W = update_rule_W(X, F, G) 44 | XW = X @ W 45 | G = update_rule_G(XW, F) 46 | F = update_rule_F(XW, G, k) 47 | 48 | loss = tf.linalg.norm(X - tf.gather(F @ tf.transpose(W), G)) 49 | if prev_loss - loss < tolerance: 50 | break 51 | 52 | losses = losses.write(i, loss) 53 | prev_loss = loss 54 | 55 | return G, F, W, losses.stack() 56 | 57 | 58 | def optimize(X, k, f, max_iter=30, tolerance=10e-7): 59 | # init G and F 60 | W = init_W(X, f) 61 | G, F = init_G_F(X @ W, k) 62 | G, F, W, loss_history = train_loop(X, F, G, W, k, max_iter, tolerance) 63 | 64 | return G, F, W, loss_history 65 | -------------------------------------------------------------------------------- /gcc/run.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.util import deprecation 2 | deprecation._PRINT_DEPRECATION_WARNINGS = False 3 | 4 | import time 5 | import numpy as np 6 | from gcc.metrics import output_metrics, print_metrics 7 | from gcc.optimizer import optimize 8 | from gcc.utils import read_dataset, preprocess_dataset 9 | import tensorflow as tf 10 | 11 | flags = tf.compat.v1.flags 12 | FLAGS = flags.FLAGS 13 | 14 | # Parameters 15 | flags.DEFINE_string('dataset', 'cora', 'Name of the graph dataset (cora, citeseer, pubmed or wiki).') 16 | flags.DEFINE_integer('power', 5, 'Propagation order.') 17 | flags.DEFINE_integer('runs', 20, 'Number of runs per power.') 18 | flags.DEFINE_integer('n_clusters', 0, 'Number of clusters (0 for ground truth).') 19 | flags.DEFINE_integer('max_iter', 30, 'Number of iterations of the algorithm.') 20 | flags.DEFINE_float('tol', 10e-7, 'Tolerance threshold of convergence.') 21 | 22 | dataset = flags.FLAGS.dataset 23 | power = flags.FLAGS.power 24 | runs = flags.FLAGS.runs 25 | n_clusters = flags.FLAGS.n_clusters 26 | max_iter = flags.FLAGS.max_iter 27 | tolerance = flags.FLAGS.tol 28 | 29 | 30 | # Read the dataset 31 | adj, features, labels, n_classes = read_dataset(dataset) 32 | if n_clusters == 0: n_clusters = n_classes 33 | # Process the dataset 34 | tf_idf = (dataset == 'cora' or dataset == 'citeseer') # normalize binary word datasets 35 | norm_adj, features = preprocess_dataset(adj, features, tf_idf=tf_idf) 36 | 37 | 38 | run_metrics = [] 39 | times = [] 40 | 41 | X = features 42 | 43 | for run in range(runs): 44 | features = X 45 | t0 = time.time() 46 | 
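    # Propagation step: multiply the features `power` times by the normalized
    # adjacency, i.e. compute the order-`power` graph convolution that the
    # clustering step below operates on (tune_power.py performs the same
    # multiplication incrementally, one extra power per tested order).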
for _ in range(power): 47 | features = norm_adj @ features 48 | 49 | G, F, W, losses = optimize(features, n_clusters, n_clusters, 50 | max_iter=max_iter, tolerance=tolerance) 51 | time_it_took = time.time() - t0 52 | metrics = output_metrics(features @ W, labels, G) 53 | run_metrics.append(metrics + [losses[-1]]) 54 | times.append(time_it_took) 55 | 56 | print_metrics(np.mean(run_metrics, 0), np.std(run_metrics, 0), np.mean(times), np.std(times)) 57 | 58 | -------------------------------------------------------------------------------- /gcc/tune_power.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.util import deprecation 2 | deprecation._PRINT_DEPRECATION_WARNINGS = False 3 | 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | from gcc.metrics import output_metrics, print_metrics 8 | from gcc.optimizer import optimize 9 | from gcc.utils import read_dataset, is_close, preprocess_dataset 10 | 11 | flags = tf.compat.v1.flags 12 | FLAGS = flags.FLAGS 13 | 14 | # Parameters 15 | flags.DEFINE_string('dataset', 'cora', 'Name of the graph dataset (cora, citeseer, pubmed or wiki).') 16 | flags.DEFINE_integer('min_power', 1, 'Smallest propagation order to test.') 17 | flags.DEFINE_integer('max_power', 150, 'Largest propagation order to test.') 18 | flags.DEFINE_integer('runs', 20, 'Number of runs per power.') 19 | flags.DEFINE_integer('n_clusters', 0, 'Number of clusters (0 for ground truth).') 20 | flags.DEFINE_integer('max_iter', 30, 'Number of iterations of the algorithm.') 21 | flags.DEFINE_float('tol', 10e-7, 'Tolerance threshold of convergence.') 22 | 23 | dataset = flags.FLAGS.dataset 24 | min_power = flags.FLAGS.min_power 25 | max_power = flags.FLAGS.max_power 26 | runs = flags.FLAGS.runs 27 | n_clusters = flags.FLAGS.n_clusters 28 | max_iter = flags.FLAGS.max_iter 29 | tolerance = flags.FLAGS.tol 30 | 31 | # Read the dataset 32 | adj, features, labels, n_classes = read_dataset(dataset) 33 | if n_clusters == 0: n_clusters = n_classes 34 | # Process the dataset 35 | tf_idf = (dataset == 'cora' or dataset == 'citeseer') # normalize binary word datasets 36 | norm_adj, features = preprocess_dataset(adj, features, tf_idf=tf_idf) 37 | 38 | # compute min_power matrix 39 | for power in range(1, min_power): 40 | features = norm_adj @ features 41 | 42 | # apply the algorithm from min_power to max_power matrices 43 | global_metrics_means = [] 44 | global_metrics_stds = [] 45 | value_1 = np.nan 46 | 47 | for power in range(min_power, max_power + 1): 48 | print(f"== power {power} ==") 49 | 50 | features = norm_adj @ features 51 | run_metrics = [] 52 | 53 | for run in range(runs): 54 | G, F, W, losses = optimize(features, n_clusters, n_clusters, 55 | max_iter=max_iter, tolerance=tolerance) 56 | metrics = output_metrics(features @ W, labels, G) 57 | run_metrics.append(metrics + [losses[-1] if max_iter > 0 else 0]) 58 | 59 | global_metrics_means.append(np.mean(run_metrics, 0)) 60 | global_metrics_stds.append(np.std(run_metrics, 0)) 61 | 62 | print_metrics(global_metrics_means[-1], global_metrics_stds[-1]) 63 | 64 | value = global_metrics_means[-1][-1] 65 | if is_close(value, value_1, features.shape[1] / features.shape[0]): 66 | print(f'best power: {power-1}') 67 | break 68 | value_1 = value 69 | -------------------------------------------------------------------------------- /gcc/utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import scipy.io as sio 3 | import 
scipy.sparse as sp 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.preprocessing import normalize 7 | from sklearn.feature_extraction.text import TfidfTransformer 8 | 9 | 10 | def aug_normalized_adjacency(adj, add_loops=True): 11 | if add_loops: 12 | adj = adj + sp.eye(adj.shape[0]) 13 | adj = sp.coo_matrix(adj) 14 | row_sum = np.array(adj.sum(1)) 15 | d_inv_sqrt = np.power(row_sum, -0.5).flatten() 16 | d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 17 | d_mat_inv_sqrt = sp.diags(d_inv_sqrt) 18 | return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo() 19 | 20 | 21 | def row_normalize(mx, add_loops=True): 22 | if add_loops: 23 | mx = mx + sp.eye(mx.shape[0]) 24 | rowsum = np.array(mx.sum(1)) 25 | r_inv = np.power(rowsum, -1).flatten() 26 | r_inv[np.isinf(r_inv)] = 0. 27 | r_mat_inv = sp.diags(r_inv) 28 | mx = r_mat_inv.dot(mx) 29 | return mx 30 | 31 | 32 | def is_close(a, b, c): 33 | return np.abs(a - b) < c 34 | 35 | 36 | def read_dataset(dataset): 37 | data = sio.loadmat(os.path.join('data', f'{dataset}.mat')) 38 | features = data['fea'].astype(float) 39 | adj = data['W'] 40 | adj = adj.astype(float) 41 | if not sp.issparse(adj): 42 | adj = sp.csc_matrix(adj) 43 | if sp.issparse(features): 44 | features = features.toarray() 45 | labels = data['gnd'].reshape(-1) - 1 46 | n_classes = len(np.unique(labels)) 47 | return adj, features, labels, n_classes 48 | 49 | 50 | def preprocess_dataset(adj, features, row_norm=True, sym_norm=True, feat_norm=True, tf_idf=False): 51 | if sym_norm: 52 | adj = aug_normalized_adjacency(adj, True) 53 | if row_norm: 54 | adj = row_normalize(adj, True) 55 | 56 | if tf_idf: 57 | features = TfidfTransformer().fit_transform(features).toarray() 58 | if feat_norm: 59 | features = normalize(features) 60 | return adj, features 61 | 62 | 63 | def parse_logs(filename): 64 | import re 65 | with open(file=filename) as f: 66 | log = f.readlines() 67 | 68 | metrics_names = None 69 | metrics = [] 70 | 71 | for line in log: 72 | if line[0:4] != 'time' and line[0:4] != 'loss': continue 73 | if metrics_names is None: 74 | 75 | metrics_names = [m.group(1) for m in re.finditer(r'(\w+):', line)] 76 | for _ in metrics_names: 77 | metrics.append([]) 78 | 79 | metrics_values = [m.group(1) for m in re.finditer(r':([\d.e-]+)', line)] 80 | 81 | for i in range(len(metrics_values)): 82 | metrics[i].append(float(metrics_values[i])) 83 | metrics = np.array(metrics).T 84 | metrics = pd.DataFrame(metrics, columns=metrics_names, index=list(range(1, len(metrics)+1))) 85 | return metrics 86 | 87 | 88 | -------------------------------------------------------------------------------- /run_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "run_example.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "code", 22 | "metadata": { 23 | "id": "Fw_h_nnlNKCx" 24 | }, 25 | "source": [ 26 | "%%capture\n", 27 | "!git clone #" 28 | ], 29 | "execution_count": null, 30 | "outputs": [] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "metadata": { 35 | "id": "qLOH0qd1OS41" 36 | }, 37 | "source": [ 38 | "%%capture\n", 39 | "%cd gcc\n", 40 | "!python setup.py install" 41 | ], 42 | "execution_count": null, 43 | "outputs": [] 44 | }, 45 | { 46 | 
"cell_type": "code", 47 | "metadata": { 48 | "id": "w57mW8sjUJVK" 49 | }, 50 | "source": [ 51 | "import os\n", 52 | "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'" 53 | ], 54 | "execution_count": null, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "Z5nTF5HEUvpo" 61 | }, 62 | "source": [ 63 | "# Cora" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "DF4YfRE9OjMs" 70 | }, 71 | "source": [ 72 | "Look for the best propagation order $p^*$" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "colab": { 79 | "base_uri": "https://localhost:8080/" 80 | }, 81 | "id": "NpzElNRNOai7", 82 | "outputId": "ddfe1875-549c-44cd-f361-491bb66dff83" 83 | }, 84 | "source": [ 85 | "!python gcc/tune_power.py --dataset=cora --runs=20" 86 | ], 87 | "execution_count": null, 88 | "outputs": [ 89 | { 90 | "output_type": "stream", 91 | "text": [ 92 | "== power 1 ==\n", 93 | "loss_mean:37.72894157240302 acc_mean:0.6379246676514034 ari_mean:0.37261358791757965 nmi_mean:0.4620770042057192 db_mean:0.8948988373023328 sil_mean:0.3707301004049684 f1_mean:0.5875379323579427 loss_std:0.0013980628276328748 acc_std:0.008951806999017523 ari_std:0.010327043984988763 nmi_std:0.0071500211963945156 f1_std:0.008289687723084188 db_std:0.01177424265445584 sil_std:0.0034000414079086235 \n", 94 | "== power 2 ==\n", 95 | "loss_mean:32.061075316543636 acc_mean:0.654837518463811 ari_mean:0.4255620798073257 nmi_mean:0.5246589391251739 db_mean:0.8626559380633699 sil_mean:0.40543793869424594 f1_mean:0.6439043653313294 loss_std:0.0011390987083548657 acc_std:0.0031682522272971484 ari_std:0.002349287456697349 nmi_std:0.0012570045807346504 f1_std:0.002024766755679126 db_std:0.0036744329679806076 sil_std:0.002241030101741015 \n", 96 | "== power 3 ==\n", 97 | "loss_mean:28.801749715104883 acc_mean:0.6835302806499259 ari_mean:0.4472078524440882 nmi_mean:0.5440197147869125 db_mean:0.8062240662413634 sil_mean:0.43280731728506694 f1_mean:0.6638450351611765 loss_std:0.0006675021961365709 acc_std:0.0018793221048743632 ari_std:0.0035736346663822584 nmi_std:0.002483141329244369 f1_std:0.0012903721894303609 db_std:0.0029413051501388588 sil_std:0.0016675002734286095 \n", 98 | "== power 4 ==\n", 99 | "loss_mean:26.573637123998417 acc_mean:0.7111152141802068 ari_mean:0.47016980331536207 nmi_mean:0.5565845567569746 db_mean:0.7676809604925501 sil_mean:0.45104081344237523 f1_mean:0.6781075394073648 loss_std:0.002533875721113826 acc_std:0.00539964672449809 ari_std:0.007361017238340283 nmi_std:0.005359300174492489 f1_std:0.007587547451313547 db_std:0.01062213530915524 sil_std:0.0034750207720889398 \n", 100 | "== power 5 ==\n", 101 | "loss_mean:24.872427235886594 acc_mean:0.7362629246676511 ari_mean:0.5034340478968045 nmi_mean:0.5777665263876254 db_mean:0.6918204740177638 sil_mean:0.46831015773382656 f1_mean:0.697450496432252 loss_std:0.0047862229229960335 acc_std:0.004718361766406236 ari_std:0.006338993357693836 nmi_std:0.005416792565556466 f1_std:0.004532147982065274 db_std:0.014400513342688635 sil_std:0.0005118267020640779 \n", 102 | "== power 6 ==\n", 103 | "loss_mean:23.53135583053093 acc_mean:0.7350627769571638 ari_mean:0.5043459505072073 nmi_mean:0.5808950677203828 db_mean:0.6814524231194701 sil_mean:0.47658367480022984 f1_mean:0.6932325285542913 loss_std:0.012038630494599524 acc_std:0.013421285616561052 ari_std:0.012549666079697121 nmi_std:0.010128936807729106 f1_std:0.01939037705995324 db_std:0.02287028453267302 sil_std:0.003524817876123255 \n", 104 | "== power 7 
==\n", 105 | "loss_mean:22.42047340366046 acc_mean:0.740454209748892 ari_mean:0.5111216537199358 nmi_mean:0.5873383843180119 db_mean:0.6609672268563566 sil_mean:0.4846608436918182 f1_mean:0.700398485451832 loss_std:2.6943948484124818e-05 acc_std:0.00035563811455422215 ari_std:0.00019257447504031185 nmi_std:0.00034236635364756277 f1_std:0.0004575020282286334 db_std:0.0007836066984701632 sil_std:8.119050962381412e-05 \n", 106 | "== power 8 ==\n", 107 | "loss_mean:21.486733124503317 acc_mean:0.7418759231905465 ari_mean:0.5123289272867898 nmi_mean:0.5911354976858398 db_mean:0.6480828465545738 sil_mean:0.4912129538361669 f1_mean:0.7024495661693204 loss_std:3.552713678800501e-15 acc_std:0.0 ari_std:1.1102230246251565e-16 nmi_std:1.137640067256873e-16 f1_std:2.220446049250313e-16 db_std:2.4069061620089815e-16 sil_std:9.745623540847972e-14 \n", 108 | "== power 9 ==\n", 109 | "loss_mean:20.685689799484926 acc_mean:0.7448670605613 ari_mean:0.5193139546093788 nmi_mean:0.5937434383863518 db_mean:0.644802896391757 sil_mean:0.4948019496490687 f1_mean:0.7038487623700458 loss_std:0.0002512842109534511 acc_std:0.0005209282119522219 ari_std:0.00022602903852278263 nmi_std:0.0007197508171372224 f1_std:0.0009948097495612086 db_std:0.001887216354343911 sil_std:0.0007796212820746376 \n", 110 | "== power 10 ==\n", 111 | "loss_mean:19.98787919446329 acc_mean:0.7433530280649928 ari_mean:0.5137616652228074 nmi_mean:0.5924328621568712 db_mean:0.6382305938500219 sil_mean:0.49884695650302946 f1_mean:0.703331254269269 loss_std:0.0 acc_std:2.220446049250313e-16 ari_std:2.220446049250313e-16 nmi_std:1.8742721836921975e-16 f1_std:2.220446049250313e-16 db_std:3.8459253727671276e-16 sil_std:9.203960805108611e-14 \n", 112 | "== power 11 ==\n", 113 | "loss_mean:19.373306996219437 acc_mean:0.7407680945347119 ari_mean:0.5081828709154457 nmi_mean:0.5888304506270892 db_mean:0.6324772692804062 sil_mean:0.5018974534887386 f1_mean:0.7014166226389505 loss_std:0.0006682023225912658 acc_std:0.003219275438360919 ari_std:0.00685102718211624 nmi_std:0.003440703362579738 f1_std:0.0029041388805038375 db_std:0.006420005550295566 sil_std:0.001390560804157635 \n", 114 | "== power 12 ==\n", 115 | "loss_mean:18.826891009872284 acc_mean:0.7405649926144757 ari_mean:0.5084838760178194 nmi_mean:0.5892987764449148 db_mean:0.6287249174513053 sil_mean:0.5043585071158948 f1_mean:0.7007981811566687 loss_std:0.0008271905758939664 acc_std:0.0055972761904022105 ari_std:0.010448688647551739 nmi_std:0.005518187639981098 f1_std:0.005544006781458599 db_std:0.012233074425674456 sil_std:0.002265016000610164 \n", 116 | "== power 13 ==\n", 117 | "loss_mean:18.33735714976407 acc_mean:0.739290989660266 ari_mean:0.5070188972396179 nmi_mean:0.5895378158191613 db_mean:0.6178236339144783 sil_mean:0.5075940981586503 f1_mean:0.6971267941015611 loss_std:0.0005930446552927568 acc_std:0.015334970304908508 ari_std:0.02209688942783561 nmi_std:0.009225336818219704 f1_std:0.02740271688764189 db_std:0.012434732151081394 sil_std:0.0009507052752318993 \n", 118 | "best power: 12\n" 119 | ], 120 | "name": "stdout" 121 | } 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "id": "O9V7J0MnOsK-" 128 | }, 129 | "source": [ 130 | "Run and time GCC with the $p^*$ we found" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "colab": { 137 | "base_uri": "https://localhost:8080/" 138 | }, 139 | "id": "eXC2AEZoOnuJ", 140 | "outputId": "149050d1-b6f5-4721-9cee-6313077d796d" 141 | }, 142 | "source": [ 143 | "!python gcc/run.py 
--dataset=cora --runs=20 --power=12" 144 | ], 145 | "execution_count": null, 146 | "outputs": [ 147 | { 148 | "output_type": "stream", 149 | "text": [ 150 | "time_mean:0.9566853046417236 loss_mean:18.826611717479942 acc_mean:0.742374446085672 ari_mean:0.5119025676709507 nmi_mean:0.5910296072647834 db_mean:0.6245465875542179 sil_mean:0.5051477690924092 f1_mean:0.7025674705487899 time_std:0.37521120899406357 loss_std:1.1760781033633758e-05 acc_std:0.00017613353054231088 ari_std:0.00020002554635452713 nmi_std:0.0003742634234070547 f1_std:0.0002451638024080748 db_std:0.00033162472536623207 sil_std:0.00011268071055729382 \n" 151 | ], 152 | "name": "stdout" 153 | } 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "QQPZNsF9W1AJ" 160 | }, 161 | "source": [ 162 | "# Wiki" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "id": "MpIpAPrsWziY" 169 | }, 170 | "source": [ 171 | "Look for the best propagation order $p^*$" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "colab": { 178 | "base_uri": "https://localhost:8080/" 179 | }, 180 | "id": "_SnXSFWwWziZ", 181 | "outputId": "2d35dac9-cdf9-404d-d7a5-4d17c053afc3" 182 | }, 183 | "source": [ 184 | "!python gcc/tune_power.py --dataset=wiki --runs=20" 185 | ], 186 | "execution_count": null, 187 | "outputs": [ 188 | { 189 | "output_type": "stream", 190 | "text": [ 191 | "== power 1 ==\n", 192 | "loss_mean:32.91920724143249 acc_mean:0.5251559251559251 ari_mean:0.3498175452840336 nmi_mean:0.5409570475870014 db_mean:0.9020850823344866 sil_mean:0.40973383827514853 f1_mean:0.451675257256983 loss_std:0.0017449979242582414 acc_std:0.0012264350273391014 ari_std:0.006488968295549446 nmi_std:0.0040126774189067765 f1_std:0.0045997444604448795 db_std:0.009633257377839452 sil_std:0.001597502630994793 \n", 193 | "== power 2 ==\n", 194 | "loss_mean:27.964185640281148 acc_mean:0.5362993762993763 ari_mean:0.3351791993955956 nmi_mean:0.5428955509517364 db_mean:0.9140293808301818 sil_mean:0.39900060676008364 f1_mean:0.4611284913758483 loss_std:0.008606480068281871 acc_std:0.008074026135586318 ari_std:0.010201088127886157 nmi_std:0.0036784829535050235 f1_std:0.013886070258902525 db_std:0.016033691328648927 sil_std:0.002051351980291322 \n", 195 | "== power 3 ==\n", 196 | "loss_mean:24.989721317813597 acc_mean:0.5373388773388772 ari_mean:0.3311860803182541 nmi_mean:0.549320775305108 db_mean:0.9092105904415904 sil_mean:0.39674449262504774 f1_mean:0.4635493704558943 loss_std:0.005355526494606255 acc_std:0.0048882640000951294 ari_std:0.006440563125750927 nmi_std:0.003027882555618252 f1_std:0.007895495661664403 db_std:0.007778405524250394 sil_std:0.0014443531233519578 \n", 197 | "== power 4 ==\n", 198 | "loss_mean:22.9018097108406 acc_mean:0.5457380457380457 ari_mean:0.3351320477138375 nmi_mean:0.5508744784135976 db_mean:0.8924438992868623 sil_mean:0.3989583258465662 f1_mean:0.4637045360973941 loss_std:0.003954032603618392 acc_std:0.01183206093566087 ari_std:0.00828386731115925 nmi_std:0.0031393893533866657 f1_std:0.010457928425343507 db_std:0.020578998957610204 sil_std:0.0028164955075205356 \n", 199 | "== power 5 ==\n", 200 | "loss_mean:21.297265301412445 acc_mean:0.5416424116424117 ari_mean:0.32626874151052326 nmi_mean:0.5473644743304191 db_mean:0.8684707262275847 sil_mean:0.39976425074885696 f1_mean:0.45943639344571024 loss_std:0.008013658561169229 acc_std:0.016539386021718182 ari_std:0.00581426034278349 nmi_std:0.004713758543041092 f1_std:0.010411228243172702 
db_std:0.028249774928581148 sil_std:0.005955697895368417 \n", 201 | "best power: 4\n" 202 | ], 203 | "name": "stdout" 204 | } 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "id": "FQOJdWmQWzia" 211 | }, 212 | "source": [ 213 | "Run and time GCC with the $p^*$ we found" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "metadata": { 219 | "colab": { 220 | "base_uri": "https://localhost:8080/" 221 | }, 222 | "id": "hAOb9ELkWzib", 223 | "outputId": "548c8b67-fa92-4ff4-c510-79cb048caef6" 224 | }, 225 | "source": [ 226 | "!python gcc/run.py --dataset=wiki --runs=20 --power=4" 227 | ], 228 | "execution_count": 5, 229 | "outputs": [ 230 | { 231 | "output_type": "stream", 232 | "text": [ 233 | "time_mean:2.890522134304047 loss_mean:22.905084648963282 acc_mean:0.5433056133056132 ari_mean:0.33836188205470885 nmi_mean:0.5517095712658524 db_mean:0.90665948906491 sil_mean:0.3973514098862267 f1_mean:0.4640283925171606 time_std:0.4022031588416949 loss_std:0.003818735516175145 acc_std:0.010529093662638574 ari_std:0.009432940837766465 nmi_std:0.0034867008470824286 f1_std:0.005921428592408881 db_std:0.010874405862369681 sil_std:0.002167428312082353 \n" 234 | ], 235 | "name": "stdout" 236 | } 237 | ] 238 | } 239 | ] 240 | } -------------------------------------------------------------------------------- /schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chakib401/graph_convolutional_clustering/e13e0d5a8b87659986ff128c23a6750fef8ef6a0/schematic.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | setup(name='gcc', 5 | description='Efficient Graph Convolutional Representation Learning for Graph Clustering.', 6 | install_requires=[ 7 | "numpy~=1.19.5", 8 | "tensorflow~=2.4.1", 9 | "scipy~=1.5.3", 10 | "pandas~=0.25.1", 11 | "scikit-learn~=0.22", 12 | "setuptools~=41.4.0" 13 | ], 14 | package_data={'gcc': ['README.md']}, 15 | packages=find_packages()) 16 | --------------------------------------------------------------------------------