├── .gitignore ├── .gitmodules ├── LICENSE ├── MANIFEST.in ├── README.md ├── bin └── locale_alg ├── data └── zachary.mtx ├── docker ├── Dockerfile ├── build.sh └── run.sh ├── images └── locale.png ├── notebooks └── SDP_Community_Detection.ipynb ├── sdp_clustering ├── __init__.py └── models.py ├── setup.py └── src ├── cluster.cpp ├── cluster.h └── pybind.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.so 4 | build 5 | dist 6 | *.egg-info 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/pybind11"] 2 | ignore = dirty 3 | path = third_party/pybind11 4 | url = https://github.com/pybind/pybind11.git 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 CMU Locus Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | include src/*.h* 3 | recursive-include third_party/pybind11/include *.h 4 | recursive-include third_party/pybind11/pybind11 *.py 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SDP Clustering • [![PyPi][pypi-image]][pypi] [![colab][colab-image]][colab] [![License][license-image]][license] 2 | 3 | [license-image]: https://img.shields.io/badge/License-MIT-yellow.svg 4 | [license]: LICENSE 5 | 6 | [pypi-image]: https://img.shields.io/pypi/v/sdp-clustering.svg 7 | [pypi]: https://pypi.python.org/pypi/sdp-clustering 8 | 9 | [colab-image]: https://colab.research.google.com/assets/colab-badge.svg 10 | [colab]: https://colab.research.google.com/drive/16d06iAViZHJ58S-RmwAzKR_TyWi5FE-V#offline=true&sandboxMode=true 11 | 12 | *Community detection using fast low-cardinality semidefinite programming* 13 | 14 | This repository contains the source code to reproduce the experiments in the NeurIPS'20 paper [Community detection using fast low-cardinality semidefinite programming](https://arxiv.org/abs/2012.02676) by [Po-Wei Wang](https://powei.tw/) and [J. Zico Kolter](http://zicokolter.com/). 15 | 16 | ## What the package provides 17 | It detects communities (that is, clustering with an unknown number of clusters) by maximizing a metric called modularity. 18 | Further, it provides sparse embeddings for nodes in a graph. 
19 | 20 | #### How it works 21 | We relax the (combinatorial) modularity maximization problem to a smooth semidefinite program (SDP) by converting the Kronecker delta into a dot-product. 22 | By further controlling the cardinality (sparsity) in the dot-product space, 23 | we develop an efficient optimization algorithm that scales linearly with the number of data entries. See the paper for more details. 24 | ![Conversion](images/locale.png) 25 | 26 | ## Installation 27 | 28 | ### Via pip 29 | ```bash 30 | pip install sdp-clustering 31 | ``` 32 | 33 | ### From source 34 | ```bash 35 | git clone --recursive https://github.com/locuslab/sdp_clustering 36 | cd sdp_clustering && python setup.py install 37 | ``` 38 | 39 | #### Package Dependencies 40 | ``` 41 | conda install numpy scipy 42 | ``` 43 | 44 | ## Running experiments 45 | After installation, the package provides a command-line utility **locale_alg** accepting matrix-market format. 46 | For example, to detect communities in Zachary Karate Club and output the result in *labels.txt*, run 47 | ```bash 48 | locale_alg data/zachary.mtx --out labels.txt 49 | ``` 50 | To obtain the low-cardinality embedding (without rounding) with cardinality ≤2, run 51 | ```bash 52 | locale_alg data/zachary.mtx --out emb.txt --embedding --k=2 53 | ``` 54 | 55 | ### Experiment parameters 56 | All experiments can be replicated with the default parameters (k=8), except that the Amazon data requires k=16. 57 | 58 | ## API 59 | See **bin/locale_alg** for the example usage. 60 | Mainly, the package provides 3 functions 61 | ```python 62 | locale_embedding: obtain embeddings from the continuous optimization algorithm 63 | leiden_locale: obtain community assignments by the hierarchical Leiden-Locale algorithm 64 | init_random_seed: set random seed 65 | ``` 66 | For more details, see *sdp_clustering/models.py*. 67 | For even more details, see the C++ implementation in the *src/* folder. 
def main():
    """Command-line entry point: cluster or embed a Matrix Market graph.

    Parses the command-line options, loads the graph named by
    ``graph_input`` with ``scipy.io.mmread``, calls ``init_random_seed``
    with ``--seed``, and then either

    * calls ``locale_embedding`` (when ``--embedding`` is given) and, if
      ``--out`` is set, writes the result with its ``savetxt`` method, or
    * calls ``leiden_locale`` and, if ``--out`` is set, writes the labels
      as integers, one per line, with ``np.savetxt``.

    When ``--out`` is omitted nothing is written to disk.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('graph_input', type=str,
            help='Input graph in symmetric Matrix Market format')
    parser.add_argument('--k', type=int, default=8,
            help='Cardinality for embeddings (int)')
    parser.add_argument('--eps', type=float, default=1e-6,
            help='Stopping criterion for optimization problem (float)')
    parser.add_argument('--max_outer', type=int, default=10,
            help='Maximum number of outer iterations (int)')
    parser.add_argument('--max_lv', type=int, default=10,
            help='Maximum number of levels in an outer iteration (int)')
    parser.add_argument('--max_inner', type=int, default=2,
            help='Maximum number of inner iters for optimization (int)')
    parser.add_argument('--seed', type=int, default=1234,
            help='random seed (int)')
    parser.add_argument('--verbose', type=int, default=1,
            help='Verbosity')
    parser.add_argument('--embedding', action='store_true',
            help='Output embedding instead of labels')
    parser.add_argument('--out', type=str, default=None,
            help='Output clustering labels or embeddings (default no output)')
    args = parser.parse_args()

    graph = mmread(args.graph_input)

    # Seed before any solver call so repeated runs with the same --seed
    # go through the same seeding step.
    init_random_seed(args.seed)
    if args.embedding:
        E = locale_embedding(graph, args.k, args.eps, args.max_inner, args.verbose)
        if args.out:
            E.savetxt(args.out)
    else:
        labels = leiden_locale(graph, args.k, args.eps, args.max_outer, args.max_lv, args.max_inner, args.verbose)
        if args.out:
            # One integer label per output line.
            np.savetxt(args.out, labels, fmt='%d', delimiter='\n')


if __name__ == '__main__':
    main()
--gid ${GROUP_ID} --gecos '' --shell /bin/bash ${USER_NAME} 15 | RUN usermod -d ${HOME_DIR} ${USER_NAME} 16 | RUN adduser --quiet ${USER_NAME} sudo ; echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 17 | 18 | RUN mkdir -p /data 19 | WORKDIR /data 20 | 21 | USER ${USER_NAME} 22 | -------------------------------------------------------------------------------- /docker/build.sh: -------------------------------------------------------------------------------- 1 | docker image build \ 2 | --build-arg USER_ID=$(id -u ${USER}) \ 3 | --build-arg GROUP_ID=$(id -g ${USER}) \ 4 | --build-arg USER_NAME=$(whoami) \ 5 | --build-arg HOME_DIR=$HOME \ 6 | -t cluster . 7 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | DATA_VOLUME="-v $(pwd)/..:/data" 2 | HOME_VOLUME="-v $HOME:$HOME" 3 | docker run --rm --privileged --runtime=nvidia -it --net=host --ipc=host ${DATA_VOLUME} ${HOME_VOLUME} -v /etc/localtime:/etc/localtime:ro cluster bash -l 4 | -------------------------------------------------------------------------------- /images/locale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/locuslab/sdp_clustering/446a2bbae2b15c7c9fac15af74414be8950e867f/images/locale.png -------------------------------------------------------------------------------- /notebooks/SDP_Community_Detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SDP Community Detection.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | } 13 | }, 14 | "cells": [ 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "id": "PffV6Yr7PQbp" 19 | }, 20 | "source": [ 21 | "# Community detection using fast 
low-cardinality semidefinite programming\n", 22 | "Welcome! This is the Colab for our [SDP community detection paper](https://arxiv.org/abs/2012.02676) in NeurIPS'20.\n", 23 | "\n", 24 | "In this tutorial, we will demonstrate how to use the sdp_clustering package.\n", 25 | "## Installation\n", 26 | "The following command will download the code repository and compile it." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "D5xvysRyxNpD" 33 | }, 34 | "source": [ 35 | "%%capture\n", 36 | "!rm -rf sdp_clustering\n", 37 | "!git clone --recursive https://github.com/locuslab/sdp_clustering sdp_clustering\n", 38 | "!cd sdp_clustering && python setup.py develop" 39 | ], 40 | "execution_count": 1, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "4rYMD-1cAP67", 50 | "outputId": "70bc0e82-fc28-4179-8599-d2742ec34347" 51 | }, 52 | "source": [ 53 | "cd sdp_clustering/" 54 | ], 55 | "execution_count": 2, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "/content/sdp_clustering\n" 61 | ], 62 | "name": "stdout" 63 | } 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "id": "VDXv4Qd4P6Bj" 70 | }, 71 | "source": [ 72 | "## Demonstration: Zachary Karate club\n", 73 | "In the following session, we will demo the package by showing results for the famous example from Zachary Karate Club.\n", 74 | "\n", 75 | "First, we load the graph and plot it." 
76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "aOYd2q79KVGm" 82 | }, 83 | "source": [ 84 | "data = 'data/zachary.mtx'" 85 | ], 86 | "execution_count": 3, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "colab": { 93 | "base_uri": "https://localhost:8080/", 94 | "height": 319 95 | }, 96 | "id": "-89lXvnLK3Mg", 97 | "outputId": "b37bc8d4-d086-4b16-8d4a-02b54944a02a" 98 | }, 99 | "source": [ 100 | "import matplotlib.pyplot as plt\n", 101 | "import networkx as nx\n", 102 | "import numpy as np\n", 103 | "from scipy.io import mmread\n", 104 | "graph = mmread(data)\n", 105 | "gx = nx.convert_matrix.from_numpy_matrix(graph.todense())\n", 106 | "nx.draw(gx)" 107 | ], 108 | "execution_count": 4, 109 | "outputs": [ 110 | { 111 | "output_type": "display_data", 112 | "data": { 113 | "image/png": "\n", 114 | "text/plain": [ 115 | "
" 116 | ] 117 | }, 118 | "metadata": { 119 | "tags": [] 120 | } 121 | } 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "id": "QKUOvlKvR9fp" 128 | }, 129 | "source": [ 130 | "### Community detection\n", 131 | "The goal of **community detection** is to find the hidden communities / clusters inside a network. That is, it is a clustering problem with an unknown number of clusters.\n", 132 | "\n", 133 | "One way to solve the community detection problem is by maximizing a metric function called **modularity**, which can be defined as\n", 134 | "\n", 135 | "$Q(c):=\\frac{1}{2m}\\sum_{ij} \\left[a_{ij}-\\frac{d_id_j}{2m}\\right]\\mathbb{1}(c_i=c_j),$\n", 136 | "\n", 137 | "where $a_{ij}$ is the edge weight connecting nodes $i$ and $j$, \n", 138 | "$d_i=\\sum_j a_{ij}$ is the degree for node $i$, $m=\\sum_{ij}a_{ij}/2$ is the sum of edge weights,\n", 139 | "and $c_i\\in [r]$ is the community assignment for node $i$ among the $r$ possible communities. The higher the modularity, the more organized the community assignment. Thus, we can maximize the metric function to obtain a good community assignment. However, the maximization problem is NP-complete and highly non-convex.\n", 140 | "Thus, we proposed a smooth semidefinite relaxation to approximate the problem.\n", 141 | "\n", 142 | "### The locale relaxation\n", 143 | "The core idea of the Locale relaxation is that we can transform a Kronecker delta into a dot product\n", 144 | "\n", 145 | "$\\mathbb{1}(c_i=c_j) = v_i^T v_j,\\;\\text{where }\\; v_i\\in\\{e(1),e(2),\\ldots\\}$, and the set $\\{e(t)\\}$ is the standard basis. \n", 146 | "\n", 147 | "For example, if two nodes are in the same cluster, we can assign both of them as $e(1)$, so $\\mathbb{1}(c_i=c_j) = e(1)^Te(1) = 1$, and zero otherwise. 
\n", 148 | "\n", 149 | "The new encoding is exact and is still NP-complete, but we can further relax the standard basis into a smooth and continuous set containing the basis.\n", 150 | "![Alt text](https://raw.githubusercontent.com/locuslab/sdp_clustering/main/images/locale.png)\n", 151 | "Further, we can control the cardinality / sparsity of the relaxation to control the smoothness of the relaxed problem.\n", 152 | "We call the relaxation the **Lo**w-**ca**rdina**l**ity **e**mbedding, that is, the **Locale** relaxation.\n", 153 | "\n", 154 | "## The API\n", 155 | "Here, we show the embedding obtained from the demo graph (*zachary.mtx*).\n", 156 | "The embeddings are sparse and presented as\n", 157 | "\n", 158 | "$\\text{index}_1:\\text{value}_1\\;\\;\\;\\;\\text{index}_2:\\text{value}_2 \\ldots$ \n", 159 | "\n", 160 | "for the 34 nodes in the graph." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "colab": { 167 | "base_uri": "https://localhost:8080/" 168 | }, 169 | "id": "espamqlIK5q8", 170 | "outputId": "6152178c-9e83-4bda-87e6-e7064b6cecbd" 171 | }, 172 | "source": [ 173 | "from sdp_clustering import init_random_seed, leiden_locale, locale_embedding\n", 174 | "\n", 175 | "init_random_seed(1234)\n", 176 | "# obtain the embedding\n", 177 | "E = locale_embedding(graph, k=2, eps=1e-3, max_inner=10, verbose=0)\n", 178 | "print(E)" 179 | ], 180 | "execution_count": 5, 181 | "outputs": [ 182 | { 183 | "output_type": "stream", 184 | "text": [ 185 | "(1) 2:0.86\t7:0.52\t\n", 186 | "(2) 2:0.98\t8:0.21\t\n", 187 | "(3) 8:0.84\t2:0.54\t\n", 188 | "(4) 2:0.91\t8:0.41\t\n", 189 | "(5) 7:1.00\t6:0.10\t\n", 190 | "(6) 7:0.99\t6:0.11\t\n", 191 | "(7) 7:0.99\t6:0.11\t\n", 192 | "(8) 2:0.90\t8:0.43\t\n", 193 | "(9) 9:0.99\t8:0.13\t\n", 194 | "(10) 8:0.92\t9:0.39\t\n", 195 | "(11) 7:0.99\t6:0.10\t\n", 196 | "(12) 2:0.83\t7:0.56\t\n", 197 | "(13) 2:0.98\t8:0.18\t\n", 198 | "(14) 2:0.90\t8:0.44\t\n", 199 | "(15) 9:0.88\t10:0.48\t\n", 200 | "(16) 
9:0.88\t10:0.48\t\n", 201 | "(17) 7:0.99\t6:0.14\t\n", 202 | "(18) 2:0.99\t7:0.16\t\n", 203 | "(19) 9:0.88\t10:0.48\t\n", 204 | "(20) 2:1.00\t7:0.03\t\n", 205 | "(21) 9:0.88\t10:0.48\t\n", 206 | "(22) 2:0.99\t7:0.16\t\n", 207 | "(23) 9:0.88\t10:0.48\t\n", 208 | "(24) 10:0.73\t24:0.69\t\n", 209 | "(25) 24:0.83\t32:0.55\t\n", 210 | "(26) 24:0.79\t32:0.61\t\n", 211 | "(27) 10:0.79\t9:0.62\t\n", 212 | "(28) 24:0.91\t10:0.42\t\n", 213 | "(29) 32:0.84\t8:0.54\t\n", 214 | "(30) 10:0.84\t9:0.55\t\n", 215 | "(31) 9:0.99\t10:0.13\t\n", 216 | "(32) 32:0.83\t24:0.55\t\n", 217 | "(33) 9:0.89\t10:0.46\t\n", 218 | "(34) 9:0.84\t10:0.54\t\n", 219 | "\n" 220 | ], 221 | "name": "stdout" 222 | } 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": { 228 | "id": "9SzIf1hbZapb" 229 | }, 230 | "source": [ 231 | "We show that the embeddings capture the cluster structure by showing their covariance matrix. The optimal assignment of the demo graph has 4 clusters, \n", 232 | "so correspondingly the optimal covariance matrix has 4 blocks. 
" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "metadata": { 238 | "colab": { 239 | "base_uri": "https://localhost:8080/", 240 | "height": 283 241 | }, 242 | "id": "nUs-47aALBZX", 243 | "outputId": "3eb6edf4-8591-4d93-b329-43be303675f0" 244 | }, 245 | "source": [ 246 | "# obtain clustering result\n", 247 | "labels = leiden_locale(graph)\n", 248 | "# re-order the embedding\n", 249 | "order = sorted(list(range(len(labels))), key=lambda x: labels[x])\n", 250 | "V = E.to_scipy()[order]\n", 251 | "# create the covariance matrix\n", 252 | "C = (V@V.T).todense()\n", 253 | "plt.imshow(C)" 254 | ], 255 | "execution_count": 6, 256 | "outputs": [ 257 | { 258 | "output_type": "execute_result", 259 | "data": { 260 | "text/plain": [ 261 | "" 262 | ] 263 | }, 264 | "metadata": { 265 | "tags": [] 266 | }, 267 | "execution_count": 6 268 | }, 269 | { 270 | "output_type": "display_data", 271 | "data": { 272 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD5CAYAAADhukOtAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATS0lEQVR4nO3deZBV5ZnH8e/TK7KJrLYsooISjBFcUGcsJzEjOpqoSaVSJjUOVUOFLFouNRnHcabUpFJxqUkyzpTR0WjCpFwnG46aEIKkYmJEwCCCG2IgikAj0gmiNt3NM3/c01Md+31v9+17z+nG9/ep6urbz7nved/b8PS5933POY+5OyLy/lc32AMQkWIo2UUSoWQXSYSSXSQRSnaRRCjZRRLRUE1jMzsHuAWoB77j7jeWe/74sfU+fWpjr3hrV1O0TWvbwcH4waP3Rts4FoxPawy32dnVe0zd3t4fHtuEhj3RNu0e3l+zdUTbbGqbFN1WhOZX479POXC8y172eXswAQac7GZWD9wKnAW8Bqwys4fc/blYm+lTG3lq6dRe8Vvbese63bLkY8H4x+evjLbp8Ppg/D8OWxWM3942ObqvdXvDY/vChF9G22zcNzEYn9nUGm1z4UOXR7fVjIf/CALMvPzJ/PuX3K305dFt1byNnwe87O6vuPs+4H7ggir2JyI5qibZJwOv9vj5tSwmIkNQ7hN0ZrbIzFab2eqdu7ry7k5EIqpJ9q1Azw+0U7LYn3H3O9z9JHc/acK48GdpEclfNbPxq4CZZnYEpSS/CPhsuQatXU3BybhLxrwaeHbJT0/bHIx/ZdIT0TZdhC/u+fobc4Lxa8a/GN3X7lEbg/GOSB8Ahw/fEYy3+/5om5mXxScci1A3fHgw3nHKrGC8fsXTeQ5HMvXHzIhu63rx5Yr2NeBkd/dOM7sUWEpp6e1ud98w0P2JSL6qWmd390eBR2s0FhHJkc6gE0mEkl0kEUp2kUQo2UUSUdUEXaVa2w4OnuseW14DePjonwbjt7bNjLYZUdcejC9ecmYw/vNTPhDd199P+
3Uwft0vPxltg0WW5cqcm340T8X3VwCb0hKMz7o5vMCy8eQ8RyPdNl08Ibpt+r9WtvSmI7tIIpTsIolQsoskQskukgglu0giCp2NP3j03uAdZspd1BKbdS938UzMb+avC8ZvPmxZtM0h9eELRKad9Z1om2GR20+9G7ldFcANfCi6rRC7dgfDj997YjB+KPF/M6mdlt901mxfOrKLJELJLpIIJbtIIpTsIolQsoskQskukohCl94cCxZwiN0zDuIXtQxEp4f/tpXrP2ZfpBAFQJ2F7zVXrs2gs/BFOvvjq4VShBoejnVkF0mEkl0kEUp2kUQo2UUSoWQXSYS5Vz4T/f+NzTYDe4AuoNPdTyr3/JOOH+ahks1ff+OYaJvYraTOiFzUAvFZ9+9OezwYv2lX/BZXz+4J16r84qGPRdu80H5YMD6r+fVom79d/vnotpop80999KJwOWs5sKz05fzJ36xtffYePuLub9RgPyKSI72NF0lEtcnuwM/NbI2ZLarFgEQkH9W+jT/d3bea2URgmZm94O6/6vmE7I/AIoBpkws9YU9EeqjqyO7uW7PvrcCPgXmB56g+u8gQMOBDrZmNAOrcfU/2eD7w1XJtdnY1cntb79ntcvXRYwUcyt1KKnau+027wvXZ/2lcuAY7QOuYtcF4I/GCD3Ob/hCMt3v8FkNHf25wZ8OtuTkY97mRlZIn46shUjsNRxwe3db5+y2V7auKcUwCfmylCygagHvd/WdV7E9EcjTgZHf3V4DjazgWEcmRlt5EEqFkF0mEkl0kEUp2kUQUepbL2/ubWLe394Uwu0fFl75i9dFjlVrKiV3UElteA5hYPyIYf+TtYdE2A6kIM9jqRo0Mxrd8JByf8mSeo5Fub31wUnTbsAqX3nRkF0mEkl0kEUp2kUQo2UUSoWQXSURVt6Wq1OwPNfm9D/eeXTy0viva5pRHrgzGv1umPnqsGEOs4MTsxnej+3qifWwwft7weJsOD7+eRotf9XfOBRdHtxXBVz07qP1LbZS7LZWO7CKJULKLJELJLpIIJbtIIpTsIolQsoskotALYdq9kY37JvaKHz58R7yRhZcGYxebQLw+eqxSS+yeceX6iS2vQXyJrVwbe+6V6LYiFLcAK4NFR3aRRCjZRRKhZBdJhJJdJBFKdpFE9Dkbb2Z3Ax8DWt39g1lsLPAAMB3YDHza3Xf3ta9m62BmU2uveLuHZ88B8HDllXK3eIpdCBOrj16uUkusn0aLz6wP5EIYn31kdFshdCHM+15/juzfA855T+xqYLm7zwSWZz+LyBDWZ7JnVVnffE/4AmBx9ngxcGGNxyUiNTbQz+yT3H1b9ng7pbpvIjKEVT1B56W7X0RPwDKzRWa22sxWt71Z5rO5iORqoMm+w8xaALLvvWfdMj3rs48Zq8l/kcEy0Ox7CFiQPV4ALKnNcEQkL33eg87M7gM+DIwHdgDXAT8BHgSmAVsoLb29dxKvl+ZpU73lqst7xWdetrLScQ/IS3eeHIwf/blVFe/LTj4uvi1yUUu55bWfLfl+xWOopbMPmzOo/UttlLsHXZ/r7O7+mcimj1Y1KhEplD5EiyRCyS6SCCW7SCKU7CKJULKLJELJLpIIJbtIIpTsIolQsoskQskukgglu0gilOwiiVCyiyRCyS6SCCW7SCKU7CKJULKLJELJLpIIJbtIIpTsIolQsoskQskukgglu0gi+kx2M7vbzFrNbH2P2PVmttXM1mZf5/a7R7feX0XxyJdIAgZanx3gW+4+J/t6tLbDEpFaG2h9dhE5wFTzmf1SM1uXvc0/JPakniWbu97aW0V3IlKNgSb7bcBRwBxgG/CN2BN7lmyuHzligN2JSLUGlOzuvsPdu9x9P3AnMK+2wxKRWhtQsptZS48fPwGsjz1XRIaGgdZn/zClt/AObAY+7+7b+upstI31U6x3pee64cPj/U9pCW/YtbvcoIPhrjd2hZ/e3BzdVd2okRXt60C19PW1wfhRD34hG
L/h3PvyHE6frl17fnRb++5hwfiG826NtvnFO2OC8S37JkTbPHxsdKqqZhpaDo1u69y2vVcsj/rsd/XVTkSGFp1BJ5IIJbtIIpTsIolQsoskos8JuiJ0nDIrum3WzRuC8cfvPTHaZn9jOH7YzU8E4z73mOi+tnwkPBs/5Ybwvg5UsVn3TZ++PRg/4atfzHM4fZq0tSu6rb59fzB+6ktXRNs0vB1elarriI9hHL+Nb6yRF66aHt0248res/Hl6Mgukgglu0gilOwiiVCyiyRCyS6SiD7Pja+l2LnxMvgWvvT7YPzGb342GH/62tvyHE6f1rTvi27b3jU6GJ/eEL+e4timgyoew9yvfSkYn/jtyldqOs8Mry41PLamov2UOzdeR3aRRCjZRRKhZBdJhJJdJBFKdpFEKNlFEjEkLoQRKUJd4uV/dGQXSYSSXSQRSnaRRCjZRRKhZBdJRH/qs081sxVm9pyZbTCzy7P4WDNbZmYbs+/53zFfpB+6vC74lbr+/AY6gX9w99nAqcAlZjYbuBpY7u4zgeXZzyIyRPWnPvs2d386e7wHeB6YDFwALM6ethi4MK9Bikj1KjqpxsymA3OBlcCkHvXdtgOTIm0WAYsAhhGv6SYi+er3BxkzGwn8ELjC3f/Uc5uX7oARPD2pZ332RuIFFEUkX/1KdjNrpJTo97j7j7Lwju7Szdn31nyGKCK10J/ZeKNUtfV5d/9mj00PAQuyxwuAJbUfnojUSn8+s/8lcDHwrJl1F/G+BrgReNDMFgJbgE/nM0QRqYX+1Gf/NRC8gR2gu0eKHCB0poFIIpTsIolQsoskQskukgglu0gilOwiiVCyiyRCyS6SCCW7SCKU7CKJUJEIAeDatecH45O2dgXj5eqjF+HE5qYy294Oxs/f+Mlom2umPhKMT6h/J9rmj/Pag/GJ3442iWqbEX494x+rfF8xOrKLJELJLpIIJbtIIpTsIolQsoskQskukggtvQkA7buHBeP17fuD8e1doyvuo5ZVWWLLa+Vs2xMf89bOygsaNQzrqLhNzP7G2M2gakdHdpFEKNlFEqFkF0mEkl0kEUp2kUT0ORtvZlOB/6ZUuNGBO9z9FjO7HvgcsDN76jXu/mheA5V8bTjv1mD81JeuCManN+yO7qsuXPavpspd1BKbdV91woPRNi917A3Gd3YdFG0z85//GIx3RlvEtTzwYjAevgxpYPqz9NZdn/1pMxsFrDGzZdm2b7n7v9VwPCKSk/5UhNkGbMse7zGz7vrsInIAqegz+3vqswNcambrzOxuMwuelWBmi8xstZmt7iB8/a+I5K+a+uy3AUcBcygd+b8Raqf67CJDw4Drs7v7Dnfvcvf9wJ3AvPyGKSLVGnB9djNr6fG0TwDraz88EamVauqzf8bM5lBajtsMfD6XEUohfvHOmGC84e3wMtqxTfElqSLE7hkH8YtaYstrAEc3jgjGj2oIXwgEsH3+YcH4+P/aEm0Ts++4w4Px+hW7Kt5XTDX12bWmLnIA0Rl0IolQsoskQskukgglu0gidFsqAWDLvgnBeF3t7rxUU+UqtcSUu6glNuteb/Hj4b7RtbuVVOdB9eH+a9aDjuwiyVCyiyRCyS6SCCW7SCKU7CKJULKLJEJLbwLAw8eGLx4Zx2+D8blNX8pzOH3647z4jVBilVpi94yD+EUt5ZbXnr3y28H4kccuDMa9PX5sXXL2fwbj/zj91Gib7Vf+Ra9Yxz1PRp+vI7tIIpTsIolQsoskQskukgglu0gizD3/6h3dRttYP8U+Wlh/Inna+L0Tg/FX5t9V8b4W/uH0YPzxFcdF29x30S29Yn/38e08v649uISgI7tIIpTsIolQsoskQskukgglu0gi+lOffRjwK6A5e/4P3P06MzsCuB8YB6wBLnb3fXkOVmQoKXeue6V2tYeLVHQOj6+WjanrnW4NxIta9Ge07cCZ7n48pSKO55jZqcBNlOqzzwB2A+Gz/0VkSOgz2b3krezHx
uzLgTOBH2TxxcCFuYxQRGqiv1Vc67M6b63AMmAT0ObundlTXgMmR9qqPrvIENCvZM9KM88BplAqzTyrvx2oPrvI0FDRDIO7twErgNOAMWbWPcE3Bdha47GJSA31pz77BDMbkz0+CDgLeJ5S0n8qe9oCYElegxSR6vXntlQtwGIzq6f0x+FBd3/YzJ4D7jezrwG/Ayo/+18OWJ1nhi8CKUrbjKbotv2N4VtJtTzwYrRNrD56rFILxG8ltfAPfx2Mx5bXAH4yc2kwfvZfzYm2md/15V6x13f/e/T5/anPvg6YG4i/Qunzu4gcAHQGnUgilOwiiVCyiyRCyS6SiCFRJKL+mBnRbZsuDtcNb/lNZzAORP+ENT+yKhhvOCI8Ewvw1gcnBePD/vepeP8HoIaWQ4PxF66aHozPuDJejKAI4x+rvE1XmW31K3aF42XaxAo4/P6G8K2kyl3UEpt1X/r62nib56f0iu0aHr8WTUd2kUQo2UUSoWQXSYSSXSQRSnaRRCjZRRKhijAiAxSqjw5w92Xhi1FC94zrNv+HvS9qAZhx/GvRNks/8HCv2LyzX2X1M++qIoxIypTsIolQsoskQskukgglu0giCp2NN7OdwJbsx/HAG4V13pv6V//vx/4Pd/fg1WOFJvufdWy22t1PGpTO1b/6T7B/vY0XSYSSXSQRg5nsdwxi3+pf/SfX/6B9ZheRYultvEgiBiXZzewcM3vRzF42s6sHof/NZvasma01s9UF9He3mbWa2foesbFmtszMNmbfDym4/+vNbGv2O1hrZufm1PdUM1thZs+Z2QYzuzyLF/L6y/Rf1OsfZmZPmdkzWf9fyeJHmNnKLAceMLN4iZtacfdCvyjdw28TcCTQBDwDzC54DJuB8QX2dwZwArC+R+xm4Ors8dXATQX3fz3w5QJeewtwQvZ4FPASMLuo11+m/6JevwEjs8eNwErgVOBB4KIsfjvwxbzHMhhH9nnAy+7+irvvA+4HLhiEcRTG3X8FvPme8AXA4uzxYuDCgvsvhLtvc/ens8d7KBUFnUxBr79M/4XwkreyHxuzLwfOBH6QxXP99+82GMk+GXi1x8+vUeAvP+PAz81sjZktKrjvbpPcfVv2eDsQvmd1vi41s3XZ2/zcPkZ0M7PplOoGrmQQXv97+oeCXr+Z1ZvZWqAVWEbpnW2bu3ffD72QHEh1gu50dz8B+BvgEjM7YzAH46X3ckUvi9wGHAXMAbYB38izMzMbCfwQuMLd/9RzWxGvP9B/Ya/f3bvcfQ4whdI721l59VXOYCT7VmBqj5+nZLHCuPvW7Hsr8GMGpxrtDjNrAci+txbZubvvyP4T7gfuJMffgZk1Ukq0e9z9R1m4sNcf6r/I19/N3duAFcBpwBgz6y7SUkgODEayrwJmZrORTcBFwENFdW5mI8xsVPdjYD6wvnyrXDwELMgeLwCWFNl5d6JlPkFOvwMzM+Au4Hl3/2aPTYW8/lj/Bb7+CWY2Jnt8EHAWpXmDFcCnsqcV8++f9wxgZIbyXEqzopuAfym47yMprQA8A2woon/gPkpvFTsofT5bCIwDlgMbgV8AYwvu//vAs8A6SonXklPfp1N6i74OWJt9nVvU6y/Tf1Gv/0PA77J+1gPX9vh/+BTwMvA/QHPe/w91Bp1IIlKdoBNJjpJdJBFKdpFEKNlFEqFkF0mEkl0kEUp2kUQo2UUS8X+wEGK/dPLORAAAAABJRU5ErkJggg==\n", 273 | "text/plain": [ 274 | "
" 275 | ] 276 | }, 277 | "metadata": { 278 | "tags": [], 279 | "needs_background": "light" 280 | } 281 | } 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "id": "JHnBnJacNLdl" 288 | }, 289 | "source": [ 290 | "## Command line utilities\n", 291 | "We can also use the command-line utilities **locale_alg** provided by the package to perform clustering and embedding. For example, to detect communities in Zachary Karate Club and output the result in labels.txt, run" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "colab": { 298 | "base_uri": "https://localhost:8080/" 299 | }, 300 | "id": "rSpww8unxSiV", 301 | "outputId": "b5a322e6-1977-4b94-dc87-8c573c7424bd" 302 | }, 303 | "source": [ 304 | "!locale_alg {data} --out labels.txt --max_outer=3" 305 | ], 306 | "execution_count": 7, 307 | "outputs": [ 308 | { 309 | "output_type": "stream", 310 | "text": [ 311 | "iter 1(1)\topt fval 0.39981914\tn_comm 5\n", 312 | "iter 1(1)\trnd fval 0.21186720\tn_comm 12\n", 313 | "iter 1(2)\topt fval 0.41978961\tn_comm 4\n", 314 | "iter 1(2)\trnd fval 0.36875412\tn_comm 5\n", 315 | "iter 1(3)\topt fval 0.41978961\tn_comm 4\n", 316 | "iter 1(3)\trnd fval 0.41978961\tn_comm 4\n", 317 | "iter 1(4)\topt fval 0.41978961\tn_comm 4\n", 318 | "iter 1(4)\trnd fval 0.41978961\tn_comm 4\n", 319 | "\n", 320 | "iter 2(1)\topt fval 0.41978955\tn_comm 4\n", 321 | "iter 2(1)\trnd fval 0.23134451\tn_comm 11\n", 322 | "iter 2(2)\topt fval 0.41978961\tn_comm 4\n", 323 | "iter 2(2)\trnd fval 0.41978961\tn_comm 4\n", 324 | "iter 2(3)\topt fval 0.41978961\tn_comm 4\n", 325 | "iter 2(3)\trnd fval 0.41978961\tn_comm 4\n", 326 | "\n", 327 | "iter 3(1)\topt fval 0.41978961\tn_comm 4\n", 328 | "iter 3(1)\trnd fval 0.25969756\tn_comm 10\n", 329 | "iter 3(2)\topt fval 0.41978958\tn_comm 4\n", 330 | "iter 3(2)\trnd fval 0.41978961\tn_comm 4\n", 331 | "iter 3(3)\topt fval 0.41978961\tn_comm 4\n", 332 | "iter 3(3)\trnd fval 0.41978961\tn_comm 4\n", 333 | 
"\n" 334 | ], 335 | "name": "stdout" 336 | } 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": { 342 | "id": "eYqQxvEAbzbq" 343 | }, 344 | "source": [ 345 | "The output shows the modularity (fval) and number of clusters (n_comm) for each hierarchical iterations. Note that opt means the modularity after relaxation, and rnd is the result after rounding. We can see that the final modularity is 0.4198, which is optimal.\n", 346 | "\n", 347 | "To obtain embeddings, run" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "metadata": { 353 | "colab": { 354 | "base_uri": "https://localhost:8080/" 355 | }, 356 | "id": "XwZMS2AvxU42", 357 | "outputId": "41d191bb-6e3f-4220-c8e3-28d571353813" 358 | }, 359 | "source": [ 360 | "!locale_alg {data} --out emb.txt --embedding --verbose=0 --max_inner=10 --k=2\n", 361 | "!cat emb.txt" 362 | ], 363 | "execution_count": 8, 364 | "outputs": [ 365 | { 366 | "output_type": "stream", 367 | "text": [ 368 | "2:0.8434904217720032\t7:0.5371441841125488\t\n", 369 | "2:0.9853476881980896\t8:0.1705576628446579\t\n", 370 | "8:0.7674331068992615\t2:0.6411290764808655\t\n", 371 | "2:0.9597187042236328\t8:0.2809625267982483\t\n", 372 | "7:0.9988347291946411\t6:0.04826069995760918\t\n", 373 | "7:0.9983335137367249\t6:0.05770694091916084\t\n", 374 | "7:0.9985255599021912\t6:0.05428372323513031\t\n", 375 | "2:0.9383329749107361\t8:0.34573298692703247\t\n", 376 | "9:0.955811619758606\t10:0.2939799726009369\t\n", 377 | "8:0.9194113612174988\t9:0.39329734444618225\t\n", 378 | "7:0.9986326098442078\t6:0.05227864533662796\t\n", 379 | "2:0.8082109093666077\t7:0.5888931751251221\t\n", 380 | "2:0.9845686554908752\t7:0.1749989092350006\t\n", 381 | "2:0.9362026453018188\t8:0.35146087408065796\t\n", 382 | "9:0.8369048833847046\t10:0.5473482608795166\t\n", 383 | "9:0.8372026085853577\t10:0.5468928813934326\t\n", 384 | "7:0.997382640838623\t6:0.07230319827795029\t\n", 385 | "2:0.9853029847145081\t7:0.17081592977046967\t\n", 386 | 
"9:0.8374990820884705\t10:0.5464388132095337\t\n", 387 | "2:0.999264121055603\t7:0.03835659846663475\t\n", 388 | "9:0.8377942442893982\t10:0.5459861755371094\t\n", 389 | "2:0.9853131175041199\t7:0.17075775563716888\t\n", 390 | "9:0.8380880355834961\t10:0.5455349087715149\t\n", 391 | "24:0.7275087833404541\t10:0.6860983967781067\t\n", 392 | "24:0.8944594860076904\t32:0.44714900851249695\t\n", 393 | "24:0.8783431649208069\t32:0.47803062200546265\t\n", 394 | "10:0.7863637208938599\t9:0.617763876914978\t\n", 395 | "24:0.9432942867279053\t10:0.3319576382637024\t\n", 396 | "32:0.8080738186836243\t8:0.5890812277793884\t\n", 397 | "10:0.8409537076950073\t9:0.5411071181297302\t\n", 398 | "9:0.949730634689331\t10:0.31306809186935425\t\n", 399 | "32:0.7668662071228027\t24:0.641806960105896\t\n", 400 | "9:0.8350643515586853\t10:0.5501522421836853\t\n", 401 | "9:0.8096125721931458\t10:0.5869647264480591\t\n" 402 | ], 403 | "name": "stdout" 404 | } 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": { 410 | "id": "lqIll033cmvf" 411 | }, 412 | "source": [ 413 | "That's all! For more details, please refer to our [paper](https://arxiv.org/abs/2012.02676) and [github repo](https://github.com/locuslab/sdp_clustering)." 
414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "metadata": { 419 | "id": "dQlivnZ9c04z" 420 | }, 421 | "source": [ 422 | "" 423 | ], 424 | "execution_count": null, 425 | "outputs": [] 426 | } 427 | ] 428 | } -------------------------------------------------------------------------------- /sdp_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import locale_embedding, leiden_locale, init_random_seed 2 | 3 | __all__ = ['locale_embedding', 'leiden_locale', 'init_random_seed'] 4 | -------------------------------------------------------------------------------- /sdp_clustering/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import csr_matrix 3 | import sdp_clustering._cpp as _cpp 4 | 5 | class SparseMat(object): 6 | def __init__(self, indptr, indices, data): 7 | self.indptr = indptr 8 | self.indices = indices 9 | self.data = data 10 | 11 | def copy(self): 12 | return SparseMat(self.indptr.copy(), self.indices.copy(), self.data.copy()) 13 | 14 | @classmethod 15 | def from_scipy(cls, A): 16 | if not A is csr_matrix: 17 | A = A.tocsr() 18 | indptr = A.indptr 19 | indices = A.indices 20 | data = np.asarray(A.data, dtype=np.float32) 21 | 22 | return cls(indptr, indices, data) 23 | 24 | def to_scipy(self): 25 | return csr_matrix((self.data, self.indices, self.indptr)) 26 | 27 | @classmethod 28 | def zeros(cls, n, k): 29 | indptr = np.arange(0, n*k+1, k, dtype=np.int32) 30 | indices = np.zeros(n*k, dtype=np.int32) 31 | data = np.zeros(n*k, dtype=np.float32) 32 | 33 | return cls(indptr, indices, data) 34 | 35 | @classmethod 36 | def zeros_like(cls, mat, n=None): 37 | if n is None: n = mat.indptr.shape[0]-1 38 | indptr = np.zeros(n+1, dtype=np.int32) 39 | indices = np.zeros_like(mat.indices, dtype=np.int32) 40 | data = np.zeros_like(mat.data) 41 | 42 | return cls(indptr, indices, data) 43 | 44 | def 
__str__(self): 45 | n = self.indptr.shape[0]-1 46 | s = [] 47 | for i in range(n): 48 | s.append(f'({i+1}) ') 49 | for p in range(self.indptr[i], self.indptr[i+1]): 50 | if self.data[p].item() == 0.0: continue 51 | s.append(f'{self.indices[p].item()+1}:{self.data[p].item():1.2f}\t') 52 | s.append('\n') 53 | return ''.join(s) 54 | 55 | def savetxt(self, fname): 56 | f = open(fname, 'w') 57 | n = self.indptr.shape[0]-1 58 | for i in range(n): 59 | for p in range(self.indptr[i], self.indptr[i+1]): 60 | idx, val = self.indices[p].item()+1, self.data[p].item() 61 | if val == 0.0: continue 62 | f.write(f'{idx}:{val}\t') 63 | f.write('\n') 64 | f.close() 65 | 66 | def init_random_seed(seed): 67 | _cpp.init_random_seed(seed) 68 | 69 | def solve_locale(A, Adiag, k, comm=None, n_comm=None, max_iter=100, eps=1e-3, shrink=0, comm_init=0, rnd_card=1, verbose=False): 70 | n = A.indptr.shape[0]-1 71 | if k>n: k=n 72 | k_ = max(10, k) # preallocate for increased rank 73 | 74 | V = SparseMat.zeros(n,k) 75 | buf = np.zeros(n*k_*2) 76 | d, s, g = np.zeros(n), np.zeros(n*k_), np.zeros(n*k_) 77 | 78 | queue = np.zeros(n, dtype=np.int32) 79 | is_in = np.zeros(n, dtype=np.int32) 80 | 81 | if comm is None: comm = np.zeros(n, dtype=np.int32) 82 | else: comm = comm.copy() 83 | if n_comm is None: n_comm = np.zeros(1, dtype=np.int32) 84 | else: n_comm = np.array([n_comm], dtype=np.int32) 85 | 86 | fval = _cpp.solve_locale( 87 | max_iter, eps, 88 | A.indptr, A.indices, A.data, Adiag, 89 | V.indptr, V.indices, V.data, 90 | buf, s, d, g, 91 | queue, is_in, 92 | comm, n_comm, 93 | shrink, comm_init, rnd_card, verbose) 94 | 95 | return fval, comm, n_comm.item(), V 96 | 97 | def aggregate_clusters(A, Adiag, comm, n_comm): 98 | G = SparseMat.zeros_like(A, n_comm) 99 | Gdiag = np.zeros(n_comm) 100 | 101 | #print(A.indptr.shape, A.indices.shape, A.data.shape, Adiag.shape) 102 | #print(G.indptr.shape, G.indices.shape, G.data.shape, Gdiag.shape) 103 | #print(comm.shape) 104 | 105 | 
_cpp.aggregate_clusters( 106 | A.indptr, A.indices, A.data, Adiag, 107 | G.indptr, G.indices, G.data, Gdiag, 108 | comm) 109 | 110 | return G, Gdiag 111 | 112 | def merge_clusters(comm, comm_next, new_comm): 113 | #for i, ic in enumerate(new_comm.tolist()): 114 | # comm_next[ic] = comm[i] 115 | _cpp.merge(comm, comm_next, new_comm) 116 | 117 | def split_clusters(comm, comm_next): 118 | #for i, ic in enumerate(comm.tolist()): 119 | # comm[i] = comm_next[ic] 120 | _cpp.split(comm, comm_next) 121 | 122 | def locale_embedding(A, k=8, eps=1e-6, max_inner=10, verbose=False): 123 | A = SparseMat.from_scipy(A) 124 | n = len(A.indptr)-1 125 | Adiag = np.zeros(n) 126 | fval, _, _, V = solve_locale(A, Adiag, k, comm=None, eps=eps, max_iter=max_inner, comm_init=False, rnd_card=0, verbose=verbose) 127 | return V 128 | 129 | def leiden_locale(A, k=8, eps=1e-6, max_outer=10, max_lv=10, max_inner=2, verbose=0): 130 | A = SparseMat.from_scipy(A) 131 | n = len(A.indptr)-1 132 | Adiag = np.zeros(n) 133 | comm_init = None 134 | for it in range(max_outer): 135 | comms = [] 136 | G, Gdiag = A.copy(), Adiag.copy() 137 | for lv in range(max_lv): 138 | # LocaleEmbedding and LocaleRounding 139 | fval, comm, n_comm, V = solve_locale(G, Gdiag, k, comm=comm_init, eps=eps, max_iter=max_inner, comm_init=comm_init is not None, verbose=verbose) 140 | if verbose: print(f'iter {it+1}({lv+1})\topt fval {fval:.8f}\tn_comm {n_comm}') 141 | if 1: # LeidenRefine 142 | fval, new_comm, new_n_comm, V = solve_locale(G, Gdiag, k, comm=comm, n_comm=n_comm, eps=1e-4, max_iter=1, shrink=1, verbose=verbose) 143 | if verbose: print(f'iter {it+1}({lv+1})\trnd fval {fval:.8f}\tn_comm {new_n_comm}') 144 | else: # If k=1, this branch equals the Louvain algorithm 145 | new_comm, new_n_comm = comm.copy(), n_comm 146 | 147 | if new_n_comm == len(comm): break 148 | 149 | comm_init = np.zeros(new_n_comm, dtype=np.int32) 150 | merge_clusters(comm, comm_init, new_comm) 151 | 152 | # Aggregrate 153 | 
comms.append(new_comm.copy()) 154 | G, Gdiag = aggregate_clusters(G, Gdiag, new_comm, new_n_comm) 155 | 156 | for lv in reversed(range(len(comms)-1)): 157 | split_clusters(comms[lv], comms[lv+1]) 158 | comm_init = comms[0].copy() 159 | if verbose: print() 160 | 161 | return comm_init 162 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | DIR = os.path.abspath(os.path.dirname(__file__)) 4 | sys.path.append(os.path.join(DIR, "third_party", "pybind11")) 5 | print(sys.path) 6 | 7 | from glob import glob 8 | from setuptools import setup 9 | import pybind11 10 | from pybind11.setup_helpers import Pybind11Extension, build_ext # noqa: E402 11 | 12 | del sys.path[-1] 13 | 14 | pkg_name = 'sdp_clustering' 15 | ext_name = '_cpp' 16 | __version__ = "0.0.3" 17 | 18 | ext_modules = [ 19 | Pybind11Extension(pkg_name+'.'+ext_name, 20 | include_dirs = ['./src'], 21 | sources = sorted(glob('src/*.c*')), 22 | define_macros = [('EXTENSION_NAME', ext_name)], 23 | extra_compile_args = ['-O3', '-Wall', '-g'], 24 | ), 25 | ] 26 | 27 | setup( 28 | name=pkg_name, 29 | version=__version__, 30 | install_requires=['numpy', 'scipy'], 31 | author="Po-Wei Wang", 32 | author_email="poweiw@cs.cmu.edu", 33 | url="https://github.com/locuslab/sdp_clustering", 34 | description="SDP-based clustering by maximum modularity", 35 | long_description="", 36 | scripts=['bin/locale_alg'], 37 | ext_modules=ext_modules, 38 | extras_require={"test": "pytest"}, 39 | cmdclass={"build_ext": build_ext}, 40 | packages=[pkg_name], 41 | zip_safe=False, 42 | classifiers=[ 43 | "License :: OSI Approved :: MIT License", 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /src/cluster.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 
| #include 6 | #include 7 | #include 8 | 9 | #include 10 | #ifndef __unix__ 11 | #include 12 | #endif 13 | 14 | #include "cluster.h" 15 | 16 | #define MEPS 1e-20 17 | 18 | inline int min(int x, int y) { return (x<=y)?x:y; } 19 | 20 | #define NS_PER_SEC 1000000000 21 | int64_t wall_clock_ns() 22 | { 23 | #ifdef __unix__ 24 | struct timespec tspec; 25 | int r = clock_gettime(CLOCK_MONOTONIC, &tspec); 26 | assert(r==0); 27 | return tspec.tv_sec*NS_PER_SEC + tspec.tv_nsec; 28 | #else 29 | struct timeval tv; 30 | int r = gettimeofday( &tv, NULL ); 31 | assert(r==0); 32 | return tv.tv_sec*NS_PER_SEC + tv.tv_usec*1000; 33 | #endif 34 | } 35 | 36 | double wall_time_diff(int64_t ed, int64_t st) 37 | { 38 | return (double)(ed-st)/(double)NS_PER_SEC; 39 | } 40 | 41 | // ------------- Sparse BLAS-like utils -------------------- 42 | 43 | // perform y += a * xi for sparse matrix X 44 | void axipy(float *__restrict__ y, const float a, SparseMat X, int i) 45 | { 46 | int p = X.indptr[i]; 47 | const int ed = X.indptr[i+1]; 48 | const int *__restrict__ indices = X.indices; 49 | const float *__restrict__ data = X.data; 50 | 51 | for (; pval, vy = ((SparsePair*)py)->val; 121 | 122 | if (vx < vy) return 1; 123 | else if (vx > vy) return -1; 124 | else return 0; 125 | } 126 | #else 127 | bool val_cmp(const SparsePair &x, const SparsePair &y) 128 | { 129 | return x.val > y.val; 130 | } 131 | #endif 132 | 133 | int idx_cmp(const void *px, const void *py) 134 | { 135 | int vx = ((SparsePair*)px)->idx, vy = ((SparsePair*)py)->idx; 136 | 137 | if (vx < vy) return -1; 138 | else if (vx > vy) return 1; 139 | else return 0; 140 | } 141 | 142 | // ------------- randomization utils ------------------- 143 | 144 | void randperm(int *perm, int k) 145 | { 146 | for (int i=0; iis_in[x]) return; 192 | Q->queue[Q->rear] = x; 193 | Q->rear = (Q->rear + 1) % Q->cap; 194 | Q->len++; 195 | Q->is_in[x] = 1; 196 | } 197 | 198 | int ring_pop(Ring *Q) 199 | { 200 | int x = Q->queue[Q->front]; 201 | Q->front = 
(Q->front + 1) % Q->cap; 202 | Q->len--; 203 | Q->is_in[x] = 0; 204 | 205 | return x; 206 | } 207 | 208 | void ring_reset(Ring *Q) 209 | { 210 | //for (int i=0; icap; i++) Q->queue[i] = i; 211 | randperm(Q->queue, Q->cap); 212 | Q->len = Q->cap; 213 | Q->front = Q->rear = 0; 214 | for (int i=0; icap; i++) Q->is_in[i] = 1; 215 | } 216 | 217 | /* -------------- main algorihtm ------------------------*/ 218 | 219 | float solve_locale(int max_iter, float eps, 220 | int n, SparseMat A, float *Adiag, SparseMat V, 221 | SparsePair *buf, float *__restrict__ s, float *__restrict__ d, float *__restrict__ g, 222 | Ring *Q, int *comm, int *n_comm, int shrink, int comm_init, int rnd_card, int verbose) 223 | { 224 | int64_t time_st = wall_clock_ns(); 225 | if (verbose>1) fprintf(stderr, "n_comm %d\n", *n_comm); 226 | double fval = 0; 227 | double m = 0; 228 | for (int i=0; i1) fprintf(stderr, "k = %d m %lf\n", k, m); 263 | if (verbose>1) fprintf(stderr, "inner 0 fval %f time %.4e\n", fval, wall_time_diff(time_now, time_st)); 264 | 265 | int iter=0, first=0, nvisited=0; 266 | double delta = 0; 267 | int rank = n; 268 | 269 | ring_reset(Q); 270 | while (1) { 271 | if (nvisited >= n || Q->len == 0) { 272 | iter ++; 273 | fval += delta/(2*m); 274 | time_now = wall_clock_ns(); 275 | if (verbose>1) fprintf(stderr, "inner %d fval %.8e delta %.2e %s %s qlen %d time %.4e\n", iter, fval, delta/(2*m), shrink?"shrink":"", first?"first":"", Q->len, wall_time_diff(time_now, time_st)); 276 | double scaled_delta = fabs(delta/(2*m)); 277 | if ((!shrink && (scaled_delta < eps || iter>=max_iter)) || (shrink && (scaled_delta < eps || Q->len==0 || iter>=15))) { 278 | if (shrink) break; 279 | if (rnd_card == 0) return fval; 280 | shrink ^= 1; 281 | first = shrink; 282 | ring_reset(Q); 283 | } else { 284 | first = 0; 285 | } 286 | 287 | delta = 0, nvisited = 0; 288 | } 289 | 290 | int i = ring_pop(Q); 291 | int ic = comm[i]; 292 | nvisited++; 293 | 294 | if (A.indptr[i]==A.indptr[i+1]) continue; 295 | 
if (*n_comm) { // avoid singleton 296 | int nonsingleton = 0; 297 | for (int p=A.indptr[i]; pk)? k : nbuf; 329 | std::partial_sort(buf, buf+mid, buf+nbuf, val_cmp); 330 | int npos = nbuf; 331 | for (int q=nbuf-1; q>=0; q--) 332 | if (buf[q].val <= 0) npos--; 333 | 334 | float gv, gnrm; 335 | if (npos == 0) { // gv=0 at best 336 | if (old_gv > -MEPS) { // vi'g = 0 -> g[indices of vi] = 0 337 | buf[0] = {V.indices[V.indptr[i]], V.data[V.indptr[i]]}; 338 | } else if (buf[0].val == 0) { // g0 = 0 339 | buf[0].val = 1; 340 | } else { // increase rank 341 | buf[0] = {rank++, 1}; 342 | if(rank > n*10) {fprintf(stderr, "rank explode\n"); exit(0);} 343 | } 344 | gnrm = 1, gv = 0, npos = 1; 345 | } else { // update 346 | //if (shrink) randpick(buf, npos, g, first?-100:old_gv); 347 | npos = min(npos, V.indptr[i+1]-V.indptr[i]); 348 | if (shrink) npos = min(npos, rnd_card); 349 | gv = gnrm = snrm2_pairs(buf, npos); 350 | } 351 | 352 | if ( (gv - old_gv <= MEPS) && !first ) { // if not increasing, continue 353 | axipy(s, d[i], V, i); 354 | continue; 355 | } 356 | 357 | if (gnrm <= MEPS) { 358 | fprintf(stderr, "nbuf %d npos %d v0 %e v1 %e buf0 %g %d\n", nbuf, npos, V.data[V.indptr[i]], V.data[V.indptr[i]+1], buf[0].val, buf[0].val<0); 359 | fprintf(stderr, "i=%d gnrm %e gv %e old_gv %e gv-old_gv %e first %d shrink %d\n", i, gnrm, gv, old_gv, gv-old_gv, first, shrink); 360 | exit(0); 361 | } 362 | // copy vector from buf 363 | for (int p=V.indptr[i], q=0; p= npos) buf[q] = {0, 0}; 365 | V.indices[p] = buf[q].idx; 366 | V.data[p] = buf[q].val / gnrm; 367 | } 368 | axipy(s, d[i], V, i); 369 | 370 | delta += (gv - old_gv)*2; 371 | 372 | /* Update one block in Locale: End */ 373 | 374 | // push neighbors to queue 375 | for (int p=A.indptr[i]; p=0; i--) { 424 | G.indptr[i+1] = G.indptr[i]; 425 | } 426 | G.indptr[0] = 0; 427 | 428 | #if 0 429 | double mA = 0; 430 | for (int i=0; i 2 | #include 3 | 4 | #include "cluster.h" 5 | 6 | namespace py = pybind11; 7 | 8 | using arr = py::array; 
9 | float *fptr(arr& a) { return (float*) a.mutable_data(); } 10 | int *iptr(arr& a) { return (int*) a.mutable_data(); } 11 | 12 | static int has_inited = 0; 13 | 14 | void init_random_seed(long int seed) 15 | { 16 | has_inited = 1; 17 | srand48(seed); 18 | srandom(seed); 19 | } 20 | 21 | SparseMat SparseMat_init(arr indptr, arr indices, arr data) 22 | { 23 | SparseMat X; 24 | X.indptr = iptr(indptr); 25 | X.indices = iptr(indices); 26 | X.data = fptr(data); 27 | return X; 28 | } 29 | 30 | float py_solve_locale(int max_iter, float eps, 31 | arr Aindptr, arr Aindices, arr Adata, arr Adiag, 32 | arr Vindptr, arr Vindices, arr Vdata, 33 | arr buf, arr s, arr d, arr g, 34 | arr queue, arr is_in, 35 | arr comm, arr n_comm, 36 | int shrink, int comm_init, 37 | int rnd_card, int verbose) 38 | { 39 | if (!has_inited) init_random_seed((long int) time(NULL)); 40 | SparseMat A = SparseMat_init(Aindptr, Aindices, Adata); 41 | SparseMat V = SparseMat_init(Vindptr, Vindices, Vdata); 42 | 43 | int n = Aindptr.shape(0)-1; 44 | Ring Q = {0, 0, 0, n, iptr(queue), iptr(is_in)}; 45 | 46 | float fval = solve_locale(max_iter, eps, 47 | n, A, fptr(Adiag), V, 48 | (SparsePair*)buf.mutable_data(), fptr(s), fptr(d), fptr(g), 49 | &Q, 50 | iptr(comm), iptr(n_comm), 51 | shrink, comm_init, 52 | rnd_card, verbose); 53 | return fval; 54 | } 55 | 56 | void py_aggregate_clusters( 57 | arr Aindptr, arr Aindices, arr Adata, arr Adiag, 58 | arr Gindptr, arr Gindices, arr Gdata, arr Gdiag, 59 | arr comm) 60 | { 61 | SparseMat A = SparseMat_init(Aindptr, Aindices, Adata); 62 | SparseMat G = SparseMat_init(Gindptr, Gindices, Gdata); 63 | 64 | int nA = Aindptr.shape(0)-1; 65 | int nG = Gindptr.shape(0)-1; 66 | aggregate_clusters( 67 | nA, A, fptr(Adiag), 68 | nG, G, fptr(Gdiag), 69 | iptr(comm)); 70 | } 71 | 72 | void py_merge(arr comm, arr comm_next, arr new_comm) 73 | { 74 | int n = comm.shape(0); 75 | merge(n, iptr(comm), iptr(comm_next), iptr(new_comm)); 76 | } 77 | 78 | void py_split(arr comm, arr 
comm_next) 79 | { 80 | int n = comm.shape(0); 81 | split(n, iptr(comm), iptr(comm_next)); 82 | } 83 | 84 | PYBIND11_MODULE(EXTENSION_NAME, m) { 85 | m.def("init_random_seed", &init_random_seed, "Init random seed"); 86 | m.def("solve_locale", &py_solve_locale, "Solve locale optimization"); 87 | m.def("aggregate_clusters", &py_aggregate_clusters, "Form hypergraph"); 88 | m.def("merge", &py_merge, "Merge (cpu)"); 89 | m.def("split", &py_split, "Split (cpu)"); 90 | } 91 | --------------------------------------------------------------------------------