├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── readme.md ├── readthedocs.yml ├── source │ ├── markov_clustering.rst │ └── modules.rst └── static ├── markov_clustering ├── __init__.py ├── drawing.py ├── mcl.py ├── modularity.py └── utils.py ├── requirements.txt ├── setup.py ├── static ├── example.png ├── example_best.png └── example_coarse.png └── tests ├── test_mc.py └── test_modularity.py /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | markov_clustering.egg-info 3 | *__pycache__ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Guy Allard 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include tests/*.py 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Markov Clustering 2 | [![Documentation Status](https://readthedocs.org/projects/markov-clustering/badge/)](http://markov-clustering.readthedocs.io/en/latest/) 3 | 4 | This module implements the MCL algorithm in python. 5 | 6 | The MCL algorithm was developed by Stijn van Dongen at the University of Utrecht. 7 | 8 | Details of the algorithm can be found on the [MCL homepage](https://micans.org/mcl/). 9 | 10 | 11 | ## Features 12 | 13 | - Sparse matrix support 14 | - Pruning 15 | 16 | ## Requirements 17 | 18 | - Core requirements 19 | - Python 3.x 20 | - numpy 21 | - scipy 22 | - scikit-learn 23 | 24 | - Optional (required for visualization) 25 | - networkx 26 | - matplotlib 27 | 28 | - To run the tests 29 | - pytest 30 | 31 | 32 | ## Installation 33 | 34 | The recommended installation method is via pip. 35 | 36 | To install with all requirements including support for visualization: 37 | ``` 38 | pip install markov_clustering[drawing] 39 | ``` 40 | 41 | To install with only support for the core MCL clustering: 42 | ``` 43 | pip install markov_clustering 44 | ``` 45 | 46 | 47 | ## Example 48 | 49 | ![example visualization](static/example.png) 50 | 51 | We will use NetworkX to generate the adjacency matrix for a random geometric graph which contains 200 nodes 52 | with random coordinates ranging from (-1,-1) to (1,1). Nodes are considered adjacent if the distance between 53 | them is <= 0.3 units. 
54 | 55 | This example assumes that the optional dependencies (matplotlib and networkx) have been installed 56 | 57 | ```python 58 | import markov_clustering as mc 59 | import networkx as nx 60 | import random 61 | 62 | # number of nodes to use 63 | numnodes = 200 64 | 65 | # generate random positions as a dictionary where the key is the node id and the value 66 | # is a tuple containing 2D coordinates 67 | positions = {i:(random.random() * 2 - 1, random.random() * 2 - 1) for i in range(numnodes)} 68 | 69 | # use networkx to generate the graph 70 | network = nx.random_geometric_graph(numnodes, 0.3, pos=positions) 71 | 72 | # then get the adjacency matrix (in sparse form) 73 | matrix = nx.to_scipy_sparse_matrix(network) 74 | ``` 75 | 76 | We can then run the MCL algorithm on the adjacency matrix and retrieve the clusters. 77 | ```python 78 | result = mc.run_mcl(matrix) # run MCL with default parameters 79 | clusters = mc.get_clusters(result) # get clusters 80 | ``` 81 | 82 | Finally, we can draw the results. The draw_graph function only requires the adjacency matrix and the 83 | cluster list, but we will pass some extra parameters such as the node positions, set the node size, 84 | disable labels and set the color for edges. 85 | ```python 86 | mc.draw_graph(matrix, clusters, pos=positions, node_size=50, with_labels=False, edge_color="silver") 87 | ``` 88 | This should result in an image similar to the one at the top of this section. 89 | 90 | 91 | If the clustering is too fine for your taste, reducing the MCL inflation parameter to 1.4 (from the default of 2) 92 | will result in coarser clustering. e.g. 
93 | ``` 94 | result = mc.run_mcl(matrix, inflation=1.4) 95 | clusters = mc.get_clusters(result) 96 | mc.draw_graph(matrix, clusters, pos=positions, node_size=50, with_labels=False, edge_color="silver") 97 | ``` 98 | ![coarse example](static/example_coarse.png) 99 | 100 | 101 | ## Choosing Hyperparameters 102 | 103 | Choosing appropriate values for hyperparameters (e.g. cluster inflation/expansion parameters) can be difficult. 104 | 105 | To assist with the evaluation of the clustering quality, we include an implementation of the modularity measure. 106 | Refer to 'Malliaros, Fragkiskos D., and Michalis Vazirgiannis. "Clustering and community detection in directed networks: A survey." Physics Reports 533.4 (2013): 95-142' 107 | for a detailed description. 108 | 109 | Briefly, the modularity (Q) can be considered to be the fraction of graph edges which belong to a cluster 110 | minus the fraction expected due to random chance, where the value of Q lies in the range [-1, 1]. High, positive 111 | Q values suggest higher clustering quality. 112 | 113 | We can use the modularity measure to optimize the clustering parameters. In the following example, 114 | we will determine the modularity for a range of cluster inflation values, allowing us to pick the best 115 | cluster inflation value for the given graph. 
116 | 117 | Continuing from the previous example: 118 | 119 | ```python 120 | # perform clustering using different inflation values from 1.5 to 2.5 121 | # for each clustering run, calculate the modularity 122 | for inflation in [i / 10 for i in range(15, 26)]: 123 | result = mc.run_mcl(matrix, inflation=inflation) 124 | clusters = mc.get_clusters(result) 125 | Q = mc.modularity(matrix=result, clusters=clusters) 126 | print("inflation:", inflation, "modularity:", Q) 127 | ``` 128 | 129 | ``` 130 | inflation: 1.5 modularity: 0.7256870762382928 131 | inflation: 1.6 modularity: 0.7432262129804642 132 | inflation: 1.7 modularity: 0.7859467455621318 133 | inflation: 1.8 modularity: 0.8030876061752096 134 | inflation: 1.9 modularity: 0.8194196576112109 135 | inflation: 2.0 modularity: 0.8262072262823568 136 | inflation: 2.1 modularity: 0.8339806510839622 137 | inflation: 2.2 modularity: 0.8307322929171664 138 | inflation: 2.3 modularity: 0.8272367770637663 139 | inflation: 2.4 modularity: 0.8274133182684847 140 | inflation: 2.5 modularity: 0.8279076336416934 141 | ``` 142 | 143 | From the output, we see that an inflation value of 2.1 gives the highest modularity score, 144 | so we will use that as our final cluster inflation parameter. 145 | 146 | ```python 147 | # cluster using the optimized cluster inflation value 148 | result = mc.run_mcl(matrix, inflation=2.1) 149 | clusters = mc.get_clusters(result) 150 | mc.draw_graph(matrix, clusters, pos=positions, node_size=50, with_labels=False, edge_color="silver") 151 | ``` 152 | ![best example](static/example_best.png) 153 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = MarkovClustering 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Markov Clustering documentation build configuration file, created by 5 | # sphinx-quickstart on Sat Sep 30 16:58:12 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../')) 23 | 24 | import matplotlib 25 | matplotlib.use('agg') 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 
36 | extensions = ['sphinx.ext.autodoc', 37 | 'sphinx.ext.viewcode'] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # parser for markdown 43 | from recommonmark.parser import CommonMarkParser 44 | source_parsers = { 45 | '.md': CommonMarkParser, 46 | } 47 | 48 | # The suffix(es) of source filenames. 49 | # You can specify multiple suffix as a list of string: 50 | # 51 | source_suffix = ['.rst', '.md'] 52 | 53 | 54 | # The master toctree document. 55 | master_doc = 'index' 56 | 57 | # General information about the project. 58 | project = 'Markov Clustering' 59 | copyright = '2017, Guy Allard' 60 | author = 'Guy Allard' 61 | 62 | # The version info for the project you're documenting, acts as replacement for 63 | # |version| and |release|, also used in various other places throughout the 64 | # built documents. 65 | # 66 | # The short X.Y version. 67 | version = '0.0.2' 68 | # The full version, including alpha/beta/rc tags. 69 | release = '0.0.2' 70 | 71 | # The language for content autogenerated by Sphinx. Refer to documentation 72 | # for a list of supported languages. 73 | # 74 | # This is also used if you do content translation via gettext catalogs. 75 | # Usually you set "language" from the command line for these cases. 76 | language = None 77 | 78 | # List of patterns, relative to source directory, that match files and 79 | # directories to ignore when looking for source files. 80 | # This patterns also effect to html_static_path and html_extra_path 81 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # If true, `todo` and `todoList` produce output, else they produce nothing. 87 | todo_include_todos = False 88 | 89 | 90 | # -- Options for HTML output ---------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 93 | # a list of builtin themes. 94 | # 95 | html_theme = 'sphinx_rtd_theme' 96 | 97 | # Theme options are theme-specific and customize the look and feel of a theme 98 | # further. For a list of options available for each theme, see the 99 | # documentation. 100 | # 101 | # html_theme_options = {} 102 | 103 | # Add any paths that contain custom static files (such as style sheets) here, 104 | # relative to this directory. They are copied after the builtin static files, 105 | # so a file named "default.css" will overwrite the builtin "default.css". 106 | html_static_path = ['_static'] 107 | 108 | # Custom sidebar templates, must be a dictionary that maps document names 109 | # to template names. 110 | # 111 | # This is required for the alabaster theme 112 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 113 | html_sidebars = { 114 | '**': [ 115 | 'about.html', 116 | 'navigation.html', 117 | 'relations.html', # needs 'show_related': True theme option to display 118 | 'searchbox.html', 119 | 'donate.html', 120 | ] 121 | } 122 | 123 | 124 | # -- Options for HTMLHelp output ------------------------------------------ 125 | 126 | # Output file base name for HTML help builder. 127 | htmlhelp_basename = 'MarkovClusteringdoc' 128 | 129 | 130 | # -- Options for LaTeX output --------------------------------------------- 131 | 132 | latex_elements = { 133 | # The paper size ('letterpaper' or 'a4paper'). 134 | # 135 | # 'papersize': 'letterpaper', 136 | 137 | # The font size ('10pt', '11pt' or '12pt'). 138 | # 139 | # 'pointsize': '10pt', 140 | 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | 145 | # Latex figure (float) alignment 146 | # 147 | # 'figure_align': 'htbp', 148 | } 149 | 150 | # Grouping the document tree into LaTeX files. List of tuples 151 | # (source start file, target name, title, 152 | # author, documentclass [howto, manual, or own class]). 
153 | latex_documents = [ 154 | (master_doc, 'MarkovClustering.tex', 'Markov Clustering Documentation', 155 | 'Guy Allard', 'manual'), 156 | ] 157 | 158 | 159 | # -- Options for manual page output --------------------------------------- 160 | 161 | # One entry per manual page. List of tuples 162 | # (source start file, name, description, authors, manual section). 163 | man_pages = [ 164 | (master_doc, 'markovclustering', 'Markov Clustering Documentation', 165 | [author], 1) 166 | ] 167 | 168 | 169 | # -- Options for Texinfo output ------------------------------------------- 170 | 171 | # Grouping the document tree into Texinfo files. List of tuples 172 | # (source start file, target name, title, author, 173 | # dir menu entry, description, category) 174 | texinfo_documents = [ 175 | (master_doc, 'MarkovClustering', 'Markov Clustering Documentation', 176 | author, 'MarkovClustering', 'One line description of project.', 177 | 'Miscellaneous'), 178 | ] 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Markov Clustering documentation master file, created by 2 | sphinx-quickstart on Sat Sep 30 18:06:08 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Markov Clustering for Python 7 | ============================ 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | readme 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/readme.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /docs/readthedocs.yml: -------------------------------------------------------------------------------- 1 | python: 2 | version: 3 3 | extra_requirements: 4 | - drawing 5 | -------------------------------------------------------------------------------- /docs/source/markov_clustering.rst: -------------------------------------------------------------------------------- 1 | markov\_clustering package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | markov\_clustering\.drawing module 8 | ---------------------------------- 9 | 10 | .. automodule:: markov_clustering.drawing 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | markov\_clustering\.mcl module 17 | ------------------------------ 18 | 19 | .. automodule:: markov_clustering.mcl 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | 25 | markov\_clustering\.modularity module 26 | ------------------------------------- 27 | 28 | .. automodule:: markov_clustering.modularity 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | 34 | markov\_clustering\.utils module 35 | -------------------------------- 36 | 37 | .. automodule:: markov_clustering.utils 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | 43 | Module contents 44 | --------------- 45 | 46 | .. 
automodule:: markov_clustering 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | markov_clustering 2 | ================= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | markov_clustering 8 | -------------------------------------------------------------------------------- /docs/static: -------------------------------------------------------------------------------- 1 | ../static -------------------------------------------------------------------------------- /markov_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from .mcl import * 3 | from .modularity import * 4 | 5 | try: 6 | from .drawing import * 7 | except ImportError: 8 | sys.stderr.write("Visualization not supported to missing libraries.\n") 9 | 10 | __version_info__ = ("0", "0", "6", "dev") 11 | __date__ = "13 Dec 2018" 12 | 13 | __version__ = ".".join(__version_info__) 14 | __author__ = "Guy Allard" 15 | __contributors__ = "Jona Harris, Mounir Mallek" 16 | __contact__ = "guyallard01@gmail.com" 17 | __homepage__ = "https://github.com/guyallard/markov_clustering.git" 18 | -------------------------------------------------------------------------------- /markov_clustering/drawing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization of clusters 3 | """ 4 | import sys 5 | 6 | try: 7 | import networkx as nx 8 | except ImportError: 9 | sys.stderr.write("Networkx not present\n") 10 | raise 11 | 12 | try: 13 | from matplotlib.pylab import show, cm, axis 14 | except ImportError: 15 | sys.stderr.write("Matplotlib not present\n") 16 | raise 17 | 18 | 19 | def draw_graph(matrix, clusters, **kwargs): 20 | """ 21 | Visualize the clustering 22 | 23 | :param matrix: The unprocessed adjacency 
def sparse_allclose(a, b, rtol=1e-5, atol=1e-8):
    """
    Counterpart of np.allclose that also accepts sparse matrices

    :param a: The matrix to compare with b
    :param b: The reference matrix; the relative tolerance is scaled
              by the magnitude of its elements
    :param rtol: Relative tolerance
    :param atol: Absolute tolerance
    :returns: True if the largest element of |a - b| - rtol * |b|
              does not exceed atol
    """
    difference = np.abs(a - b)
    allowance = rtol * np.abs(b)
    # noinspection PyUnresolvedReferences
    worst_excess = (difference - allowance).max()
    return worst_excess <= atol
def add_self_loops(matrix, loop_value):
    """
    Add self-loops to the matrix by setting the diagonal
    to loop_value

    The input matrix is never modified; the self-loops are written
    to a copy.

    :param matrix: The matrix to add loops to (must be square)
    :param loop_value: Value to use for self-loops
    :returns: The matrix with self-loops. Sparse input is returned in
              CSC format, dense input is returned as a copy of the input
    :raises AssertionError: if the matrix is not square
    """
    shape = matrix.shape
    assert shape[0] == shape[1], "Error, matrix is not square"

    if isspmatrix(matrix):
        # copy=True is required: todok() on a matrix that is already in
        # DOK format hands back the very same object, so without the copy
        # the caller's matrix would be modified in place
        new_matrix = matrix.todok(copy=True)
        new_matrix.setdiag(loop_value)
        return new_matrix.tocsc()

    new_matrix = matrix.copy()
    np.fill_diagonal(new_matrix, loop_value)
    return new_matrix
def converged(matrix1, matrix2):
    """
    Decide whether the algorithm has converged by testing if two
    matrices are approximately equal.

    The sparse-aware comparison is used as soon as either matrix is
    sparse; otherwise np.allclose is used.

    :param matrix1: The matrix to compare with matrix2
    :param matrix2: The matrix to compare with matrix1
    :returns: True if matrix1 and matrix2 approximately equal
    """
    any_sparse = isspmatrix(matrix1) or isspmatrix(matrix2)
    compare = sparse_allclose if any_sparse else np.allclose
    return compare(matrix1, matrix2)
def run_mcl(matrix, expansion=2, inflation=2, loop_value=1,
            iterations=100, pruning_threshold=0.001, pruning_frequency=1,
            convergence_check_frequency=1, verbose=False):
    """
    Perform MCL on the given similarity matrix

    The matrix first receives self-loops (when loop_value > 0) and is
    normalized. Each iteration then applies expansion and inflation,
    optionally prunes, and optionally checks for convergence against the
    matrix as it was at the start of that same iteration.

    :param matrix: The similarity matrix to cluster
    :param expansion: The cluster expansion factor (must be > 1)
    :param inflation: The cluster inflation factor (must be > 1)
    :param loop_value: Initialization value for self-loops (must be >= 0).
                       A value of 0 skips the self-loop initialization
    :param iterations: Maximum number of iterations
        (actual number of iterations will be less if convergence is reached)
    :param pruning_threshold: Threshold below which matrix elements will be
        set to 0 (must be >= 0). A value of 0 disables pruning
    :param pruning_frequency: Perform pruning every 'pruning_frequency'
        iterations.
    :param convergence_check_frequency: Perform the check for convergence
        every convergence_check_frequency iterations
    :param verbose: Print extra information to the console
    :returns: The final matrix
    :raises AssertionError: if any parameter is outside its valid range
    """
    # parameter validation
    assert expansion > 1, "Invalid expansion parameter"
    assert inflation > 1, "Invalid inflation parameter"
    assert loop_value >= 0, "Invalid loop_value"
    assert iterations > 0, "Invalid number of iterations"
    assert pruning_threshold >= 0, "Invalid pruning_threshold"
    assert pruning_frequency > 0, "Invalid pruning_frequency"
    assert convergence_check_frequency > 0, "Invalid convergence_check_frequency"

    # all progress output goes through the printer, which stays silent
    # unless verbose is set
    printer = MessagePrinter(verbose)

    # summary of the run parameters
    printer.print("-" * 50)
    printer.print("MCL Parameters")
    printer.print("Expansion: {}".format(expansion))
    printer.print("Inflation: {}".format(inflation))
    if pruning_threshold > 0:
        printer.print("Pruning threshold: {}, frequency: {} iteration{}".format(
            pruning_threshold, pruning_frequency, "s" if pruning_frequency > 1 else ""))
    else:
        printer.print("No pruning")
    printer.print("Convergence check: {} iteration{}".format(
        convergence_check_frequency, "s" if convergence_check_frequency > 1 else ""))
    printer.print("Maximum iterations: {}".format(iterations))
    printer.print("{} matrix mode".format("Sparse" if isspmatrix(matrix) else "Dense"))
    printer.print("-" * 50)

    # Initialize self-loops
    if loop_value > 0:
        matrix = add_self_loops(matrix, loop_value)

    # Normalize
    matrix = normalize(matrix)

    # iterations
    for i in range(iterations):
        printer.print("Iteration {}".format(i + 1))

        # store current matrix for convergence checking
        last_mat = matrix.copy()

        # perform MCL expansion and inflation
        matrix = iterate(matrix, expansion, inflation)

        # prune (i is zero-based, so this fires on every
        # pruning_frequency-th iteration)
        if pruning_threshold > 0 and i % pruning_frequency == pruning_frequency - 1:
            printer.print("Pruning")
            matrix = prune(matrix, pruning_threshold)

        # Check for convergence (same zero-based schedule as pruning)
        if i % convergence_check_frequency == convergence_check_frequency - 1:
            printer.print("Checking for convergence")
            if converged(matrix, last_mat):
                printer.print("Converged after {} iteration{}".format(i + 1, "s" if i > 0 else ""))
                break

    printer.print("-" * 50)

    return matrix
def convert_to_adjacency_matrix(matrix):
    """
    Converts transition matrix into adjacency matrix

    Every column is multiplied by the largest denominator found among its
    entries, each entry being approximated by a fraction. A column made of
    equal fractions (e.g. 0.5, 0.5) therefore becomes a column of ones.

    The input matrix is left untouched; the scaling is applied to a copy.

    :param matrix: The matrix to be converted (sparse matrix, np.matrix
                   or 2-d array)
    :returns: adjacency matrix, as a new matrix of the same type as the input
    """
    adjacency = matrix.copy()
    sparse = isspmatrix(adjacency)

    # iterate over the columns (shape[1]), since columns are what is scaled
    for i in range(adjacency.shape[1]):
        if sparse:
            # only the stored values of the column
            col = find(adjacency[:, i])[2]
        else:
            # ravel so that np.matrix columns and plain 2-d array columns
            # are both turned into a flat list of values
            col = np.asarray(adjacency[:, i]).ravel().tolist()

        # default=1 leaves a column without any entry unscaled instead of
        # failing on an empty sequence
        coeff = max(
            (Fraction(c).limit_denominator().denominator for c in col),
            default=1,
        )

        # scaling by 1 is a no-op, skip the needless column assignment
        if coeff != 1:
            adjacency[:, i] *= coeff

    return adjacency
class MessagePrinter(object):
    """
    Printer whose output can be switched on and off
    """

    def __init__(self, enabled):
        """
        :param enabled: Whether messages are printed initially
        """
        self._enabled = enabled

    def enable(self):
        """
        Switch printing on
        """
        self._enabled = True

    def disable(self):
        """
        Switch printing off
        """
        self._enabled = False

    def print(self, string):
        """
        Print the message, but only while the printer is enabled

        :param string: The message to print
        """
        if not self._enabled:
            return
        print(string)
| url=distmeta["__homepage__"], 31 | license="MIT", 32 | platforms=["linux"], 33 | packages=["markov_clustering"], 34 | python_requires='~=3.0', 35 | install_requires=[ 36 | "numpy", 37 | "scipy>=0.19.0", 38 | "scikit-learn", 39 | ], 40 | extras_require={ 41 | "drawing": ["networkx", "matplotlib"] 42 | }, 43 | entry_points={ 44 | }, 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Intended Audience :: Developers", 48 | "Intended Audience :: Science/Research", 49 | "Operating System :: POSIX :: Linux", 50 | "Programming Language :: Python", 51 | "Topic :: Scientific/Engineering", 52 | "License :: OSI Approved :: MIT License", 53 | ], 54 | keywords = "bioinformatics clustering" 55 | ) 56 | -------------------------------------------------------------------------------- /static/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuyAllard/markov_clustering/28787cf64ef06bf024ff915246008c767ea830cf/static/example.png -------------------------------------------------------------------------------- /static/example_best.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuyAllard/markov_clustering/28787cf64ef06bf024ff915246008c767ea830cf/static/example_best.png -------------------------------------------------------------------------------- /static/example_coarse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuyAllard/markov_clustering/28787cf64ef06bf024ff915246008c767ea830cf/static/example_coarse.png -------------------------------------------------------------------------------- /tests/test_mc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | import markov_clustering as mc 4 | from scipy.sparse import csc_matrix 5 | 6 | test_matrices = [ 7 | ( # normalize 8 | 
def test_normalize():
    """Normalizing the dense fixture reproduces the expected matrix."""
    raw, expected = test_matrices[0]

    normalized = mc.normalize(np.matrix(raw))
    assert np.array_equal(normalized, np.matrix(expected))


def test_normalize_sparse():
    """Normalizing the sparse fixture reproduces the expected matrix."""
    raw, expected = test_matrices[0]

    normalized = mc.normalize(csc_matrix(raw))
    assert np.array_equal(normalized.todense(), np.matrix(expected))
def test_inflate_sparse():
    """Inflating the sparse fixture with power 2 matches the target."""
    raw, expected = test_matrices[1]

    result = mc.inflate(csc_matrix(raw), 2)
    assert np.array_equal(result.todense(), np.matrix(expected))


def test_expand():
    """Expanding the dense fixture with power 2 matches the target."""
    raw, expected = test_matrices[2]

    result = mc.expand(np.matrix(raw), 2)
    assert np.array_equal(result, np.matrix(expected))


def test_expand_sparse():
    """Expanding the sparse fixture with power 2 matches the target."""
    raw, expected = test_matrices[2]

    result = mc.expand(csc_matrix(raw), 2)
    assert np.array_equal(result.todense(), np.matrix(expected))


def test_add_self_loops():
    """Adding self loops of value 2 to the dense fixture matches the target."""
    raw, expected = test_matrices[3]

    result = mc.add_self_loops(np.matrix(raw), 2)
    assert np.array_equal(result, np.matrix(expected))


def test_add_self_loops_sparse():
    """Adding self loops of value 2 to the sparse fixture matches the target."""
    raw, expected = test_matrices[3]

    result = mc.add_self_loops(csc_matrix(raw), 2)
    assert np.array_equal(result.todense(), np.matrix(expected))


def test_prune():
    """Pruning the dense fixture with threshold 1 matches the target."""
    raw, expected = test_matrices[4]

    result = mc.prune(np.matrix(raw), 1)
    assert np.array_equal(result, np.matrix(expected))


def test_prune_sparse():
    """Pruning the sparse fixture with threshold 1 matches the target."""
    raw, expected = test_matrices[4]

    result = mc.prune(csc_matrix(raw), 1)
    assert np.array_equal(result.todense(), np.matrix(expected))


def test_converged():
    """A dense matrix converges with itself but not with a modified copy."""
    base = np.matrix(test_matrices[5])
    assert mc.converged(base, base)

    # Change a single entry so the two matrices are no longer equal.
    perturbed = base.copy()
    perturbed[0, 0] = 2.2
    assert not mc.converged(base, perturbed)
def test_iterate():
    """One normalized iteration on the dense fixture matches to 4 decimals."""
    raw, expected = test_matrices[6]

    stepped = mc.iterate(np.matrix(raw), 2, 2)
    result = mc.normalize(stepped)
    assert np.array_equal(np.round(result, 4), np.round(np.matrix(expected), 4))


def test_iterate_sparse():
    """One normalized iteration on the sparse fixture matches to 4 decimals."""
    raw, expected = test_matrices[6]

    stepped = mc.iterate(csc_matrix(raw), 2, 2)
    result = mc.normalize(stepped).todense()
    assert np.array_equal(np.round(result, 4), np.round(np.matrix(expected), 4))


def test_mcl():
    """Running MCL on the dense fixture matches the target to 4 decimals."""
    raw, expected = test_matrices[7]

    result = mc.run_mcl(np.matrix(raw))
    assert np.array_equal(np.round(result, 4), np.round(np.matrix(expected), 4))


def test_mcl_sparse():
    """Running MCL on the sparse fixture matches the target to 4 decimals."""
    raw, expected = test_matrices[7]

    result = mc.run_mcl(csc_matrix(raw)).todense()
    assert np.array_equal(np.round(result, 4), np.round(np.matrix(expected), 4))


def test_get_clusters():
    """Clusters read from the dense MCL result fixture are the two expected groups."""
    expected = [(0, 1, 2), (3, 4, 5, 6)]
    mcl_result = np.matrix(test_matrices[7][1])

    assert mc.get_clusters(mcl_result) == expected


def test_get_clusers_sparse():
    """Clusters read from the sparse MCL result fixture are the two expected groups."""
    expected = [(0, 1, 2), (3, 4, 5, 6)]
    mcl_result = csc_matrix(test_matrices[7][1])

    assert mc.get_clusters(mcl_result) == expected
def test_is_undirected_1():
    """The first dense fixture gets the expected undirected verdict."""
    raw, expected = test_matrices[0]

    verdict = mc.is_undirected(np.matrix(raw))
    assert verdict == expected


def test_is_undirected_1_sparse():
    """The first sparse fixture gets the expected undirected verdict."""
    raw, expected = test_matrices[0]

    verdict = mc.is_undirected(csc_matrix(raw))
    assert verdict == expected


def test_is_undirected_2():
    """The second dense fixture gets the expected undirected verdict."""
    raw, expected = test_matrices[1]

    verdict = mc.is_undirected(np.matrix(raw))
    assert verdict == expected


def test_is_undirected_2_sparse():
    """The second sparse fixture gets the expected undirected verdict."""
    raw, expected = test_matrices[1]

    verdict = mc.is_undirected(csc_matrix(raw))
    assert verdict == expected


def test_conversion():
    """Converting the dense fixture to an adjacency matrix matches the target."""
    raw, expected = test_matrices[2]

    adjacency = mc.convert_to_adjacency_matrix(np.matrix(raw))
    assert np.array_equal(adjacency, np.matrix(expected))
def test_delta_matrix():
    """The delta matrix built for the dense fixture matches the target."""
    clusters, expected = test_matrices[3]
    graph = np.matrix(test_matrices[4][0])

    result = mc.delta_matrix(graph, clusters)
    assert np.array_equal(result, np.matrix(expected))


def test_delta_matrix_sparse():
    """The delta matrix built for the sparse fixture matches the target."""
    clusters, expected = test_matrices[3]
    graph = csc_matrix(test_matrices[4][0])

    result = mc.delta_matrix(graph, clusters).todense()
    assert np.array_equal(result, np.matrix(expected))


def test_modularity():
    """Modularity of the MCL clustering of the dense fixture is close to the target."""
    raw, expected = test_matrices[4]
    graph = np.matrix(raw)

    mcl_result = mc.run_mcl(graph)
    clusters = mc.get_clusters(mcl_result)
    assert np.isclose(mc.modularity(graph, clusters), expected)


def test_modularity_sparse():
    """Modularity of the MCL clustering of the sparse fixture is close to the target."""
    raw, expected = test_matrices[4]
    graph = csc_matrix(raw)

    mcl_result = mc.run_mcl(graph)
    clusters = mc.get_clusters(mcl_result)
    assert np.isclose(mc.modularity(graph, clusters), expected)