├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── gmatch4py ├── __init__.py ├── bag_of_cliques.pyx ├── base.pxd ├── base.pyx ├── bon.pyx ├── embedding │ ├── __init__.py │ ├── deepwalk.pyx │ ├── graph.pyx │ ├── graph2vec.pyx │ ├── node2vec.pyx │ ├── skipgram.pyx │ └── walks.pyx ├── ged │ ├── __init__.py │ ├── abstract_graph_edit_dist.pxd │ ├── abstract_graph_edit_dist.pyx │ ├── bipartite_graph_matching_2.pyx │ ├── graph_edit_dist.pxd │ ├── graph_edit_dist.pyx │ ├── greedy_edit_distance.pyx │ └── hausdorff_edit_distance.pyx ├── graph.pxd ├── graph.pyx ├── helpers │ ├── __init__.py │ ├── general.pyx │ └── reader.pyx ├── jaccard.pyx ├── kernels │ ├── __init__.py │ ├── adjacency.pyx │ ├── random_walk_kernel.pyx │ ├── shortest_path_kernel.pyx │ └── weisfeiler_lehman.pyx ├── mcs.pyx ├── vertex_edge_overlap.pyx └── vertex_ranking.pyx ├── logo2.png ├── requirements.txt ├── setup.py └── test ├── gmatch4py_performance_test.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | *.cpp 106 | *.c 107 | .DS_Store 108 | .idea 109 | .vscode -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | notifications: 6 | email: false 7 | 8 | install: 9 | - pip install cython numpy networkx scipy scikit-learn pandas gensim joblib gensim psutil --upgrade 10 | - pip install . 
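  # A local equivalent of the install step above, assuming a Python 3 environment with pip:
  #   pip install cython numpy networkx scipy scikit-learn pandas gensim joblib psutil --upgrade
  #   pip install .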
11 | 
12 | script:
13 |   - echo "1"
14 | 
15 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Jacques Fize
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![](logo2.png)
2 | 
3 | 
4 | [![Build Status](https://travis-ci.com/Jacobe2169/GMatch4py.svg?branch=master)](https://travis-ci.com/Jacobe2169/GMatch4py)
5 | # GMatch4py, a graph matching library for Python
6 | 
7 | 
8 | GMatch4py is a library dedicated to graph matching. Graph structures are stored in NetworkX graph objects.
9 | GMatch4py algorithms are implemented in Cython to enhance performance.
10 | 
11 | ## Requirements
12 | 
13 | * Python 3
14 | * Numpy and Cython installed (if not: `(sudo) pip(3) install numpy cython`)
15 | 
16 | ## Installation
17 | 
18 | To install `GMatch4py`, run the following commands:
19 | 
20 | ```bash
21 | git clone https://github.com/Jacobe2169/GMatch4py.git
22 | cd GMatch4py
23 | (sudo) pip(3) install .
24 | ```
25 | 
26 | ## Get Started
27 | ### Graph input format
28 | 
29 | In `GMatch4py`, algorithms manipulate `networkx.Graph`, a complete graph model that
30 | comes with a large spectrum of parsers to load your graphs from various formats: `*.graphml`, `*.gexf`, ... (check [here](https://networkx.github.io/documentation/stable/reference/readwrite/index.html) to see all the accepted formats)
31 | 
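For instance, a graph serialized as GraphML can be loaded with `networkx` and handed directly to GMatch4py. A minimal sketch, assuming a file named `my_graph.graphml` (the path is a placeholder):

```python
import networkx as nx

# Any networkx reader (read_graphml, read_gexf, ...) returns a ready-to-use Graph
g = nx.read_graphml("my_graph.graphml")  # placeholder path
print(g.number_of_nodes(), g.number_of_edges())
```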
32 | ### Use GMatch4py
33 | If you want to use algorithms like *graph edit distances*, here is an example:
34 | 
35 | ```python
36 | # Gmatch4py uses networkx graphs
37 | import networkx as nx
38 | # import the GED using the munkres algorithm
39 | import gmatch4py as gm
40 | ```
41 | 
42 | In this example, we use graphs generated with `networkx` helpers:
43 | ```python
44 | g1=nx.complete_bipartite_graph(5,4)
45 | g2=nx.complete_bipartite_graph(6,4)
46 | ```
47 | 
48 | All graph matching algorithms in `Gmatch4py` work this way:
49 | * Each algorithm is associated with an object, and each object has its own specific parameters. In this case, the parameters are the edit costs (delete a vertex, add a vertex, ...)
50 | * Each object exposes a `compare()` method that takes two parameters. The first parameter is **a list of the graphs** you want to **compare**, i.e. measure the distance/similarity between (which of the two depends on the algorithm). You can also compare a sample of graphs to all the other graphs: in that case, the second parameter should be **a list containing the indices** of these graphs (based on the first parameter list). If you would rather compute the distance/similarity **between all graphs**, just pass the `None` value.
51 | 
52 | ```python
53 | ged=gm.GraphEditDistance(1,1,1,1) # all edit costs are equal to 1
54 | result=ged.compare([g1,g2],None)
55 | print(result)
56 | ```
57 | 
58 | The output is a similarity/distance matrix:
59 | ```python
60 | array([[0., 14.],
61 |        [10., 0.]])
62 | ```
63 | This output is "raw"; if you wish to have normalized results in terms of distance (or similarity), you can use:
64 | 
65 | ```python
66 | ged.similarity(result)
67 | # or
68 | ged.distance(result)
69 | ```
70 | 
71 | ## Exploit node and edge attributes
72 | 
73 | In this latest version, we added the possibility to exploit graph attributes! To do so, the `base.Base` class is extended with the `set_attr_graph_used(node_attr, edge_attr)` method.
74 | 
75 | ```python
76 | import networkx as nx
77 | import gmatch4py as gm
78 | ged = gm.GraphEditDistance(1,1,1,1)
79 | ged.set_attr_graph_used("theme","color") # The node "theme" and edge "color" attributes will be used.
80 | ```
81 | 
82 | ## List of algorithms
83 | 
84 | * Graph Embedding
85 |     * Graph2Vec [1]
86 | * Node Embedding
87 |     * DeepWalk [7]
88 |     * Node2vec [8]
89 | * Graph kernels
90 |     * Random Walk Kernel (*debug needed*) [3]
91 |         * Geometrical
92 |         * K-Step
93 |     * Shortest Path Kernel [3]
94 |     * Weisfeiler-Lehman Kernel [4]
95 |         * Subtree Kernel
96 | * Graph Edit Distance [5]
97 |     * Approximated Graph Edit Distance
98 |     * Hausdorff Graph Edit Distance
99 |     * Bipartite Graph Edit Distance
100 |     * Greedy Edit Distance
101 | * Vertex Ranking [2]
102 | * Vertex Edge Overlap [2]
103 | * Bag of Nodes (a bag of words model using nodes as vocabulary)
104 | * Bag of Cliques (a bag of words model using cliques as vocabulary)
105 | * MCS [6]
106 | 
107 | 
108 | ## Publications associated
109 | 
110 | * [1] Narayanan, Annamalai and Chandramohan, Mahinthan and Venkatesan, Rajasekar and Chen, Lihui and Liu, Yang. Graph2vec: Learning distributed representations of graphs. MLG 2017, 13th International Workshop on Mining and Learning with Graphs (MLGWorkshop 2017).
111 | * [2] Papadimitriou, P., Dasdan, A., & Garcia-Molina, H. (2010). Web graph similarity for anomaly detection. Journal of Internet Services and Applications, 1(1), 19-30.
112 | * [3] Vishwanathan, S. V. N., Schraudolph, N. N., Kondor, R., & Borgwardt, K. M. (2010). Graph kernels. Journal of Machine Learning Research, 11(Apr), 1201-1242.
113 | * [4] Shervashidze, N., Schweitzer, P., Leeuwen, E. J. V., Mehlhorn, K., & Borgwardt, K. M. (2011). Weisfeiler-lehman graph kernels. Journal of Machine Learning Research, 12(Sep), 2539-2561.
114 | * [5] Fischer, A., Riesen, K., & Bunke, H. (2017). Improved quadratic time approximation of graph edit distance by combining Hausdorff matching and greedy assignment. Pattern Recognition Letters, 87, 55-62.
115 | * [6] Bunke, H., & Shearer, K. (1998). A graph distance metric based on the maximal common subgraph. Pattern Recognition Letters, 19(3-4), 255-259.
116 | * [7] Perozzi, B., Al-Rfou, R., & Skiena, S. (2014, August). Deepwalk: Online learning of social representations.
In Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining (pp. 701-710). ACM. 117 | * [8] node2vec: Scalable Feature Learning for Networks. Aditya Grover and Jure Leskovec. Knowledge Discovery and Data Mining, 2016. 118 | 119 | ## Author(s) 120 | 121 | Jacques Fize, *jacques[dot]fize[at]cirad[dot]fr* 122 | 123 | Some algorithms from other projects were integrated to Gmatch4py. **Be assured that 124 | each code is associated with a reference to the original.** 125 | 126 | 127 | ## CHANGELOG 128 | 129 | ### 18.06.2022 130 | * Debug the `skipgram` import 131 | * Gmatch4py should work with new gensim version 132 | 133 | 134 | ### 7.05.2019 135 | 136 | * Debug (problems with float edge weight) 137 | * Add the `AbstractEditDistance.edit_path(G,H)` method that return the edit path, the cost matrix and the selected cost index in the cost matrix 138 | * Add a tqdm progress bar for the `gmatch4py.helpers.reader.import_dir()` function 139 | 140 | ### 12.03.2019 141 | 142 | * Add Node2vec 143 | 144 | ### 05.03.2019 145 | 146 | * Add Graph Embedding algorithms 147 | * Remove depreciated methods and classes 148 | * Add logo 149 | * Update documentation 150 | 151 | 152 | ### 25.02.2019 153 | * Add New Graph Class. Features : Cython Extensions, precomputed values (degrees, neighbor info), hash representation of edges and nodes for a faster comparison 154 | * Some algorithms are parallelized such as graph edit distances or Jaccard 155 | 156 | ## TODO List 157 | 158 | * Debug algorithms --> Random Walk Kernel, Deltacon 159 | * Optimize algorithms --> Vertex Ranking 160 | 161 | -------------------------------------------------------------------------------- /gmatch4py/__init__.py: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | # Graph Edit Distance algorithms import 4 | from .ged.graph_edit_dist import * 5 | from .ged.greedy_edit_distance import * 6 | from .ged.bipartite_graph_matching_2 import * 7 | from .ged.hausdorff_edit_distance import * 8 | 9 | # Kernels algorithms import 10 | from .kernels.weisfeiler_lehman import * 11 | from .kernels.shortest_path_kernel import * 12 | 13 | # Graph Embedding import 14 | from .embedding.graph2vec import * 15 | from .embedding.deepwalk import * 16 | from .embedding.node2vec import * 17 | # Helpers import 18 | from .helpers.reader import * 19 | from .helpers.general import * 20 | 21 | # Basic algorithms import 22 | from .bag_of_cliques import * 23 | from .mcs import * 24 | from .vertex_edge_overlap import * 25 | from .vertex_ranking import * 26 | from .jaccard import * 27 | from .bon import * 28 | -------------------------------------------------------------------------------- /gmatch4py/bag_of_cliques.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | import copy 4 | from typing import Sequence 5 | 6 | import networkx as nx 7 | import numpy as np 8 | cimport numpy as np 9 | from scipy.sparse import csr_matrix,lil_matrix 10 | import sys 11 | 12 | from .base cimport Base 13 | 14 | 15 | cdef class BagOfCliques(Base): 16 | """ 17 | The Bag of Cliques is representation of a graph corpus using the well-known *bag of words* model. Here, instead of 18 | word, we use unique cliques found in the graphs as a vocabulary. A clique is a highly connected graph where all the vertices are connected by an edge. 19 | 20 | The resulting representation is then use to compute similarity value between graphs. 
For this purpose, we use the cosine 21 | similarity. 22 | """ 23 | 24 | def __init__(self): 25 | """ 26 | Constructor of Bag Of Cliques. 27 | """ 28 | Base.__init__(self,0,True) 29 | 30 | 31 | cpdef np.ndarray compare(self,list listgs, list selected): 32 | b=BagOfCliques() 33 | bog=b.get_bag_of_cliques(listgs).astype(np.float32) 34 | cdef int n=bog.shape[0] 35 | cdef np.ndarray scores = np.zeros((n,n)) 36 | cdef int i 37 | for i in range(len(scores)): 38 | if selected: 39 | if not i in selected: 40 | continue 41 | bog_i=bog[i] 42 | for j in range(i,len(scores)): 43 | bog_j=bog[j] 44 | scores[i,j]=(np.dot(bog_i,bog_j.T))/(np.sqrt(np.sum(bog_i**2))*np.sqrt(np.sum(bog_j**2))) # Can be computed in one line 45 | scores[j,i]=scores[i,j] 46 | return scores 47 | 48 | def get_unique_cliques(self, graphs): 49 | """ 50 | Return a cliques found in a set of graphs 51 | Parameters 52 | ---------- 53 | graphs: networkx.Graph array 54 | list of graphs 55 | 56 | Returns 57 | ------- 58 | list 59 | Cliques set 60 | """ 61 | t = {} 62 | c_ = 0 63 | cdef list clique_vocab = [] 64 | cdef list cli_temp 65 | cdef list cliques 66 | cdef int len_graphs=len(graphs) 67 | cdef int km= -1 68 | for g in graphs: 69 | km+=1 70 | if not g: 71 | continue 72 | cliques = list(nx.find_cliques(nx.Graph(g))) 73 | for clique in cliques: 74 | cli_temp = copy.deepcopy(clique) 75 | new_clique = False 76 | for i in range(len(clique)): 77 | flag = False 78 | v = None # vertex deleted 79 | for vertex in cli_temp: 80 | if vertex in t: 81 | v = vertex 82 | flag = True 83 | 84 | if not flag in t: 85 | v = cli_temp[0] 86 | t[v] = {} 87 | new_clique = True 88 | t = t[v] 89 | cli_temp.remove(v) 90 | 91 | if new_clique: 92 | c_ += 1 93 | clique_vocab.append(clique) 94 | return clique_vocab 95 | 96 | 97 | def clique2str(self,cliques): 98 | """ 99 | Return a "hash" string of a clique 100 | 101 | Parameters 102 | ---------- 103 | cliques: array 104 | 105 | Returns 106 | ------- 107 | str 108 | hash of a clique 109 | """ 110 | try: 111 | return "".join(sorted(cliques)) 112 | except: 113 | return "".join(sorted(list(map(str,cliques)))) 114 | 115 | def transform_clique_vocab(self,clique_vocab): 116 | """ 117 | Transform cliques found in `get_unique_cliques()` in a proper format to build the "Bag of Cliques" 118 | 119 | Parameters 120 | ---------- 121 | clique_vocab : array 122 | contains cliques 123 | Returns 124 | ------- 125 | dict 126 | new clique vocab format 127 | """ 128 | cdef dict new_vocab={} 129 | cdef int len_voc=len(clique_vocab) 130 | for c in range(len_voc): 131 | #print(c) 132 | new_vocab[self.clique2str(clique_vocab[c])]=c 133 | return new_vocab 134 | 135 | def get_bag_of_cliques(self, graphs): 136 | """ 137 | Return a the Bag of Cliques representation from a graph set. 
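        Each row of the returned matrix is a binary vector indicating which vocabulary cliques occur in the corresponding graph.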
138 | 
139 |         Parameters
140 |         ----------
141 |         graphs : networkx.Graph array
142 |             list of graphs
143 | 
144 |         Returns
145 |         -------
146 |         np.ndarray
147 |             bag of cliques
148 |         """
149 |         cdef list clique_vocab=self.get_unique_cliques(graphs)
150 |         cdef dict map_str_cliques=self.transform_clique_vocab(clique_vocab)
151 |         cdef int l_v=len(clique_vocab)
152 |         boc = np.zeros((len(graphs), l_v))
153 |         cdef np.ndarray vector
154 |         cdef list cliques
155 |         cdef str hash
156 | 
157 |         for g in range(len(graphs)):
158 |             #sys.stdout.write("\r{0}/{1}".format(g,len(graphs)))
159 |             gr = graphs[g]
160 |             vector = np.zeros(l_v)
161 |             cliques = list(nx.find_cliques(nx.Graph(gr)))
162 |             for clique in cliques:
163 |                 hash=self.clique2str(clique)
164 |                 if hash in map_str_cliques:
165 |                     vector[map_str_cliques[hash]] = 1
166 |             boc[g] = vector
167 |         return boc
168 | 
--------------------------------------------------------------------------------
/gmatch4py/base.pxd:
--------------------------------------------------------------------------------
1 | cimport numpy as np
2 | 
3 | cdef class Base:
4 |     ## Attribute(s)
5 |     cdef int type_alg
6 |     cdef bint normalized
7 |     cdef int cpu_count
8 |     cdef str node_attr_key
9 |     cdef str edge_attr_key
10 |     ## Methods
11 |     cpdef np.ndarray compare(self,list graph_list, list selected)
12 |     cpdef np.ndarray compare_old(self,list listgs, list selected)
13 |     cpdef np.ndarray distance(self, np.ndarray matrix)
14 |     cpdef np.ndarray similarity(self, np.ndarray matrix)
15 |     cpdef bint isAccepted(self,G,index,selected)
16 |     cpdef np.ndarray get_selected_array(self,selected,size_corpus)
17 | 
18 |     cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key)
19 | 
20 | 
--------------------------------------------------------------------------------
/gmatch4py/base.pyx:
--------------------------------------------------------------------------------
1 | # coding = utf-8
2 | 
3 | import numpy as np
4 | cimport numpy as np
5 | import networkx as nx
6 | cimport cython
7 | import multiprocessing
8 | 
9 | 
10 | 
11 | cpdef np.ndarray minmax_scale(np.ndarray matrix):
12 |     """
13 |     Scale the matrix by its maximum value (invalid entries are masked first). Optimized so it works with Cython.
14 |     :param matrix: matrix to scale
15 |     :return: scaled matrix
16 |     """
17 |     cdef double min_,max_
18 |     cdef np.ndarray x
19 |     x=np.ma.masked_invalid(matrix)
20 |     max_=np.max(x)
21 |     return x/(max_)
22 | 
23 | 
24 | cdef class Base:
25 |     """
26 |     This class defines the methods common to all graph matching algorithms.
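    Concrete matchers (graph edit distances, kernels, embedding-based measures, ...) subclass it and implement `compare()`.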
27 | 28 | Attributes 29 | ---------- 30 | type_alg : int 31 | Indicate the type of measure returned by the algorithm : 32 | 33 | * 0 : similarity 34 | * 1 : distance 35 | normalized : bool 36 | Indicate if the algorithm return normalized results (between 0 and 1) 37 | 38 | """ 39 | def __cinit__(self): 40 | self.type_alg=0 41 | self.normalized=False 42 | 43 | def __init__(self,type_alg,normalized,node_attr_key="",edge_attr_key=""): 44 | """ 45 | Constructor of Base 46 | 47 | Parameters 48 | ---------- 49 | type_alg : int 50 | Indicate the type of measure returned by the algorithm : 51 | 52 | * **0** : similarity 53 | * **1** : distance 54 | normalized : bool 55 | Indicate if the algorithm return normalized results (between 0 and 1) 56 | """ 57 | if type_alg <0: 58 | self.type_alg=0 59 | elif type_alg >1 : 60 | self.type_alg=1 61 | else: 62 | self.type_alg=type_alg 63 | self.normalized=normalized 64 | self.cpu_count=multiprocessing.cpu_count() 65 | self.node_attr_key=node_attr_key 66 | self.edge_attr_key=edge_attr_key 67 | 68 | cpdef set_attr_graph_used(self, str node_attr_key, str edge_attr_key): 69 | """ 70 | Set graph attribute used by the algorithm to compare graphs. 71 | Parameters 72 | ---------- 73 | node_attr_key : str 74 | key of the node attribute 75 | edge_attr_key: str 76 | key of the edge attribute 77 | 78 | """ 79 | self.node_attr_key=node_attr_key 80 | self.edge_attr_key=edge_attr_key 81 | 82 | cpdef np.ndarray get_selected_array(self,selected,size_corpus): 83 | """ 84 | Return an array which define which graph will be compared in the algorithms. 85 | Parameters 86 | ---------- 87 | selected : list 88 | indices of graphs you wish to compare 89 | size_corpus : 90 | size of your dataset 91 | 92 | Returns 93 | ------- 94 | np.ndarray 95 | selected vector (1 -> selected, 0 -> not selected) 96 | """ 97 | cdef double[:] selected_test = np.zeros(size_corpus) 98 | if not selected == None: 99 | for ix in range(len(selected)): 100 | selected_test[selected[ix]]=1 101 | return np.array(selected_test) 102 | else: 103 | return np.array(selected_test)+1 104 | 105 | 106 | cpdef np.ndarray compare_old(self,list listgs, list selected): 107 | """ 108 | Soon will be depreciated ! To store the old version of an algorithm. 109 | Parameters 110 | ---------- 111 | listgs : list 112 | list of graphs 113 | selected 114 | selected graphs 115 | 116 | Returns 117 | ------- 118 | np.ndarray 119 | distance/similarity matrix 120 | """ 121 | pass 122 | 123 | @cython.boundscheck(False) 124 | cpdef np.ndarray compare(self,list graph_list, list selected): 125 | """ 126 | Return the similarity/distance matrix using the current algorithm. 127 | 128 | >>>Base.compare([nx.Graph(),nx.Graph()],None) 129 | >>>Base.compare([nx.Graph(),nx.Graph()],[0,1]) 130 | 131 | Parameters 132 | ---------- 133 | graph_list : networkx.Graph array 134 | Contains the graphs to compare 135 | selected : int array 136 | Sometimes, you only wants to compute similarity of some graphs to every graphs. If so, indicate their indices in 137 | `graph_list`, else, put the None value. 
138 | the None value 139 | Returns 140 | ------- 141 | np.ndarray 142 | distance/similarity matrix 143 | 144 | """ 145 | pass 146 | 147 | cpdef np.ndarray distance(self, np.ndarray matrix): 148 | """ 149 | Return a normalized distance matrix 150 | Parameters 151 | ---------- 152 | matrix : np.ndarray 153 | Similarity/distance matrix you wish to transform 154 | 155 | Returns 156 | ------- 157 | np.ndarray 158 | distance matrix 159 | """ 160 | if self.type_alg == 1: 161 | if not self.normalized: 162 | matrix=np.ma.getdata(minmax_scale(matrix)) 163 | return matrix 164 | else: 165 | if not self.normalized: 166 | matrix=np.ma.getdata(minmax_scale(matrix)) 167 | return 1-matrix 168 | 169 | cpdef np.ndarray similarity(self, np.ndarray matrix): 170 | """ 171 | Return a normalized similarity matrix 172 | Parameters 173 | ---------- 174 | matrix : np.ndarray 175 | Similarity/distance matrix you wish to transform 176 | 177 | Returns 178 | ------- 179 | np.array 180 | similarity matrix 181 | """ 182 | if self.type_alg == 0: 183 | return matrix 184 | else: 185 | if not self.normalized: 186 | matrix=np.ma.getdata(minmax_scale(matrix)) 187 | return 1-matrix 188 | 189 | 190 | cpdef bint isAccepted(self,G,index,selected): 191 | """ 192 | Indicate if the graph will be compared to the other. A graph is "accepted" if : 193 | * G exists(!= None) and not empty (|vertices(G)| >0) 194 | * If selected graph to compare were indicated, check if G exists in selected 195 | 196 | Parameters 197 | ---------- 198 | G : networkx.Graph 199 | Graph 200 | index : int 201 | index in the graph list parameter in `Base.compare()` 202 | selected : int array 203 | `selected` parameter value in `Base.compare()` 204 | 205 | Returns 206 | ------- 207 | bool : 208 | if is accepted 209 | """ 210 | f=True 211 | if not G: 212 | f=False 213 | elif len(G)== 0: 214 | f=False 215 | if selected: 216 | if not index in selected: 217 | f=False 218 | return f 219 | -------------------------------------------------------------------------------- /gmatch4py/bon.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | import networkx as nx 4 | import numpy as np 5 | cimport numpy as np 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | from .base cimport Base 8 | 9 | cdef class BagOfNodes(Base): 10 | """ 11 | We could call this algorithm Bag of nodes 12 | """ 13 | def __init__(self): 14 | Base.__init__(self,0,True) 15 | 16 | cpdef np.ndarray compare(self,list graph_list, list selected): 17 | nodes = list() 18 | for g in graph_list: 19 | nodes.extend(list(g.nodes())) 20 | 21 | vocabulary = list(set(nodes)) 22 | hash_voc = {} 23 | i = 0 24 | for se in vocabulary: 25 | hash_voc[se] = i 26 | i += 1 27 | n, m = len(graph_list), len(hash_voc) 28 | bow_matrix = np.zeros((n, m)) 29 | i = 0 30 | for g in range(len(graph_list)): 31 | graph = graph_list[g] 32 | nodes = list(graph.nodes()) 33 | for nod in nodes: 34 | j = hash_voc[nod] 35 | bow_matrix[i, j] = 1 36 | i += 1 37 | 38 | sim_matrix = cosine_similarity(bow_matrix) 39 | np.fill_diagonal(sim_matrix, 1) 40 | return sim_matrix 41 | -------------------------------------------------------------------------------- /gmatch4py/embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacquesfize/GMatch4py/4fc0a822514c65c0d8b12d090b5b89c0af50ef2a/gmatch4py/embedding/__init__.py -------------------------------------------------------------------------------- 
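As a quick illustration of the workflow shared by the matchers above (instantiate, call `compare()`, then normalize), here is a minimal sketch using the `BagOfNodes` class defined just above; the two toy graphs are placeholders:

```python
import networkx as nx
import gmatch4py as gm

g1 = nx.path_graph(4)   # toy graphs, for illustration only
g2 = nx.cycle_graph(4)

bon = gm.BagOfNodes()
raw = bon.compare([g1, g2], None)  # cosine similarity between node indicator vectors
print(bon.similarity(raw))         # normalized similarity matrix
print(bon.distance(raw))           # 1 - similarity
```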
/gmatch4py/embedding/deepwalk.pyx: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import random 7 | 8 | from io import open 9 | from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter 10 | from collections import Counter 11 | from concurrent.futures import ProcessPoolExecutor 12 | import logging 13 | from multiprocessing import cpu_count 14 | 15 | import networkx as nx 16 | import numpy as np 17 | cimport numpy as np 18 | from six import text_type as unicode 19 | from six import iteritems 20 | from six.moves import range 21 | 22 | from gensim.models import Word2Vec 23 | from sklearn.metrics.pairwise import cosine_similarity 24 | from joblib import Parallel, delayed 25 | import psutil 26 | 27 | cimport cython 28 | from ..base cimport Base 29 | import gmatch4py.embedding.graph as graph2 30 | import gmatch4py.embedding.walks as serialized_walks 31 | from .skipgram import Skipgram 32 | 33 | 34 | p = psutil.Process(os.getpid()) 35 | try: 36 | p.set_cpu_affinity(list(range(cpu_count()))) 37 | except AttributeError: 38 | try: 39 | p.cpu_affinity(list(range(cpu_count()))) 40 | except AttributeError: 41 | pass 42 | 43 | 44 | def process(gr, number_walks = 10, walk_length = 40, window_size = 5, vertex_freq_degree = False, workers = 1, representation_size = 64, max_memory_data_size = 1000000000, seed = 0): 45 | """ 46 | Return a DeepWalk embedding for a graph 47 | 48 | Parameters 49 | ---------- 50 | gr : nx.Graph 51 | graph 52 | number_walks : int, optional 53 | Number of walk (the default is 10) 54 | walk_length : int, optional 55 | Length of the random walk started at each node (the default is 40) 56 | window_size : int, optional 57 | Window size of skipgram model. (the default is 5) 58 | vertex_freq_degree : bool, optional 59 | Use vertex degree to estimate the frequency of nodes (the default is False) 60 | workers : int, optional 61 | Number of parallel processes (the default is 1) 62 | representation_size : int, optional 63 | Number of latent dimensions to learn for each node (the default is 64) 64 | max_memory_data_size : int, optional 65 | 'Size to start dumping walks to disk, instead of keeping them in memory. (the default is 1000000000) 66 | seed : int, optional 67 | Seed for random walk generator (the default is 0) 68 | 69 | Returns 70 | ------- 71 | np.array 72 | DeepWalk embedding 73 | """ 74 | 75 | if len(gr.edges())<1: 76 | return np.zeros((1,representation_size)) 77 | G = graph2.from_networkx(gr.copy(), undirected=gr.is_directed()) 78 | num_walks = len(G.nodes()) * number_walks 79 | 80 | data_size = num_walks * walk_length 81 | 82 | #print("Data size (walks*length): {}".format(data_size)) 83 | 84 | if data_size < max_memory_data_size: 85 | #print("Walking...") 86 | walks = graph2.build_deepwalk_corpus(G, num_paths=number_walks, 87 | path_length=walk_length, alpha=0, rand=random.Random(seed)) 88 | #print("Training...") 89 | model = Word2Vec(walks, vector_size=representation_size, 90 | window=window_size, min_count=0, sg=1, hs=1, workers=workers) 91 | else: 92 | #print("Data size {} is larger than limit (max-memory-data-size: {}). 
Dumping walks to disk.".format( 93 | # data_size, max_memory_data_size)) 94 | #print("Walking...") 95 | 96 | walks_filebase = "temp.walks" 97 | walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks, 98 | path_length=walk_length, alpha=0, rand=random.Random(seed), 99 | num_workers=workers) 100 | 101 | #print("Counting vertex frequency...") 102 | if not vertex_freq_degree: 103 | vertex_counts = serialized_walks.count_textfiles( 104 | walk_files, workers) 105 | else: 106 | # use degree distribution for frequency in tree 107 | vertex_counts = G.degree(nodes=G.iterkeys()) 108 | 109 | #print("Training...") 110 | walks_corpus = serialized_walks.WalksCorpus(walk_files) 111 | model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts, 112 | size=representation_size, 113 | window=window_size, min_count=0, trim_rule=None, workers=workers) 114 | 115 | return model.wv.vectors 116 | 117 | 118 | cdef class DeepWalk(Base): 119 | """ 120 | Based on : 121 | @inproceedings{Perozzi:2014:DOL:2623330.2623732, 122 | author = {Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven}, 123 | title = {DeepWalk: Online Learning of Social Representations}, 124 | booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, 125 | series = {KDD '14}, 126 | year = {2014}, 127 | isbn = {978-1-4503-2956-9}, 128 | location = {New York, New York, USA}, 129 | pages = {701--710}, 130 | numpages = {10}, 131 | url = {http://doi.acm.org/10.1145/2623330.2623732}, 132 | doi = {10.1145/2623330.2623732}, 133 | acmid = {2623732}, 134 | publisher = {ACM}, 135 | address = {New York, NY, USA}, 136 | keywords = {deep learning, latent representations, learning with partial labels, network classification, online learning, social networks}, 137 | } 138 | 139 | Original Code : https://github.com/phanein/deepwalk 140 | 141 | Modified by : Jacques Fize 142 | """ 143 | 144 | def __init__(self): 145 | Base.__init__(self,0,False) 146 | 147 | def extract_embedding(self, listgs): 148 | """ 149 | Extract DeepWalk embedding of each graph in `listgs` 150 | 151 | Parameters 152 | ---------- 153 | listgs : list 154 | list of graphs 155 | 156 | Returns 157 | ------- 158 | list 159 | list of embeddings 160 | """ 161 | 162 | from tqdm import tqdm 163 | models = Parallel(n_jobs = cpu_count())(delayed(process)(nx.Graph(g)) for g in tqdm(listgs,desc="Extracting Embeddings...")) 164 | return models 165 | 166 | @cython.boundscheck(False) 167 | cpdef np.ndarray compare(self,list listgs, list selected): 168 | # Selected is ignored 169 | models = self.extract_embedding(listgs) 170 | vector_matrix = np.array([mod.mean(axis=0) for mod in models]) # Average nodes representations 171 | cs = cosine_similarity(vector_matrix) 172 | return cs 173 | 174 | -------------------------------------------------------------------------------- /gmatch4py/embedding/graph.pyx: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Graph utilities.""" 5 | 6 | import logging 7 | import sys 8 | from io import open 9 | from os import path 10 | from time import time 11 | from glob import glob 12 | from six.moves import range, zip, zip_longest 13 | from six import iterkeys 14 | 15 | try: 16 | from collections.abc import Iterable 17 | except ImportError: 18 | from collections import Iterable 19 | from collections import defaultdict 20 | 21 | import random 22 | from random import shuffle 23 | from 
itertools import product,permutations 24 | from scipy.io import loadmat 25 | from scipy.sparse import issparse 26 | 27 | logger = logging.getLogger("deepwalk") 28 | 29 | 30 | __author__ = "Bryan Perozzi" 31 | __email__ = "bperozzi@cs.stonybrook.edu" 32 | 33 | LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s" 34 | 35 | class Graph(defaultdict): 36 | """Efficient basic implementation of nx `Graph' – Undirected graphs with self loops""" 37 | def __init__(self): 38 | super(Graph, self).__init__(list) 39 | 40 | def nodes(self): 41 | return self.keys() 42 | 43 | def adjacency_iter(self): 44 | return self.iteritems() 45 | 46 | def subgraph(self, nodes={}): 47 | subgraph = Graph() 48 | 49 | for n in nodes: 50 | if n in self: 51 | subgraph[n] = [x for x in self[n] if x in nodes] 52 | 53 | return subgraph 54 | 55 | def make_undirected(self): 56 | 57 | t0 = time() 58 | 59 | for v in self.keys(): 60 | for other in self[v]: 61 | if v != other: 62 | self[other].append(v) 63 | 64 | t1 = time() 65 | logger.info('make_directed: added missing edges {}s'.format(t1-t0)) 66 | 67 | self.make_consistent() 68 | return self 69 | 70 | def make_consistent(self): 71 | t0 = time() 72 | for k in iterkeys(self): 73 | self[k] = list(sorted(set(self[k]))) 74 | 75 | t1 = time() 76 | logger.info('make_consistent: made consistent in {}s'.format(t1-t0)) 77 | 78 | self.remove_self_loops() 79 | 80 | return self 81 | 82 | def remove_self_loops(self): 83 | 84 | removed = 0 85 | t0 = time() 86 | 87 | for x in self: 88 | if x in self[x]: 89 | self[x].remove(x) 90 | removed += 1 91 | 92 | t1 = time() 93 | 94 | logger.info('remove_self_loops: removed {} loops in {}s'.format(removed, (t1-t0))) 95 | return self 96 | 97 | def check_self_loops(self): 98 | for x in self: 99 | for y in self[x]: 100 | if x == y: 101 | return True 102 | 103 | return False 104 | 105 | def has_edge(self, v1, v2): 106 | if v2 in self[v1] or v1 in self[v2]: 107 | return True 108 | return False 109 | 110 | def degree(self, nodes=None): 111 | if isinstance(nodes, Iterable): 112 | return {v:len(self[v]) for v in nodes} 113 | else: 114 | return len(self[nodes]) 115 | 116 | def order(self): 117 | "Returns the number of nodes in the graph" 118 | return len(self) 119 | 120 | def number_of_edges(self): 121 | "Returns the number of nodes in the graph" 122 | return sum([self.degree(x) for x in self.keys()])/2 123 | 124 | def number_of_nodes(self): 125 | "Returns the number of nodes in the graph" 126 | return self.order() 127 | 128 | def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None): 129 | """ Returns a truncated random walk. 130 | 131 | path_length: Length of the random walk. 132 | alpha: probability of restarts. 133 | start: the start node of the random walk. 
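        Each step continues to a uniformly random neighbor with probability 1 - alpha and restarts at the first node of the path otherwise; nodes are returned as strings.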
134 | """ 135 | G = self 136 | if start: 137 | path = [start] 138 | else: 139 | # Sampling is uniform w.r.t V, and not w.r.t E 140 | path = [rand.choice(list(G.keys()))] 141 | 142 | while len(path) < path_length: 143 | cur = path[-1] 144 | if len(G[cur]) > 0: 145 | if rand.random() >= alpha: 146 | path.append(rand.choice(G[cur])) 147 | else: 148 | path.append(path[0]) 149 | else: 150 | break 151 | return [str(node) for node in path] 152 | 153 | # TODO add build_walks in here 154 | 155 | def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, 156 | rand=random.Random(0)): 157 | walks = [] 158 | 159 | nodes = list(G.nodes()) 160 | 161 | for cnt in range(num_paths): 162 | rand.shuffle(nodes) 163 | for node in nodes: 164 | walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node)) 165 | 166 | return walks 167 | 168 | def build_deepwalk_corpus_iter(G, num_paths, path_length, alpha=0, 169 | rand=random.Random(0)): 170 | walks = [] 171 | 172 | nodes = list(G.nodes()) 173 | 174 | for cnt in range(num_paths): 175 | rand.shuffle(nodes) 176 | for node in nodes: 177 | yield G.random_walk(path_length, rand=rand, alpha=alpha, start=node) 178 | 179 | 180 | def clique(size): 181 | return from_adjlist(permutations(range(1,size+1))) 182 | 183 | 184 | # http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python 185 | def grouper(n, iterable, padvalue=None): 186 | "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')" 187 | return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue) 188 | 189 | def parse_adjacencylist(f): 190 | adjlist = [] 191 | for l in f: 192 | if l and l[0] != "#": 193 | introw = [int(x) for x in l.strip().split()] 194 | row = [introw[0]] 195 | row.extend(set(sorted(introw[1:]))) 196 | adjlist.extend([row]) 197 | 198 | return adjlist 199 | 200 | def parse_adjacencylist_unchecked(f): 201 | adjlist = [] 202 | for l in f: 203 | if l and l[0] != "#": 204 | adjlist.extend([[int(x) for x in l.strip().split()]]) 205 | 206 | return adjlist 207 | 208 | def load_adjacencylist(file_, undirected=False, chunksize=10000, unchecked=True): 209 | 210 | if unchecked: 211 | parse_func = parse_adjacencylist_unchecked 212 | convert_func = from_adjlist_unchecked 213 | else: 214 | parse_func = parse_adjacencylist 215 | convert_func = from_adjlist 216 | 217 | adjlist = [] 218 | 219 | t0 = time() 220 | 221 | total = 0 222 | with open(file_) as f: 223 | for idx, adj_chunk in enumerate(map(parse_func, grouper(int(chunksize), f))): 224 | adjlist.extend(adj_chunk) 225 | total += len(adj_chunk) 226 | 227 | t1 = time() 228 | 229 | logger.info('Parsed {} edges with {} chunks in {}s'.format(total, idx, t1-t0)) 230 | 231 | t0 = time() 232 | G = convert_func(adjlist) 233 | t1 = time() 234 | 235 | logger.info('Converted edges to graph in {}s'.format(t1-t0)) 236 | 237 | if undirected: 238 | t0 = time() 239 | G = G.make_undirected() 240 | t1 = time() 241 | logger.info('Made graph undirected in {}s'.format(t1-t0)) 242 | 243 | return G 244 | 245 | 246 | def load_edgelist(file_, undirected=True): 247 | G = Graph() 248 | with open(file_) as f: 249 | for l in f: 250 | x, y = l.strip().split()[:2] 251 | x = int(x) 252 | y = int(y) 253 | G[x].append(y) 254 | if undirected: 255 | G[y].append(x) 256 | 257 | G.make_consistent() 258 | return G 259 | 260 | 261 | def load_matfile(file_, variable_name="network", undirected=True): 262 | mat_varables = loadmat(file_) 263 | mat_matrix = mat_varables[variable_name] 264 | 265 | return from_numpy(mat_matrix, 
undirected) 266 | 267 | 268 | def from_networkx(G_input, undirected=True): 269 | G = Graph() 270 | 271 | for _, x in enumerate(G_input): 272 | for y in iterkeys(G_input[x]): 273 | G[x].append(y) 274 | 275 | if undirected: 276 | G.make_undirected() 277 | 278 | return G 279 | 280 | 281 | def from_numpy(x, undirected=True): 282 | G = Graph() 283 | 284 | if issparse(x): 285 | cx = x.tocoo() 286 | for i,j,v in zip(cx.row, cx.col, cx.data): 287 | G[i].append(j) 288 | else: 289 | raise Exception("Dense matrices not yet supported.") 290 | 291 | if undirected: 292 | G.make_undirected() 293 | 294 | G.make_consistent() 295 | return G 296 | 297 | 298 | def from_adjlist(adjlist): 299 | G = Graph() 300 | 301 | for row in adjlist: 302 | node = row[0] 303 | neighbors = row[1:] 304 | G[node] = list(sorted(set(neighbors))) 305 | 306 | return G 307 | 308 | 309 | def from_adjlist_unchecked(adjlist): 310 | G = Graph() 311 | 312 | for row in adjlist: 313 | node = row[0] 314 | neighbors = row[1:] 315 | G[node] = neighbors 316 | 317 | return G 318 | 319 | 320 | -------------------------------------------------------------------------------- /gmatch4py/embedding/graph2vec.pyx: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import glob 4 | 5 | import pandas as pd 6 | import networkx as nx 7 | from tqdm import tqdm 8 | cimport numpy as np 9 | import numpy.distutils.system_info as sysinfo 10 | 11 | from joblib import Parallel, delayed 12 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 13 | from sklearn.metrics.pairwise import cosine_similarity 14 | 15 | from ..base cimport Base 16 | cimport cython 17 | 18 | 19 | class WeisfeilerLehmanMachine: 20 | """ 21 | Weisfeiler Lehman feature extractor class. 22 | """ 23 | def __init__(self, graph, features, iterations): 24 | """ 25 | Initialization method which executes feature extraction. 26 | 27 | Parameters 28 | ---------- 29 | graph : nx.Graph 30 | graph 31 | features : dict 32 | Feature hash table. 33 | iterations : int 34 | number of WL iteration 35 | 36 | """ 37 | 38 | self.iterations = iterations 39 | self.graph = graph 40 | self.features = features 41 | self.nodes = self.graph.nodes() 42 | self.extracted_features = [str(v) for k,v in features.items()] 43 | self.do_recursions() 44 | 45 | def do_a_recursion(self): 46 | """ 47 | The method does a single WL recursion. 48 | 49 | Returns 50 | ------- 51 | dict 52 | The hash table with extracted WL features. 53 | """ 54 | 55 | new_features = {} 56 | for node in self.nodes: 57 | nebs = self.graph.neighbors(node) 58 | degs = [self.features[neb] for neb in nebs] 59 | features = "_".join([str(self.features[node])]+list(set(sorted([str(deg) for deg in degs])))) 60 | hash_object = hashlib.md5(features.encode()) 61 | hashing = hash_object.hexdigest() 62 | new_features[node] = hashing 63 | self.extracted_features = self.extracted_features + list(new_features.values()) 64 | return new_features 65 | 66 | def do_recursions(self): 67 | """ 68 | The method does a series of WL recursions. 69 | """ 70 | for iteration in range(self.iterations): 71 | self.features = self.do_a_recursion() 72 | 73 | 74 | def dataset_reader(graph): 75 | """ 76 | Function to extract features from a networkx graph 77 | 78 | Parameters 79 | ---------- 80 | graph : nx.Graph 81 | graph 82 | 83 | Returns 84 | ------- 85 | dict 86 | Features hash table. 
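    For instance, on a 3-node path graph the extracted features are the node degrees:

    >>> g, feats = dataset_reader(nx.path_graph(3))
    >>> feats
    {0: 1, 1: 2, 2: 1}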
87 | """ 88 | 89 | features = dict(nx.degree(graph)) 90 | 91 | features = {k:v for k,v, in features.items()} 92 | return graph, features 93 | 94 | 95 | def feature_extractor(graph, ix, rounds): 96 | """ 97 | Function to extract WL features from a graph 98 | 99 | Parameters 100 | ---------- 101 | graph : nx.Graph 102 | graph 103 | ix : int 104 | index of the graph in the dataset 105 | rounds : int 106 | number of WL iterations 107 | 108 | Returns 109 | ------- 110 | TaggedDocument 111 | random walks 112 | """ 113 | 114 | graph, features = dataset_reader(graph) 115 | machine = WeisfeilerLehmanMachine(graph,features,rounds) 116 | doc = TaggedDocument(words = machine.extracted_features , tags = ["g_{0}".format(ix)]) 117 | return doc 118 | 119 | 120 | 121 | def generate_model(graphs, iteration = 2, dimensions = 64, min_count = 5, down_sampling = 0.0001, learning_rate = 0.0001, epochs = 10, workers = 4 ): 122 | """ 123 | Main function to read the graph list, extract features, learn the embedding and save it. 124 | 125 | Parameters 126 | ---------- 127 | graphs : nx.Graph 128 | Input graph 129 | iteration : int, optional 130 | number of iteration (the default is 2) 131 | dimensions : int, optional 132 | output vector dimension (the default is 64) 133 | min_count : int, optional 134 | min count parameter of Doc2vec model (the default is 5) 135 | down_sampling : float, optional 136 | Down sampling rate for frequent features. (the default is 0.0001) 137 | learning_rate : float, optional 138 | Initial learning rate (the default is 0.0001, which [default_description]) 139 | epochs : int, optional 140 | Number of epochs (the default is 10) 141 | workers : int, optional 142 | Number of workers (the default is 4) 143 | 144 | Returns 145 | ------- 146 | [type] 147 | [description] 148 | """ 149 | 150 | document_collections = Parallel(n_jobs = workers)(delayed(feature_extractor)(g, ix,iteration) for ix,g in tqdm(enumerate(graphs),desc="Extracting Features...")) 151 | graphs=[nx.relabel_nodes(g,{node:str(node) for node in list(g.nodes)},copy=True) for g in graphs] 152 | model = Doc2Vec(document_collections, 153 | vector_size = dimensions, 154 | window = 0, 155 | min_count = min_count, 156 | dm = 0, 157 | sample = down_sampling, 158 | workers = workers, 159 | epochs = epochs, 160 | alpha = learning_rate) 161 | return model 162 | 163 | cdef class Graph2Vec(Base): 164 | """ 165 | Based on : 166 | graph2vec: Learning distributed representations of graphs. 
167 | Narayanan, Annamalai and Chandramohan, Mahinthan and Venkatesan, Rajasekar and Chen, Lihui and Liu, Yang 168 | MLG 2017, 13th International Workshop on Mining and Learning with Graphs (MLGWorkshop 2017) 169 | 170 | Original Code : https://github.com/benedekrozemberczki/graph2vec 171 | 172 | Modified by : Jacques Fize 173 | """ 174 | 175 | def __init__(self): 176 | Base.__init__(self,0,False) 177 | 178 | @cython.boundscheck(False) 179 | cpdef np.ndarray compare(self,list listgs, list selected): 180 | # Selected is ignored 181 | model = generate_model(listgs) 182 | vector_matrix = model.docvecs.vectors_docs 183 | cs = cosine_similarity(vector_matrix) 184 | return cs 185 | -------------------------------------------------------------------------------- /gmatch4py/embedding/node2vec.pyx: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | cimport numpy as np 5 | from gensim.models import Word2Vec 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | from ..base cimport Base 9 | cimport cython 10 | from joblib import Parallel, delayed 11 | import networkx as nx 12 | 13 | class Graph(): 14 | def __init__(self, nx_G, is_directed, p, q): 15 | self.G = nx_G 16 | self.is_directed = is_directed 17 | self.p = p 18 | self.q = q 19 | 20 | def node2vec_walk(self, walk_length, start_node): 21 | ''' 22 | Simulate a random walk starting from start node. 23 | ''' 24 | G = self.G 25 | alias_nodes = self.alias_nodes 26 | alias_edges = self.alias_edges 27 | 28 | walk = [start_node] 29 | 30 | while len(walk) < walk_length: 31 | cur = walk[-1] 32 | cur_nbrs = sorted(G.neighbors(cur)) 33 | if len(cur_nbrs) > 0: 34 | if len(walk) == 1: 35 | walk.append( 36 | cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) 37 | else: 38 | prev = walk[-2] 39 | next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 40 | alias_edges[(prev, cur)][1])] 41 | walk.append(next) 42 | else: 43 | break 44 | 45 | return walk 46 | 47 | def simulate_walks(self, num_walks, walk_length): 48 | ''' 49 | Repeatedly simulate random walks from each node. 50 | ''' 51 | # sys.stdout.write("\r") 52 | G = self.G 53 | walks = [] 54 | nodes = list(G.nodes) 55 | for walk_iter in range(num_walks): 56 | # sys.stdout.write( 57 | # '\rWalk iteration: {0}/{1}'.format(walk_iter + 1, num_walks)) 58 | random.shuffle(nodes) 59 | for node in nodes: 60 | walks.append(self.node2vec_walk( 61 | walk_length=walk_length, start_node=node)) 62 | 63 | return walks 64 | 65 | def get_alias_edge(self, src, dst): 66 | ''' 67 | Get the alias edge setup lists for a given edge. 68 | ''' 69 | G = self.G 70 | p = self.p 71 | q = self.q 72 | 73 | unnormalized_probs = [] 74 | for dst_nbr in sorted(G.neighbors(dst)): 75 | if dst_nbr == src: 76 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) 77 | elif G.has_edge(dst_nbr, src): 78 | unnormalized_probs.append(G[dst][dst_nbr]['weight']) 79 | else: 80 | unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) 81 | norm_const = sum(unnormalized_probs) 82 | normalized_probs = [ 83 | float(u_prob) / norm_const for u_prob in unnormalized_probs] 84 | 85 | return alias_setup(normalized_probs) 86 | 87 | def preprocess_transition_probs(self): 88 | ''' 89 | Preprocessing of transition probabilities for guiding the random walks. 
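        Assumes every edge carries a 'weight' attribute; Node2Vec.compare() sets weight=1 on all edges before building this object.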
90 |         '''
91 |         G = self.G
92 |         is_directed = self.is_directed
93 | 
94 |         alias_nodes = {}
95 |         for node in list(G.nodes):
96 |             unnormalized_probs = [G[node][nbr]['weight']
97 |                                   for nbr in sorted(G.neighbors(node))]
98 |             norm_const = sum(unnormalized_probs)
99 |             normalized_probs = [
100 |                 float(u_prob) / norm_const for u_prob in unnormalized_probs]
101 |             alias_nodes[node] = alias_setup(normalized_probs)
102 | 
103 |         alias_edges = {}
104 |         triads = {}
105 | 
106 |         if is_directed:
107 |             for edge in list(G.edges()):
108 |                 alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
109 |         else:
110 |             for edge in list(G.edges()):
111 |                 alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
112 |                 alias_edges[(edge[1], edge[0])] = self.get_alias_edge(
113 |                     edge[1], edge[0])
114 | 
115 |         self.alias_nodes = alias_nodes
116 |         self.alias_edges = alias_edges
117 | 
118 |         return
119 | 
120 | 
121 | def alias_setup(probs):
122 |     '''
123 |     Compute utility lists for non-uniform sampling from discrete distributions.
124 |     Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
125 |     for details
126 |     '''
127 |     K = len(probs)
128 |     q = np.zeros(K)
129 |     J = np.zeros(K, dtype=int)
130 | 
131 |     smaller = []
132 |     larger = []
133 |     for kk, prob in enumerate(probs):
134 |         q[kk] = K * prob
135 |         if q[kk] < 1.0:
136 |             smaller.append(kk)
137 |         else:
138 |             larger.append(kk)
139 | 
140 |     while len(smaller) > 0 and len(larger) > 0:
141 |         small = smaller.pop()
142 |         large = larger.pop()
143 | 
144 |         J[small] = large
145 |         q[large] = q[large] + q[small] - 1.0
146 |         if q[large] < 1.0:
147 |             smaller.append(large)
148 |         else:
149 |             larger.append(large)
150 | 
151 |     return J, q
152 | 
153 | 
154 | def alias_draw(J, q):
155 |     '''
156 |     Draw sample from a non-uniform discrete distribution using alias sampling.
157 |     '''
158 |     K = len(J)
159 | 
160 |     kk = int(np.floor(np.random.rand() * K))
161 |     if np.random.rand() < q[kk]:
162 |         return kk
163 |     else:
164 |         return J[kk]
165 | 
166 | 
167 | def learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter):
168 |     '''
169 |     Learn embeddings by optimizing the Skipgram objective using SGD.
170 |     '''
171 |     walks_ = [list(map(str, walk)) for walk in walks]
172 |     model = Word2Vec(walks_, vector_size=dimensions, window=window_size,
173 |                      min_count=0, sg=1, workers=nb_workers, epochs=nb_iter)
174 |     return model
175 | 
176 | 
177 | def compute_graph_model(nx_graph, **kwargs):
178 |     '''
179 |     Pipeline for representational learning for all nodes in a graph.
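    Returns the raw node-embedding matrix (the underlying Word2Vec model's `wv.vectors`), one row per node.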
180 | @param nx_graph 181 | @kwarg p: int 182 | @kwarg q: int 183 | ''' 184 | p = kwargs.get("p", 1) 185 | q = kwargs.get("q", 1) 186 | dimensions = kwargs.get("dimensions", 128) 187 | window_size = kwargs.get("window_size", 10) 188 | nb_workers = kwargs.get("nb_workers", 8) 189 | nb_iter = kwargs.get("nb_iter", 1) 190 | num_walks = kwargs.get("num_walks", 10) 191 | walk_length = kwargs.get("walk_length", 80) 192 | directed = kwargs.get("directed", False) 193 | 194 | G = Graph(nx_graph, directed, p, q) 195 | G.preprocess_transition_probs() 196 | walks = G.simulate_walks(num_walks, walk_length) 197 | return learn_embeddings(walks, dimensions, window_size, nb_workers, nb_iter).wv.vectors 198 | 199 | cdef class Node2Vec(Base): 200 | """ 201 | Based on : 202 | Extract Node2vec embedding of each graph in `listgs` 203 | @inproceedings{Grover:2016:NSF:2939672.2939754, 204 | author = {Grover, Aditya and Leskovec, Jure}, 205 | title = {Node2Vec: Scalable Feature Learning for Networks}, 206 | booktitle = {Proceedings of the 22Nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, 207 | series = {KDD '16}, 208 | year = {2016}, 209 | isbn = {978-1-4503-4232-2}, 210 | location = {San Francisco, California, USA}, 211 | pages = {855--864}, 212 | numpages = {10}, 213 | url = {http://doi.acm.org/10.1145/2939672.2939754}, 214 | doi = {10.1145/2939672.2939754}, 215 | acmid = {2939754}, 216 | publisher = {ACM}, 217 | address = {New York, NY, USA}, 218 | keywords = {feature learning, graph representations, information networks, node embeddings}, 219 | } 220 | 221 | Original code : https://github.com/aditya-grover/node2vec 222 | 223 | Modified by : Jacques Fize 224 | """ 225 | 226 | def __init__(self): 227 | Base.__init__(self,0,False) 228 | 229 | def extract_embedding(self, listgs): 230 | """ 231 | Extract Node2vec embedding of each graph in `listgs` 232 | 233 | Parameters 234 | ---------- 235 | listgs : list 236 | list of graphs 237 | 238 | Returns 239 | ------- 240 | list 241 | list of embeddings 242 | """ 243 | 244 | from tqdm import tqdm 245 | models = Parallel(n_jobs = self.cpu_count)(delayed(compute_graph_model)(g,directed=g.is_directed()) for g in tqdm(listgs,desc="Extracting Embeddings...")) 246 | return models 247 | 248 | @cython.boundscheck(False) 249 | cpdef np.ndarray compare(self,list listgs, list selected): 250 | # Selected is ignored 251 | [nx.set_edge_attributes(g,1,'weight') for g in listgs] 252 | models = self.extract_embedding(listgs) 253 | vector_matrix = np.array([mod.mean(axis=0) for mod in models]) # Average nodes representations 254 | cs = cosine_similarity(vector_matrix) 255 | return cs 256 | -------------------------------------------------------------------------------- /gmatch4py/embedding/skipgram.pyx: -------------------------------------------------------------------------------- 1 | import logging 2 | from multiprocessing import cpu_count 3 | 4 | from gensim.models import Word2Vec 5 | 6 | logger = logging.getLogger("deepwalk") 7 | 8 | class Skipgram(Word2Vec): 9 | """A subclass to allow more customization of the Word2Vec internals.""" 10 | 11 | def __init__(self, vocabulary_counts=None, **kwargs): 12 | 13 | self.vocabulary_counts = None 14 | 15 | kwargs["min_count"] = kwargs.get("min_count", 0) 16 | kwargs["workers"] = kwargs.get("workers", cpu_count()) 17 | kwargs["size"] = kwargs.get("size", 128) 18 | kwargs["sentences"] = kwargs.get("sentences", None) 19 | kwargs["window"] = kwargs.get("window", 10) 20 | kwargs["sg"] = 1 21 | kwargs["hs"] = 1 22 | 23 
| if vocabulary_counts != None: 24 | self.vocabulary_counts = vocabulary_counts 25 | 26 | super(Skipgram, self).__init__(**kwargs) 27 | -------------------------------------------------------------------------------- /gmatch4py/embedding/walks.pyx: -------------------------------------------------------------------------------- 1 | import logging 2 | from io import open 3 | from os import path 4 | from time import time 5 | from multiprocessing import cpu_count 6 | import random 7 | from concurrent.futures import ProcessPoolExecutor 8 | from collections import Counter 9 | 10 | from six.moves import zip 11 | 12 | from . import graph 13 | 14 | logger = logging.getLogger("deepwalk") 15 | 16 | __current_graph = None 17 | 18 | # speed up the string encoding 19 | __vertex2str = None 20 | 21 | def count_words(file): 22 | """ Counts the word frequences in a list of sentences. 23 | 24 | Note: 25 | This is a helper function for parallel execution of `Vocabulary.from_text` 26 | method. 27 | """ 28 | c = Counter() 29 | with open(file, 'r') as f: 30 | for l in f: 31 | words = l.strip().split() 32 | c.update(words) 33 | return c 34 | 35 | 36 | def count_textfiles(files, workers=1): 37 | c = Counter() 38 | with ProcessPoolExecutor(max_workers=workers) as executor: 39 | for c_ in executor.map(count_words, files): 40 | c.update(c_) 41 | return c 42 | 43 | 44 | def count_lines(f): 45 | if path.isfile(f): 46 | num_lines = sum(1 for line in open(f)) 47 | return num_lines 48 | else: 49 | return 0 50 | 51 | def _write_walks_to_disk(args): 52 | num_paths, path_length, alpha, rand, f = args 53 | G = __current_graph 54 | t_0 = time() 55 | with open(f, 'w') as fout: 56 | for walk in graph.build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length, 57 | alpha=alpha, rand=rand): 58 | fout.write(u"{}\n".format(u" ".join(v for v in walk))) 59 | logger.debug("Generated new file {}, it took {} seconds".format(f, time() - t_0)) 60 | return f 61 | 62 | def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(), 63 | always_rebuild=True): 64 | global __current_graph 65 | __current_graph = G 66 | files_list = ["{}.{}".format(filebase, str(x)) for x in list(range(num_paths))] 67 | expected_size = len(G) 68 | args_list = [] 69 | files = [] 70 | 71 | if num_paths <= num_workers: 72 | paths_per_worker = [1 for x in range(num_paths)] 73 | else: 74 | paths_per_worker = [len(list(filter(lambda z: z!= None, [y for y in x]))) 75 | for x in graph.grouper(int(num_paths / num_workers)+1, range(1, num_paths+1))] 76 | 77 | with ProcessPoolExecutor(max_workers=num_workers) as executor: 78 | for size, file_, ppw in zip(executor.map(count_lines, files_list), files_list, paths_per_worker): 79 | if always_rebuild or size != (ppw*expected_size): 80 | args_list.append((ppw, path_length, alpha, random.Random(rand.randint(0, 2**31)), file_)) 81 | else: 82 | files.append(file_) 83 | 84 | with ProcessPoolExecutor(max_workers=num_workers) as executor: 85 | for file_ in executor.map(_write_walks_to_disk, args_list): 86 | files.append(file_) 87 | 88 | return files 89 | 90 | class WalksCorpus(object): 91 | def __init__(self, file_list): 92 | self.file_list = file_list 93 | def __iter__(self): 94 | for file in self.file_list: 95 | with open(file, 'r') as f: 96 | for line in f: 97 | yield line.split() 98 | 99 | def combine_files_iter(file_list): 100 | for file in file_list: 101 | with open(file, 'r') as f: 102 | for line in f: 103 | yield line.split() 104 | 
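A minimal sketch of how these helpers chain together when walks are spilled to disk, mirroring the large-corpus branch of `deepwalk.pyx`; the edge-list path and file base name are placeholders:

```python
import random
from gmatch4py.embedding import graph, walks

# Build the lightweight adjacency-list Graph from an edge list (placeholder path)
G = graph.load_edgelist("edges.txt", undirected=True)

# Write one walk file per pass over the nodes, then count vertex frequencies
files = walks.write_walks_to_disk(G, "temp.walks", num_paths=10,
                                  path_length=40, alpha=0,
                                  rand=random.Random(0), num_workers=2)
freqs = walks.count_textfiles(files, workers=2)
corpus = walks.WalksCorpus(files)   # iterable of sentences for Word2Vec/Skipgram
```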
-------------------------------------------------------------------------------- /gmatch4py/ged/__init__.py: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | -------------------------------------------------------------------------------- /gmatch4py/ged/abstract_graph_edit_dist.pxd: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | from ..base cimport Base 4 | 5 | cdef class AbstractGraphEditDistance(Base): 6 | cdef double node_del 7 | cdef double node_ins 8 | cdef double edge_del 9 | cdef double edge_ins 10 | cdef np.ndarray cost_matrix 11 | cdef bint weighted 12 | 13 | cpdef double distance_ged(self,G,H) 14 | cdef list edit_costs(self,G,H) 15 | cpdef np.ndarray create_cost_matrix(self,G,H) 16 | cdef double insert_cost(self, int i, int j, nodesH, H) 17 | cdef double delete_cost(self, int i, int j, nodesG, G) 18 | cpdef double substitute_cost(self, node1, node2, G, H) 19 | 20 | -------------------------------------------------------------------------------- /gmatch4py/ged/abstract_graph_edit_dist.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from __future__ import print_function 3 | 4 | import sys 5 | import warnings 6 | 7 | import numpy as np 8 | cimport numpy as np 9 | import networkx as nx 10 | from cython.parallel cimport prange,parallel 11 | 12 | try: 13 | from munkres import munkres 14 | except ImportError: 15 | warnings.warn("To obtain optimal results install the Cython 'munkres' module at https://github.com/jfrelinger/cython-munkres-wrapper") 16 | from scipy.optimize import linear_sum_assignment as munkres 17 | 18 | from ..base cimport Base 19 | from ..helpers.general import parsenx2graph 20 | 21 | 22 | 23 | cdef class AbstractGraphEditDistance(Base): 24 | 25 | 26 | def __init__(self, node_del,node_ins,edge_del,edge_ins): 27 | Base.__init__(self,1,False) 28 | 29 | self.node_del = node_del 30 | self.node_ins = node_ins 31 | self.edge_del = edge_del 32 | self.edge_ins = edge_ins 33 | 34 | 35 | cpdef double distance_ged(self,G,H): 36 | """ 37 | Return the distance value between G and H 38 | 39 | Parameters 40 | ---------- 41 | G : gmatch4py.Graph 42 | graph 43 | H : gmatch4py.Graph 44 | graph 45 | 46 | Returns 47 | ------- 48 | int 49 | distance 50 | """ 51 | cdef list opt_path = self.edit_costs(G,H) 52 | return np.sum(opt_path) 53 | 54 | def edit_path(self,G,H): 55 | """ 56 | Return the edit path along with the cost matrix and the selected indices from the Munkres Algorithm 57 | 58 | Parameters 59 | ---------- 60 | G : nx.Graph 61 | first graph 62 | H : nx.Graph 63 | second graph 64 | 65 | Returns 66 | ------- 67 | np.array(1D), np.array(2D), (np.array(2D) if munkres) or (np.array(1,2) if scipy) 68 | edit_path, cost_matrix, munkres results 69 | """ 70 | cost_matrix = self.create_cost_matrix(G,H).astype(float) 71 | index_path= munkres(cost_matrix) 72 | return cost_matrix[index_path], cost_matrix, index_path 73 | 74 | 75 | cdef list edit_costs(self, G, H): 76 | """ 77 | Return the optimal path edit cost list, to transform G into H 78 | 79 | Parameters 80 | ---------- 81 | G : gmatch4py.Graph 82 | graph 83 | H : gmatch4py.Graph 84 | graph 85 | 86 | Returns 87 | ------- 88 | np.array 89 | edit path 90 | """ 91 | cdef np.ndarray cost_matrix = self.create_cost_matrix(G,H).astype(float) 92 | return cost_matrix[munkres(cost_matrix)].tolist() 93 | 94 | cpdef np.ndarray 
create_cost_matrix(self, G, H): 95 | """ 96 | Creates a |N+M| X |N+M| cost matrix between all nodes in 97 | graphs G and H 98 | Each cost represents the cost of substituting, 99 | deleting or inserting a node 100 | The cost matrix consists of four regions: 101 | 102 | substitute | insert costs 103 | ------------------------------- 104 | delete | delete -> delete 105 | 106 | The delete -> delete region is filled with zeros 107 | 108 | Parameters 109 | ---------- 110 | G : gmatch4py.Graph 111 | graph 112 | H : gmatch4py.Graph 113 | graph 114 | 115 | Returns 116 | ------- 117 | np.array 118 | cost matrix 119 | """ 120 | cdef int n,m 121 | try: 122 | n = G.number_of_nodes() 123 | m = H.number_of_nodes() 124 | except: 125 | n = G.size() 126 | m = H.size() 127 | cdef np.ndarray cost_matrix = np.zeros((n+m,n+m)) 128 | cdef list nodes1 = list(G.nodes()) 129 | cdef list nodes2 = list(H.nodes()) 130 | cdef int i,j 131 | for i in range(n): 132 | for j in range(m): 133 | cost_matrix[i,j] = self.substitute_cost(nodes1[i], nodes2[j], G, H) 134 | 135 | for i in range(m): 136 | for j in range(m): 137 | cost_matrix[i+n,j] = self.insert_cost(i, j, nodes2, H) 138 | 139 | for i in range(n): 140 | for j in range(n): 141 | cost_matrix[j,i+m] = self.delete_cost(i, j, nodes1, G) 142 | 143 | return cost_matrix 144 | 145 | cdef double insert_cost(self, int i, int j, nodesH, H): 146 | """ 147 | Return the insert cost of the ith nodes in H 148 | 149 | Returns 150 | ------- 151 | int 152 | insert cost 153 | """ 154 | raise NotImplementedError 155 | 156 | cdef double delete_cost(self, int i, int j, nodesG, G): 157 | """ 158 | Return the delete cost of the ith nodes in H 159 | 160 | Returns 161 | ------- 162 | int 163 | delete cost 164 | """ 165 | raise NotImplementedError 166 | 167 | cpdef double substitute_cost(self, node1, node2, G, H): 168 | """ 169 | Return the substitute cost of between the node1 in G and the node2 in H 170 | 171 | Returns 172 | ------- 173 | int 174 | substitution cost 175 | """ 176 | raise NotImplementedError 177 | 178 | 179 | cpdef np.ndarray compare(self,list listgs, list selected): 180 | cdef int n = len(listgs) 181 | cdef double[:,:] comparison_matrix = np.zeros((n, n)) 182 | listgs=parsenx2graph(listgs,self.node_attr_key,self.edge_attr_key) 183 | cdef long[:] n_nodes = np.array([g.size() for g in listgs]) 184 | cdef double[:] selected_test = np.array(self.get_selected_array(selected,n)) 185 | cdef int i,j 186 | cdef float inf=np.inf 187 | 188 | with nogil, parallel(num_threads=self.cpu_count): 189 | for i in prange(n,schedule='static'): 190 | for j in range(n): 191 | if n_nodes[i]>0 and n_nodes[j]>0 and selected_test[i] == 1 : 192 | with gil: 193 | comparison_matrix[i][j] = self.distance_ged(listgs[i],listgs[j]) 194 | else: 195 | comparison_matrix[i][j] = inf 196 | #comparison_matrix[j, i] = comparison_matrix[i, j] 197 | return np.array(comparison_matrix) 198 | -------------------------------------------------------------------------------- /gmatch4py/ged/bipartite_graph_matching_2.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | import numpy as np 3 | cimport numpy as np 4 | from ..base cimport Base 5 | from cython.parallel cimport prange,parallel 6 | from ..helpers.general import parsenx2graph 7 | cimport cython 8 | 9 | cdef class BP_2(Base): 10 | 11 | 12 | cdef int node_del 13 | cdef int node_ins 14 | cdef int edge_del 15 | cdef int edge_ins 16 | 17 | def __init__(self, int node_del=1, int node_ins=1, int edge_del=1, int 
edge_ins=1): 18 | """ 19 | BP_2 Constructor 20 | 21 | Parameters 22 | ---------- 23 | node_del :int 24 | Node deletion cost 25 | node_ins : int 26 | Node insertion cost 27 | edge_del : int 28 | Edge Deletion cost 29 | edge_ins : int 30 | Edge Insertion cost 31 | """ 32 | Base.__init__(self,1,False) 33 | self.node_del = node_del 34 | self.node_ins = node_ins 35 | self.edge_del = edge_del 36 | self.edge_ins = edge_ins 37 | 38 | 39 | @cython.boundscheck(False) 40 | cpdef np.ndarray compare(self,list listgs, list selected): 41 | cdef int n = len(listgs) 42 | cdef list new_gs=parsenx2graph(listgs) 43 | cdef double[:,:] comparison_matrix = np.zeros((n, n)) 44 | cdef double[:] selected_test = self.get_selected_array(selected,n) 45 | cdef int i,j 46 | cdef long[:] n_nodes = np.array([g.size() for g in new_gs]) 47 | cdef long[:] n_edges = np.array([g.density() for g in new_gs]) 48 | 49 | with nogil, parallel(num_threads=self.cpu_count): 50 | for i in prange(n,schedule='static'): 51 | for j in range(i,n): 52 | if n_nodes[i] > 0 and n_nodes[j] > 0 and selected_test[i] == 1: 53 | with gil: 54 | comparison_matrix[i, j] = self.bp2(new_gs[i], new_gs[j]) 55 | else: 56 | comparison_matrix[i, j] = 0 57 | comparison_matrix[j, i] = comparison_matrix[i, j] 58 | 59 | return np.array(comparison_matrix) 60 | 61 | 62 | cdef double bp2(self, g1, g2): 63 | """ 64 | Compute the BP2 similarity value between two `networkx.Graph` 65 | 66 | Parameters 67 | ---------- 68 | g1 : gmatch4py.Graph 69 | First Graph 70 | g2 : gmatch4py.Graph 71 | Second Graph 72 | 73 | Returns 74 | ------- 75 | float 76 | similarity value 77 | """ 78 | return np.min([self.distance_bp2(self.psi(g1,g2)),self.distance_bp2(self.psi(g2,g1))]) 79 | 80 | cdef double distance_bp2(self,e): 81 | """ 82 | Return the distance based on the edit path found. 83 | Parameters 84 | ---------- 85 | e : list 86 | Contains the edit path costs 87 | 88 | Returns 89 | ------- 90 | double 91 | Return sum of the costs from the edit path 92 | """ 93 | return np.sum(e) 94 | 95 | cdef list psi(self,g1,g2): 96 | """ 97 | Return the optimal edit path :math:`\psi` based on BP2 algorithm. 98 | 99 | 100 | Parameters 101 | ---------- 102 | g1 : networkx.Graph 103 | First Graph 104 | g2 : networkx.Graph 105 | Second Graph 106 | 107 | Returns 108 | ------- 109 | list 110 | list containing costs from the optimal edit path 111 | """ 112 | cdef list psi_=[] 113 | cdef list nodes1 = list(g1.nodes()) 114 | cdef list nodes2 = list(g2.nodes()) 115 | for u in nodes1: 116 | v=None 117 | for w in nodes2: 118 | if 2*self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,None) + self.fuv(g1,g2,None,w)\ 119 | and self.fuv(g1,g2,u,w) < self.fuv(g1,g2,u,v): 120 | v=w 121 | psi_.append(self.fuv(g1,g2,u,v)) 122 | if u: 123 | nodes1= list(set(nodes1).difference(set([u]))) 124 | if v: 125 | nodes2= list(set(nodes2).difference(set([v]))) 126 | for v in nodes2: 127 | psi_.append(self.fuv(g1,g2,None,v)) 128 | return psi_ 129 | 130 | 131 | 132 | cdef float fuv(self, g1, g2, str n1, str n2): 133 | """ 134 | Compute the Node Distance function 135 | Parameters 136 | ---------- 137 | g1 : gmatch4py.Graph 138 | First graph 139 | g2 : gmatch4py.Graph 140 | Second graph 141 | n1 : int or str 142 | identifier of the first node 143 | n2 : int or str 144 | identifier of the second node 145 | 146 | Returns 147 | ------- 148 | float 149 | node distance 150 | """ 151 | if n2 == None: # Del 152 | return self.node_del + ((self.edge_del / 2.) 
* g1.degree(n1))
153 |         if n1 == None:  # Insert
154 |             return self.node_ins + ((self.edge_ins / 2.) * g2.degree(n2))
155 |         else:
156 |             if n1 == n2:
157 |                 return 0
158 |         return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2
159 | 
160 |     cdef float hed_edge(self, g1, g2, str n1, str n2):
161 |         """
162 |         Compute HEDistance between edges of n1 and n2, respectively in g1 and g2
163 |         Parameters
164 |         ----------
165 |         g1 : gmatch4py.Graph
166 |             First graph
167 |         g2 : gmatch4py.Graph
168 |             Second graph
169 |         n1 : int or str
170 |             identifier of the first node
171 |         n2 : int or str
172 |             identifier of the second node
173 | 
174 |         Returns
175 |         -------
176 |         float
177 |             HEDistance between g1 and g2
178 |         """
179 |         return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g2, n2, g1, n1) # sum over both directions (the original summed the same direction twice)
180 | 
181 | 
182 |     cdef float sum_gpq(self, g1, str n1, g2, str n2):
183 |         """
184 |         Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2
185 |         Parameters
186 |         ----------
187 |         g1 : gmatch4py.Graph
188 |             First graph
189 |         g2 : gmatch4py.Graph
190 |             Second graph
191 |         n1 : int or str
192 |             identifier of the first node
193 |         n2 : int or str
194 |             identifier of the second node
195 | 
196 |         Returns
197 |         -------
198 |         float
199 |             Nearest Neighbour Distance
200 |         """
201 | 
202 |         #if isinstance(g1, nx.MultiDiGraph):
203 |         cdef list edges1 = g1.get_edges_no(n1) if n1 else []
204 |         cdef list edges2 = g2.get_edges_no(n2) if n2 else []
205 | 
206 |         cdef np.ndarray min_sum = np.zeros(len(edges1))
207 |         edges2.extend([None])
208 |         cdef np.ndarray min_i
209 |         for i in range(len(edges1)):
210 |             min_i = np.zeros(len(edges2))
211 |             for j in range(len(edges2)):
212 |                 min_i[j] = self.gpq(edges1[i], edges2[j])
213 |             min_sum[i] = np.min(min_i)
214 |         return np.sum(min_sum)
215 | 
216 |     cdef float gpq(self, str e1, str e2):
217 |         """
218 |         Compute the edge distance function
219 |         Parameters
220 |         ----------
221 |         e1 : str
222 |             first edge identifier
223 |         e2 : str
224 |             second edge identifier
225 |         Returns
226 |         -------
227 |         float
228 |             edge distance
229 |         """
230 | 
231 |         if e2 == None:  # Del
232 |             return self.edge_del
233 |         if e1 == None:  # Insert
234 |             return self.edge_ins
235 |         else:
236 |             if e1 == e2:
237 |                 return 0
238 |         return (self.edge_del + self.edge_ins) / 2.
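239 | 
240 | # --- Illustrative usage sketch (added; not part of the original module) ---
241 | # Comparing NetworkX graphs with BP_2 and unit edit costs. The graphs below
242 | # are arbitrary examples, and BP_2 is assumed to be exported at the
243 | # gmatch4py package level.
244 | #
245 | # import networkx as nx
246 | # import gmatch4py as gm
247 | #
248 | # graphs = [nx.complete_bipartite_graph(5, 4), nx.complete_bipartite_graph(6, 4)]
249 | # bp2 = gm.BP_2(node_del=1, node_ins=1, edge_del=1, edge_ins=1)
250 | # result = bp2.compare(graphs, None)  # None = compare every pair of graphs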
--------------------------------------------------------------------------------
/gmatch4py/ged/graph_edit_dist.pxd:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | cimport numpy as np
3 | from .abstract_graph_edit_dist cimport AbstractGraphEditDistance
4 | 
5 | 
6 | cdef class GraphEditDistance(AbstractGraphEditDistance):
7 |     cpdef object relabel_cost(self, node1, node2, G, H)
8 |     cpdef double substitute_cost(self, node1, node2, G, H)
9 |     cdef double delete_cost(self, int i, int j, nodesG, G)
10 |     cdef double insert_cost(self, int i, int j, nodesH, H)
--------------------------------------------------------------------------------
/gmatch4py/ged/graph_edit_dist.pyx:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | 
3 | import sys
4 | 
5 | import networkx as nx
6 | import numpy as np
7 | cimport numpy as np
8 | from .abstract_graph_edit_dist cimport AbstractGraphEditDistance
9 | 
10 | 
11 | 
12 | cdef class GraphEditDistance(AbstractGraphEditDistance):
13 | 
14 |     def __init__(self,node_del,node_ins,edge_del,edge_ins,weighted=False):
15 |         AbstractGraphEditDistance.__init__(self,node_del,node_ins,edge_del,edge_ins)
16 |         self.weighted=weighted
17 | 
18 |     cpdef double substitute_cost(self, node1, node2, G, H):
19 |         return self.relabel_cost(node1, node2, G, H)
20 | 
21 |     cpdef object relabel_cost(self, node1, node2, G, H):
22 |         ## If the two nodes are equal
23 |         if node1 == node2 and G.degree(node1) == H.degree(node2):
24 |             return 0.0
25 |         elif node1 == node2 and G.degree(node1) != H.degree(node2):
26 |             #R = Graph(self.add_edges(node1,node2,G),G.get_node_key(),G.get_egde_key())
27 |             #R2 = Graph(self.add_edges(node1,node2,H),H.get_node_key(),H.get_egde_key())
28 |             #inter_= R.size_edge_intersect(R2)
29 |             R=set(G.get_edges_no(node1))
30 |             R2=set(H.get_edges_no(node2))
31 |             inter_=R.intersection(R2)
32 |             add_diff=abs(len(R2)-len(inter_))#abs(R2.density()-inter_)
33 |             del_diff=abs(len(R)-len(inter_))#abs(R.density()-inter_)
34 |             return (add_diff*self.edge_ins)+(del_diff*self.edge_del)
35 | 
36 | 
37 |         # if the two nodes are connected
38 |         if G.has_edge(node1,node2) or G.has_edge(node2,node1):
39 |             return self.node_ins+self.node_del
40 |         if not node2 in G.nodes():
41 |             nodesH=H.nodes()
42 |             index=list(nodesH).index(node2)
43 |             return self.node_del+self.node_ins+self.insert_cost(index,index,nodesH,H)
44 |         return sys.maxsize
45 | 
46 |     cdef double delete_cost(self, int i, int j, nodesG, G):
47 |         if i == j:
48 |             return self.node_del+(G.degree(nodesG[i],weight=True)*self.edge_del) # Deleting a node implies deleting its in and out edges
49 |         return sys.maxsize
50 | 
51 |     cdef double insert_cost(self, int i, int j, nodesH, H):
52 |         if i == j:
53 |             deg=H.degree(nodesH[j],weight=True)
54 |             if isinstance(deg,dict):deg=0
55 |             return self.node_ins+(deg*self.edge_ins)
56 |         else:
57 |             return sys.maxsize
--------------------------------------------------------------------------------
/gmatch4py/ged/greedy_edit_distance.pyx:
--------------------------------------------------------------------------------
1 | # coding = utf-8
2 | import sys
3 | 
4 | from .graph_edit_dist cimport GraphEditDistance
5 | import numpy as np
6 | cimport numpy as np
7 | from cython.parallel cimport prange,parallel
8 | 
9 | cdef class GreedyEditDistance(GraphEditDistance):
10 |     """
11 |     Implementation of the Greedy Edit Distance presented in :
12 | 
13 |     Improved quadratic time approximation of graph edit distance by Hausdorff matching and
greedy assignment
14 |     Andreas Fischer, Kaspar Riesen, Horst Bunke
15 |     2016
16 |     """
17 | 
18 |     def __init__(self,node_del,node_ins,edge_del,edge_ins):
19 |         GraphEditDistance.__init__(self,node_del,node_ins,edge_del,edge_ins)
20 | 
21 | 
22 |     cdef list edit_costs(self, G, H):
23 |         cdef np.ndarray cost_matrix=self.create_cost_matrix(G,H)
24 |         cdef np.ndarray cost_matrix_2=cost_matrix.copy().astype(np.double)
25 |         cdef list psi=[]
26 |         for i in range(len(cost_matrix)):
27 |             phi_i=np.argmin(cost_matrix_2[i])
28 |             cost_matrix_2[:,phi_i]=sys.maxsize # mask the assigned column so it cannot be selected again
29 |             psi.append([i,phi_i])
30 |         return [cost_matrix[psi[i][0]][psi[i][1]] for i in range(len(psi))]
--------------------------------------------------------------------------------
/gmatch4py/ged/hausdorff_edit_distance.pyx:
--------------------------------------------------------------------------------
1 | # coding = utf-8
2 | 
3 | import numpy as np
4 | cimport numpy as np
5 | from ..base cimport Base
6 | from cython.parallel cimport prange,parallel
7 | from ..helpers.general import parsenx2graph
8 | cimport cython
9 | 
10 | cdef class HED(Base):
11 |     """
12 |     Implementation of Hausdorff Edit Distance described in
13 | 
14 |     Improved quadratic time approximation of graph edit distance by Hausdorff matching and greedy assignment
15 |     Andreas Fischer, Kaspar Riesen, Horst Bunke
16 |     2016
17 |     """
18 | 
19 |     cdef int node_del
20 |     cdef int node_ins
21 |     cdef int edge_del
22 |     cdef int edge_ins
23 | 
24 |     def __init__(self, int node_del=1, int node_ins=1, int edge_del=1, int edge_ins=1):
25 |         """
26 |         HED Constructor
27 | 
28 |         Parameters
29 |         ----------
30 |         node_del : int
31 |             Node deletion cost
32 |         node_ins : int
33 |             Node insertion cost
34 |         edge_del : int
35 |             Edge deletion cost
36 |         edge_ins : int
37 |             Edge insertion cost
38 |         """
39 |         Base.__init__(self,1,False)
40 |         self.node_del = node_del
41 |         self.node_ins = node_ins
42 |         self.edge_del = edge_del
43 |         self.edge_ins = edge_ins
44 | 
45 | 
46 |     @cython.boundscheck(False)
47 |     cpdef np.ndarray compare(self,list listgs, list selected):
48 |         cdef int n = len(listgs)
49 |         cdef list new_gs=parsenx2graph(listgs,self.node_attr_key,self.edge_attr_key)
50 |         cdef double[:,:] comparison_matrix = np.zeros((n, n))
51 |         cdef double[:] selected_test = np.array(self.get_selected_array(selected,n))
52 |         cdef int i,j
53 |         cdef long[:] n_nodes = np.array([g.size() for g in new_gs])
54 |         cdef long[:] n_edges = np.array([g.density() for g in new_gs])
55 | 
56 |         with nogil, parallel(num_threads=self.cpu_count):
57 |             for i in prange(n,schedule='static'):
58 |                 for j in range(i,n):
59 |                     if n_nodes[i] > 0 and n_nodes[j] > 0 and selected_test[i] == 1:
60 |                         with gil:
61 |                             comparison_matrix[i, j] = self.hed(new_gs[i], new_gs[j])
62 |                     else:
63 |                         comparison_matrix[i, j] = 0
64 |                     comparison_matrix[j, i] = comparison_matrix[i, j]
65 | 
66 |         return np.array(comparison_matrix)
67 | 
68 | 
69 |     cdef float hed(self, g1, g2):
70 |         """
71 |         Compute the HED similarity value between two `gmatch4py.Graph`
72 | 
73 |         Parameters
74 |         ----------
75 |         g1 : gmatch4py.Graph
76 |             First Graph
77 |         g2 : gmatch4py.Graph
78 |             Second Graph
79 | 
80 |         Returns
81 |         -------
82 |         float
83 |             similarity value
84 |         """
85 |         return self.sum_fuv(g1, g2) + self.sum_fuv(g2, g1)
86 | 
87 |     cdef float sum_fuv(self, g1, g2):
88 |         """
89 |         Compute Nearest Neighbour Distance between G1 and G2
90 |         Parameters
91 |         ----------
92 |         g1 : gmatch4py.Graph
93 |             First graph
94 |         g2 : gmatch4py.Graph
95 |             Second graph
96 | 
Returns
98 |         -------
99 |         float
100 |             Nearest Neighbour Distance
101 |         """
102 | 
103 |         cdef np.ndarray min_sum = np.zeros(g1.size())
104 |         cdef list nodes1 = list(g1.nodes())
105 |         cdef list nodes2 = list(g2.nodes())
106 |         nodes2.extend([None])
107 |         cdef np.ndarray min_i
108 |         for i in range(g1.size()):
109 |             min_i = np.zeros(len(nodes2))
110 |             for j in range(len(nodes2)): # len(nodes2) includes the deletion option (None); the original stopped at g2.size() and never considered it
111 |                 min_i[j] = self.fuv(g1, g2, nodes1[i], nodes2[j])
112 |             min_sum[i] = np.min(min_i)
113 |         return np.sum(min_sum)
114 | 
115 |     cdef float fuv(self, g1, g2, str n1, str n2):
116 |         """
117 |         Compute the Node Distance function
118 |         Parameters
119 |         ----------
120 |         g1 : gmatch4py.Graph
121 |             First graph
122 |         g2 : gmatch4py.Graph
123 |             Second graph
124 |         n1 : int or str
125 |             identifier of the first node
126 |         n2 : int or str
127 |             identifier of the second node
128 | 
129 |         Returns
130 |         -------
131 |         float
132 |             node distance
133 |         """
134 |         if n2 == None:  # Del
135 |             return self.node_del + ((self.edge_del / 2.) * g1.degree(n1))
136 |         if n1 == None:  # Insert
137 |             return self.node_ins + ((self.edge_ins / 2.) * g2.degree(n2))
138 |         else:
139 |             if n1 == n2:
140 |                 return 0
141 |         return (self.node_del + self.node_ins + self.hed_edge(g1, g2, n1, n2)) / 2
142 | 
143 |     cdef float hed_edge(self, g1, g2, str n1, str n2):
144 |         """
145 |         Compute HEDistance between edges of n1 and n2, respectively in g1 and g2
146 |         Parameters
147 |         ----------
148 |         g1 : gmatch4py.Graph
149 |             First graph
150 |         g2 : gmatch4py.Graph
151 |             Second graph
152 |         n1 : int or str
153 |             identifier of the first node
154 |         n2 : int or str
155 |             identifier of the second node
156 | 
157 |         Returns
158 |         -------
159 |         float
160 |             HEDistance between g1 and g2
161 |         """
162 |         return self.sum_gpq(g1, n1, g2, n2) + self.sum_gpq(g2, n2, g1, n1) # sum over both directions (the original summed the same direction twice)
163 | 
164 | 
165 |     cdef float sum_gpq(self, g1, str n1, g2, str n2):
166 |         """
167 |         Compute Nearest Neighbour Distance between edges around n1 in G1 and edges around n2 in G2
168 |         Parameters
169 |         ----------
170 |         g1 : gmatch4py.Graph
171 |             First graph
172 |         g2 : gmatch4py.Graph
173 |             Second graph
174 |         n1 : int or str
175 |             identifier of the first node
176 |         n2 : int or str
177 |             identifier of the second node
178 | 
179 |         Returns
180 |         -------
181 |         float
182 |             Nearest Neighbour Distance
183 |         """
184 | 
185 |         #if isinstance(g1, nx.MultiDiGraph):
186 |         cdef list edges1 = g1.get_edges_no(n1) if n1 else [] # rename method ...
187 |         cdef list edges2 = g2.get_edges_no(n2) if n2 else []
188 | 
189 |         cdef np.ndarray min_sum = np.zeros(len(edges1))
190 |         edges2.extend([None])
191 |         cdef np.ndarray min_i
192 |         for i in range(len(edges1)):
193 |             min_i = np.zeros(len(edges2))
194 |             for j in range(len(edges2)):
195 |                 min_i[j] = self.gpq(edges1[i], edges2[j])
196 |             min_sum[i] = np.min(min_i)
197 |         return np.sum(min_sum)
198 | 
199 |     cdef float gpq(self, str e1, str e2):
200 |         """
201 |         Compute the edge distance function
202 |         Parameters
203 |         ----------
204 |         e1 : str
205 |             first edge identifier
206 |         e2 : str
207 |             second edge identifier
208 |         Returns
209 |         -------
210 |         float
211 |             edge distance
212 |         """
213 |         if e2 == None:  # Del
214 |             return self.edge_del
215 |         if e1 == None:  # Insert
216 |             return self.edge_ins
217 |         else:
218 |             if e1 == e2:
219 |                 return 0
220 |         return (self.edge_del + self.edge_ins) / 2.
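221 | 
222 | # --- Illustrative usage sketch (added; not part of the original module) ---
223 | # HED produces an approximate graph edit distance matrix: lower values mean
224 | # more similar graphs. The example graphs are arbitrary, and HED is assumed
225 | # to be exported at the gmatch4py package level.
226 | #
227 | # import networkx as nx
228 | # import gmatch4py as gm
229 | #
230 | # graphs = [nx.path_graph(4), nx.cycle_graph(4), nx.star_graph(3)]
231 | # hed = gm.HED(node_del=1, node_ins=1, edge_del=1, edge_ins=1)
232 | # distances = hed.compare(graphs, None)  # n x n matrix of HED values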
--------------------------------------------------------------------------------
/gmatch4py/graph.pxd:
--------------------------------------------------------------------------------
1 | cimport numpy as np
2 | 
3 | cdef class Graph:
4 |     ##################################
5 |     # ATTRIBUTES
6 |     ##################################
7 | 
8 |     # GRAPH PROPERTY ATTRIBUTES
9 |     ###########################
10 |     cdef bint is_directed  # If the graph is directed
11 |     cdef bint is_multi  # If the graph is a Multi-Graph
12 |     cdef bint is_node_attr
13 |     cdef bint is_edge_attr
14 | 
15 |     # ATTR VAL ATTRIBUTES
16 |     #####################
17 |     cdef str node_attr_key  # Key that contains the main attr value for a node
18 |     cdef str edge_attr_key  # Key that contains the main attr value for an edge
19 |     cdef set unique_node_attr_vals  # set of unique node attr values
20 |     cdef set unique_edge_attr_vals  # set of unique edge attr values
21 | 
22 | 
23 |     ## NODE ATTRIBUTES
24 |     #################
25 | 
26 |     cdef list nodes_list  # list of node ids
27 |     cdef list nodes_attr_list  # list of attr value for each node (following nodes_list order)
28 |     cdef list nodes_hash  # hash representation of every node
29 |     cdef set nodes_hash_set  # hash representation of every node (set version for intersection and union operations)
30 |     cdef dict nodes_idx  # index of each node in `nodes_list`
31 |     cdef list nodes_weight  # list that contains each node's weight (following nodes_list order)
32 |     cdef long[:] nodes_degree  # degree list
33 |     cdef long[:] nodes_degree_in  # in-degree list
34 |     cdef long[:] nodes_degree_out  # out-degree list
35 |     cdef double[:] nodes_degree_weighted  # weighted vers. of nodes_degree
36 |     cdef double[:] nodes_degree_in_weighted  # weighted vers. of nodes_degree_in
37 |     cdef double[:] nodes_degree_out_weighted  # weighted vers. of nodes_degree_out
38 |     cdef dict degree_per_attr  # degree information per attr val
39 |     cdef dict degree_per_attr_weighted  # degree information per attr val
40 |     cdef list attr_nodes  # list of attr(dict) values for each node
41 |     cdef dict edges_of_nodes  # list of edges connected to each node
42 | 
43 |     # EDGES ATTRIBUTES
44 |     ##################
45 | 
46 |     cdef list edges_list  # edge list
47 |     cdef list edges_attr_list  # list of attr value for each edge (following edges_list order)
48 |     cdef dict edges_hash_idx  # index of hash in edges_list and edges_attr_list
49 |     cdef list edges_hash  # hash representation of every edge ## TO REVIEW
50 | cdef set edges_hash_set # set of hash representation of every edges (set version for intersection and union operation) 51 | cdef dict edges_weight # list that contains each node's weight (following nodes_list order) 52 | cdef dict edges_hash_map #[id1,[id2,hash]] 53 | cdef list attr_edges # list of attr(dict) values for each edge 54 | 55 | # SIZE ATTTRIBUTE 56 | ############### 57 | 58 | cdef long number_of_nodes # number of nodes 59 | cdef long number_of_edges # number of edges 60 | 61 | cdef dict number_of_edges_per_attr # number of nodes per attr value 62 | cdef dict number_of_nodes_per_attr # number of edges per attr value 63 | 64 | cdef object nx_g 65 | 66 | ################################## 67 | # METHODS 68 | ################################## 69 | 70 | # DIMENSION GETTER 71 | ################## 72 | cpdef long size(self) 73 | cpdef int size_attr(self, attr_val) 74 | 75 | cpdef long density(self) 76 | cpdef int density_attr(self, str attr_val) 77 | 78 | # HASH FUNCTION 79 | ############### 80 | cpdef str hash_node(self,str n1) 81 | cpdef str hash_edge(self,str n1,str n2) 82 | cpdef str hash_node_attr(self,str n1, str attr_value) 83 | cpdef str hash_edge_attr(self,str n1,str n2, str attr_value) 84 | 85 | ## EXIST FUNCTION 86 | ############### 87 | cpdef bint has_node(self,str n_id) 88 | cpdef bint has_edge(self,str n_id1,str n_id2) 89 | 90 | ## LEN FUNCTION 91 | ############### 92 | cpdef int size_node_intersect(self,Graph G) 93 | cpdef int size_node_union(self,Graph G) 94 | 95 | cpdef int size_edge_intersect(self,Graph G) 96 | cpdef int size_edge_union(self,Graph G) 97 | 98 | # DEGREE FUNCTION 99 | ################# 100 | cpdef double degree(self,str n_id, bint weight=*) 101 | cpdef double in_degree(self,str n_id, bint weight=*) 102 | cpdef double out_degree(self,str n_id, bint weight=*) 103 | 104 | cpdef double in_degree_attr(self,str n_id,str attr_val, bint weight=*) 105 | cpdef double out_degree_attr(self,str n_id,str attr_val, bint weight=*) 106 | cpdef double degree_attr(self,str n_id,str attr_val, bint weight=*) 107 | 108 | ## GETTER 109 | ######### 110 | 111 | cpdef list get_edges_ed(self,str e1, str e2) 112 | cpdef list get_edges_no(self,str n) 113 | cpdef set get_edges_hash(self) 114 | cpdef set get_nodes_hash(self) 115 | 116 | cpdef str get_node_key(self) 117 | cpdef str get_egde_key(self) 118 | 119 | cpdef dict get_edge_attrs(self,edge_hash) 120 | cpdef dict get_node_attrs(self, node_hash) 121 | cpdef dict get_node_attr(self, node_hash) 122 | cpdef dict get_edge_attr(self,edge_hash) -------------------------------------------------------------------------------- /gmatch4py/graph.pyx: -------------------------------------------------------------------------------- 1 | from libcpp.map cimport map 2 | from libcpp.utility cimport pair 3 | from libcpp.string cimport string 4 | from libcpp.vector cimport vector 5 | import numpy as np 6 | cimport numpy as np 7 | import networkx as nx 8 | 9 | cdef class Graph: 10 | 11 | def __init__(self,G, node_attr_key="",edge_attr_key=""): 12 | self.nx_g=G 13 | 14 | #GRAPH PROPERTY INIT 15 | self.is_directed = G.is_directed() 16 | self.is_multi = G.is_multigraph() 17 | self.is_node_attr=(True if node_attr_key else False) 18 | self.is_edge_attr=(True if edge_attr_key else False) 19 | if self.is_multi and not self.is_edge_attr: 20 | if not len(nx.get_edge_attributes(G,"id")) == len(G.edges(data=True)): 21 | i=0 22 | for id1 in G.adj: 23 | for id2 in G.adj[id1]: 24 | for id3 in G.adj[id1][id2]: 25 | G._adj[id1][id2][id3]["id"]=str(i) 26 | 
i+=1
27 |             self.is_edge_attr = True
28 |             edge_attr_key = "id"
29 | 
30 |         # for ed in
31 | 
32 |         #len(nx.get_edge_attributes(G1,"id")) == len(G1.edges(data=True))
33 | 
34 |         if len(G) ==0:
35 |             self.__init_empty__()
36 | 
37 |         else:
38 |             a,b=list(zip(*list(G.nodes(data=True))))
39 |             self.nodes_list,self.attr_nodes=list(a),list(b)
40 |             if G.number_of_edges()>0:
41 |                 e1,e2,d=zip(*list(G.edges(data=True)))
42 |                 self.attr_edges=list(d)
43 |                 self.edges_list=list(zip(e1,e2))
44 |             else:
45 |                 self.edges_list=[]
46 |                 self.attr_edges=[]
47 | 
48 |             if self.is_node_attr:
49 |                 self.node_attr_key = node_attr_key
50 |                 self.nodes_attr_list = [attr_dict[node_attr_key] for attr_dict in self.attr_nodes]
51 |                 self.unique_node_attr_vals=set(self.nodes_attr_list)
52 | 
53 |             if self.is_edge_attr:
54 |                 self.edge_attr_key = edge_attr_key
55 |                 self.edges_attr_list = [attr_dict[edge_attr_key] for attr_dict in self.attr_edges]
56 |                 self.unique_edge_attr_vals=set(self.edges_attr_list)
57 | 
58 |             # NODE Information init
59 |             #######################
60 | 
61 |             self.nodes_hash=[self.hash_node_attr(node,self.nodes_attr_list[ix]) if self.is_node_attr else self.hash_node(node) for ix, node in enumerate(self.nodes_list) ]
62 |             self.nodes_hash_set=set(self.nodes_hash)
63 |             self.nodes_idx={node:ix for ix, node in enumerate(self.nodes_list)}
64 |             self.nodes_weight=[attr_dict["weight"] if "weight" in attr_dict else 1 for attr_dict in self.attr_nodes]
65 |             degree_all=[]
66 |             degree_in=[]
67 |             degree_out=[]
68 | 
69 |             degree_all_weighted=[]
70 |             degree_in_weighted=[]
71 |             degree_out_weighted=[]
72 |             if self.is_edge_attr:
73 |                 self.degree_per_attr={attr_v:{n:{"in":0,"out":0} for n in self.nodes_list} for attr_v in self.unique_edge_attr_vals}
74 |                 self.degree_per_attr_weighted={attr_v:{n:{"in":0,"out":0} for n in self.nodes_list} for attr_v in self.unique_edge_attr_vals}
75 |             # Retrieving Degree Information
76 |             self.edges_of_nodes={}
77 |             for n in self.nodes_list:
78 |                 self.edges_of_nodes[n]=[self.hash_edge_attr(e1,e2,attr_dict[self.edge_attr_key]) if self.is_edge_attr else self.hash_edge(e1,e2) for e1,e2,attr_dict in G.edges(n,data=True)]
79 |                 degree_all.append(G.degree(n))
80 |                 degree_all_weighted.append(G.degree(n,weight="weight"))
81 |                 if self.is_directed:
82 |                     degree_in.append(G.in_degree(n))
83 |                     degree_in_weighted.append(G.in_degree(n,weight="weight"))
84 |                     degree_out.append(G.out_degree(n))
85 |                     degree_out_weighted.append(G.out_degree(n,weight="weight")) # the original omitted weight="weight" here, so the weighted out-degree was unweighted
86 |                 else:
87 |                     degree_in.append(degree_all[-1])
88 |                     degree_in_weighted.append(degree_all_weighted[-1])
89 |                     degree_out.append(degree_all[-1])
90 |                     degree_out_weighted.append(degree_all_weighted[-1])
91 |                 if self.is_edge_attr:
92 |                     if self.is_directed:
93 |                         in_edge=list(G.in_edges(n,data=True))
94 |                         out_edge=list(G.out_edges(n,data=True))
95 |                         for n1,n2,attr_dict in in_edge:
96 |                             self.degree_per_attr[attr_dict[self.edge_attr_key]][n]["in"]+=1
97 |                             self.degree_per_attr_weighted[attr_dict[self.edge_attr_key]][n]["in"]+=1*(attr_dict["weight"] if "weight" in attr_dict else 1 )
98 | 
99 |                         for n1,n2,attr_dict in out_edge:
100 |                             self.degree_per_attr[attr_dict[self.edge_attr_key]][n]["out"]+=1
101 |                             self.degree_per_attr_weighted[attr_dict[self.edge_attr_key]][n]["out"]+=1*(attr_dict["weight"] if "weight" in attr_dict else 1 )
102 | 
103 |                     else:
104 |                         edges=G.edges(n,data=True)
105 |                         for n1,n2,attr_dict in edges:
106 |                             self.degree_per_attr[attr_dict[self.edge_attr_key]][n]["in"]+=1
107 |                             self.degree_per_attr[attr_dict[self.edge_attr_key]][n]["out"]+=1
108 |
self.degree_per_attr_weighted[attr_dict[self.edge_attr_key]][n]["in"]+=1*(attr_dict["weight"] if "weight" in attr_dict else 1 ) 109 | self.degree_per_attr_weighted[attr_dict[self.edge_attr_key]][n]["out"]+=1*(attr_dict["weight"] if "weight" in attr_dict else 1 ) 110 | 111 | self.nodes_degree=np.array(degree_all) 112 | self.nodes_degree_in=np.array(degree_in) 113 | self.nodes_degree_out=np.array(degree_out) 114 | 115 | self.nodes_degree_weighted=np.array(degree_all_weighted).astype(np.double) 116 | self.nodes_degree_in_weighted=np.array(degree_in_weighted).astype(np.double) 117 | self.nodes_degree_out_weighted=np.array(degree_out_weighted).astype(np.double) 118 | 119 | 120 | # EDGE INFO INIT 121 | ################# 122 | 123 | self.edges_hash=[] 124 | self.edges_hash_map = {} 125 | self.edges_hash_idx = {} 126 | for ix, ed in enumerate(self.edges_list): 127 | e1,e2=ed 128 | if not e1 in self.edges_hash_map:self.edges_hash_map[e1]={} 129 | 130 | hash_=self.hash_edge_attr(e1,e2,self.edges_attr_list[ix]) if self.is_edge_attr else self.hash_edge(e1,e2) 131 | if self.is_multi and self.is_edge_attr: 132 | if not e2 in self.edges_hash_map[e1]:self.edges_hash_map[e1][e2]={} 133 | self.edges_hash_map[e1][e2][self.edges_attr_list[ix]]=hash_ 134 | else: 135 | self.edges_hash_map[e1][e2]=hash_ 136 | self.edges_hash_idx[hash_]=ix 137 | self.edges_hash.append(hash_) 138 | self.edges_hash_set=set(self.edges_hash) 139 | 140 | self.edges_weight={} 141 | for e1,e2,attr_dict in list(G.edges(data=True)): 142 | hash_=self.hash_edge_attr(e1,e2,attr_dict[self.edge_attr_key]) if self.is_edge_attr else self.hash_edge(e1,e2) 143 | self.edges_weight[hash_]=attr_dict["weight"] if "weight" in attr_dict else 1 144 | 145 | self.number_of_edges = len(self.edges_list) 146 | self.number_of_nodes = len(self.nodes_list) 147 | 148 | if self.is_edge_attr and self.number_of_edges >0: 149 | self.number_of_edges_per_attr={attr:0 for attr in self.unique_edge_attr_vals} 150 | for _,_,attr_dict in list(G.edges(data=True)): 151 | self.number_of_edges_per_attr[attr_dict[self.edge_attr_key]]+=1 152 | 153 | if self.is_node_attr and self.number_of_nodes >0: 154 | self.number_of_nodes_per_attr={attr:0 for attr in self.unique_node_attr_vals} 155 | for _,attr_dict in list(G.nodes(data=True)): 156 | self.number_of_nodes_per_attr[attr_dict[self.node_attr_key]]+=1 157 | 158 | 159 | # HASH FUNCTION 160 | cpdef str hash_node(self,str n1): 161 | return "{0}".format(n1) 162 | 163 | cpdef str hash_edge(self,str n1,str n2): 164 | if not self.is_directed: 165 | return "_".join(sorted([n1,n2])) 166 | return "_".join([n1,n2]) 167 | 168 | cpdef str hash_node_attr(self,str n1, str attr_value): 169 | return "_".join([n1,attr_value]) 170 | 171 | cpdef str hash_edge_attr(self,str n1,str n2, str attr_value): 172 | if self.is_directed: 173 | return "_".join([n1,n2,attr_value]) 174 | ed=sorted([n1,n2]) 175 | ed.extend([attr_value]) 176 | return "_".join(ed) 177 | 178 | ## EXIST FUNCTION 179 | cpdef bint has_node(self,str n_id): 180 | if n_id in self.nodes_list: 181 | return True 182 | return False 183 | 184 | cpdef bint has_edge(self,str n_id1,str n_id2): 185 | if self.number_of_edges == 0: 186 | return False 187 | if self.is_directed: 188 | if n_id1 in self.edges_hash_map and n_id2 in self.edges_hash_map[n_id1]: 189 | return True 190 | else: 191 | if n_id1 in self.edges_hash_map and n_id2 in self.edges_hash_map[n_id1]: 192 | return True 193 | if n_id2 in self.edges_hash_map and n_id1 in self.edges_hash_map[n_id2]: 194 | return True 195 | return False 196 | 197 
| ## LEN FUNCTION 198 | cpdef int size_node_intersect(self,Graph G): 199 | if self.number_of_nodes == 0: 200 | return 0 201 | return len(self.nodes_hash_set.intersection(G.nodes_hash_set)) 202 | cpdef int size_node_union(self,Graph G): 203 | return len(self.nodes_hash_set.union(G.nodes_hash_set)) 204 | 205 | cpdef int size_edge_intersect(self,Graph G): 206 | if self.number_of_edges == 0: 207 | return 0 208 | return len(self.edges_hash_set.intersection(G.edges_hash_set)) 209 | cpdef int size_edge_union(self,Graph G): 210 | return len(self.edges_hash_set.union(G.edges_hash_set)) 211 | 212 | ## GETTER 213 | 214 | def get_nx(self): 215 | return self.nx_g 216 | 217 | def nodes(self,data=False): 218 | if data: 219 | if self.number_of_nodes == 0: 220 | return [],[] 221 | return self.nodes_list,self.attr_nodes 222 | 223 | if self.number_of_nodes == 0: 224 | return [] 225 | return self.nodes_list 226 | 227 | 228 | def edges(self,data=False): 229 | if data: 230 | if self.number_of_edges == 0: 231 | return [],[] 232 | return self.edges_list,self.attr_edges 233 | 234 | if self.number_of_edges == 0: 235 | return [] 236 | return self.edges_list 237 | 238 | cpdef list get_edges_ed(self,str e1,str e2): 239 | if self.is_edge_attr: 240 | hashes=self.edges_hash_map[e1][e2] 241 | return [(e1,e2,self.edges_attr_list[self.edges_hash_idx[hash_]])for hash_ in hashes] 242 | 243 | return [(e1,e2,None)] 244 | 245 | cpdef list get_edges_no(self,str n): 246 | return self.edges_of_nodes[n] 247 | 248 | cpdef dict get_edge_attr(self,edge_hash): 249 | return self.edges_attr_list[self.edges_hash_idx[edge_hash]] 250 | 251 | cpdef dict get_node_attr(self, node_hash): 252 | return self.edges_attr_list[self.edges_hash_idx[node_hash]] 253 | 254 | cpdef dict get_edge_attrs(self,edge_hash): 255 | return self.attr_edges[self.edges_hash_idx[edge_hash]] 256 | 257 | cpdef dict get_node_attrs(self, node_hash): 258 | return self.attr_nodes[self.edges_hash_idx[node_hash]] 259 | 260 | cpdef set get_edges_hash(self): 261 | return self.edges_hash_set 262 | 263 | cpdef set get_nodes_hash(self): 264 | return self.nodes_hash_set 265 | 266 | cpdef str get_node_key(self): 267 | return self.node_attr_key 268 | 269 | cpdef str get_egde_key(self): 270 | return self.edge_attr_key 271 | ##### 272 | 273 | cpdef long size(self): 274 | return self.number_of_nodes 275 | 276 | cpdef int size_attr(self, attr_val): 277 | return self.number_of_nodes_per_attr[attr_val] 278 | 279 | cpdef long density(self): 280 | return self.number_of_edges 281 | 282 | cpdef int density_attr(self, str attr_val): 283 | return self.number_of_edges_per_attr[attr_val] 284 | 285 | cpdef double degree(self,str n_id, bint weight=False): 286 | if weight: 287 | return self.nodes_degree_weighted[self.nodes_idx[n_id]] 288 | return self.nodes_degree[self.nodes_idx[n_id]] 289 | 290 | cpdef double in_degree(self,str n_id, bint weight=False): 291 | if weight: 292 | return self.nodes_degree_in_weighted[self.nodes_idx[n_id]] 293 | return self.nodes_degree_in[self.nodes_idx[n_id]] 294 | 295 | cpdef double out_degree(self,str n_id, bint weight=False): 296 | if weight: 297 | return self.nodes_degree_out_weighted[self.nodes_idx[n_id]] 298 | return self.nodes_degree_out[self.nodes_idx[n_id]] 299 | 300 | cpdef double in_degree_attr(self,str n_id,str attr_val, bint weight=False): 301 | if not self.is_edge_attr and not self.is_directed: 302 | raise AttributeError("No edge attribute have been defined") 303 | if weight: 304 | return self.degree_per_attr_weighted[attr_val][n_id]["in"] 305 | return 
self.degree_per_attr[attr_val][n_id]["in"] 306 | 307 | cpdef double out_degree_attr(self,str n_id,str attr_val, bint weight=False): 308 | if not self.is_edge_attr and not self.is_directed: 309 | raise AttributeError("No edge attribute have been defined") 310 | if weight: 311 | return self.degree_per_attr_weighted[attr_val][n_id]["out"] 312 | return self.degree_per_attr[attr_val][n_id]["out"] 313 | 314 | cpdef double degree_attr(self,str n_id,str attr_val, bint weight=False): 315 | if not self.is_edge_attr: 316 | raise AttributeError("No edge attribute have been defined") 317 | if not self.is_directed: 318 | if weight: 319 | return self.degree_per_attr_weighted[attr_val][n_id]["out"] 320 | return self.degree_per_attr[attr_val][n_id]["out"] 321 | if weight: 322 | return self.degree_per_attr_weighted[attr_val][n_id]["in"] + self.degree_per_attr_weighted[attr_val][n_id]["out"] 323 | return self.degree_per_attr[attr_val][n_id]["out"] + self.degree_per_attr[attr_val][n_id]["in"] 324 | 325 | #GRAPH SETTER 326 | def add_node(self,str id_,**kwargs): 327 | if not self.node_attr_key in kwargs: 328 | print("Node not added because information lacks") 329 | return self 330 | if id_ in self.nodes_idx: 331 | print("Already in G") 332 | return self 333 | G=self.nx_g.copy() 334 | G.add_node(id_,**kwargs) 335 | return Graph(G,self.node_attr_key,self.edge_attr_key) 336 | 337 | 338 | def add_edge(self,str n1,str n2,**kwargs): 339 | G=self.nx_g.copy() 340 | G.add_edge(n1,n2,**kwargs) 341 | return Graph(G,self.node_attr_key,self.edge_attr_key) 342 | 343 | def remove_node(self,str id_): 344 | if not id_ in self.nodes_idx: 345 | print("Already removed in G") 346 | return self 347 | G=self.nx_g.copy() 348 | G.remove_node(id_) 349 | return Graph(G,self.node_attr_key,self.edge_attr_key) 350 | 351 | def remove_edge(self,str n1,str n2,**kwargs): 352 | G=self.nx_g.copy() 353 | edges=G.edges([n1,n2],data=True) 354 | if len(edges) == 0: 355 | return self 356 | elif len(edges)<2: 357 | G.remove_edge(n1,n2) 358 | else: 359 | if not self.edge_attr_key in kwargs: 360 | for i in range(len(edges)): 361 | G.remove_edge(n1,n2,i) 362 | else: 363 | key,val,i=self.edge_attr_key, kwargs[self.edge_attr_key],0 364 | for e1,ed2,attr_dict in edges: 365 | if attr_dict[key] == val: 366 | G.remove_edge(n1,n2,i) 367 | break 368 | i+=1 369 | 370 | return Graph(G,self.node_attr_key,self.edge_attr_key) 371 | 372 | def __init_empty__(self): 373 | self.nodes_list,self.nodes_attr_list,self.nodes_hash,self.nodes_weight,self.attr_nodes=[],[],[],[],[] 374 | self.nodes_degree,self.nodes_degree_in,self.nodes_degree_out,self.nodes_degree_weighted,self.nodes_degree_in_weighted,self.nodes_degree_out_weighted=np.array([],dtype=np.long),np.array([],dtype=np.long),np.array([],dtype=np.long),np.array([],dtype=np.double),np.array([],dtype=np.double),np.array([],dtype=np.double) 375 | self.nodes_idx,self.degree_per_attr,self.degree_per_attr_weighted={},{},{} 376 | self.nodes_hash_set=set([]) 377 | self.number_of_nodes = 0 378 | 379 | self.number_of_edges = 0 380 | self.edges_list=[] 381 | self.edges_attr_list =[] 382 | self.edges_hash_idx = {} 383 | self.edges_hash = [] 384 | self.edges_hash_set= set([]) 385 | self.edges_weight={} 386 | self.edges_hash_map={} 387 | self.attr_edges=[] 388 | 389 | -------------------------------------------------------------------------------- /gmatch4py/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding = utf-8 
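2 | 
3 | # --- Illustrative usage sketch (added; not part of the original package) ---
4 | # The helpers submodule groups the graph readers and converters used across
5 | # GMatch4py. A typical flow (paths and keys below are arbitrary examples):
6 | #
7 | # from gmatch4py.helpers.reader import import_dir
8 | # from gmatch4py.helpers.general import parsenx2graph
9 | #
10 | # graphs = import_dir("data/graphs", format="gexf")        # list of nx.Graph
11 | # internal = parsenx2graph(graphs, node_attr_key="label")  # list of gmatch4py.Graph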
-------------------------------------------------------------------------------- /gmatch4py/helpers/general.pyx: -------------------------------------------------------------------------------- 1 | from ..graph cimport Graph 2 | import networkx as nx 3 | 4 | def parsenx2graph(list_gs,node_attr_key="",edge_attr_key=""): 5 | """ 6 | Parse list of Networkx graphs into Gmatch4py graph format 7 | Parameters 8 | ---------- 9 | list_gs : list 10 | list of graph 11 | node_attr_key : str 12 | node attribute used for the hash 13 | edge_attr_key: str 14 | edge attribute used for the hash 15 | 16 | Returns 17 | ------- 18 | list 19 | list of gmatch4py.Graph 20 | """ 21 | new_gs=[nx.relabel_nodes(g,{node:str(node) for node in list(g.nodes)},copy=True) for g in list_gs] 22 | new_gs=[Graph(g,node_attr_key,edge_attr_key) for g in new_gs] 23 | return new_gs 24 | -------------------------------------------------------------------------------- /gmatch4py/helpers/reader.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | import sys, os, glob, json, re 3 | import networkx as nx 4 | from tqdm import tqdm 5 | 6 | 7 | """ 8 | The reader submodule contains high-level function to read and store graphs from various files. 9 | """ 10 | 11 | 12 | 13 | methods_read_graph={ 14 | "gexf":nx.read_gexf, 15 | "gml":nx.read_gml, 16 | "graphml":nx.read_graphml 17 | } 18 | 19 | def extract_index(fn): 20 | """ 21 | Extract index from filename 22 | Parameters 23 | ---------- 24 | fn : str 25 | filename 26 | 27 | Returns 28 | ------- 29 | int 30 | index 31 | """ 32 | try: 33 | return int(re.findall("\d+",fn)[-1]) 34 | except: 35 | print("No number found !") 36 | return 0 37 | 38 | 39 | def import_dir(directory,format="gexf",numbered=True): 40 | """ 41 | Based on a given directory, import all graphs and store them in a list/array 42 | 43 | Parameters 44 | ---------- 45 | directory : str 46 | directory path where graphs are stored 47 | format : str 48 | graph file format 49 | numbered 50 | if graph filename are numbered 51 | Returns 52 | ------- 53 | array 54 | graphs 55 | """ 56 | if not os.path.exists(directory): 57 | raise FileNotFoundError("{0} does not exists".format(directory)) 58 | if not format in methods_read_graph: 59 | raise NotImplementedError("{0} is not implemented !".format(format)) 60 | 61 | # Retrieve filename 62 | fns = glob.glob(os.path.join(directory, "*.{0}".format(format))) 63 | 64 | graphs=[] 65 | if numbered: 66 | n=max([extract_index(fn) for fn in fns]) 67 | graphs= [nx.Graph()]*(n+1) 68 | 69 | association_map, i = {}, 0 70 | for fn in tqdm(fns,desc="Loading Graphs from {0}".format(directory)): 71 | if not numbered: 72 | graphs.append(methods_read_graph[format](fn)) 73 | association_map[fn]=i 74 | i+=1 75 | else: 76 | graphs[extract_index(fn)]=methods_read_graph[format](fn) 77 | if not numbered: 78 | return association_map,graphs 79 | return graphs 80 | -------------------------------------------------------------------------------- /gmatch4py/jaccard.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | import numpy as np 4 | cimport numpy as np 5 | 6 | from .base cimport Base 7 | from .helpers.general import parsenx2graph 8 | from cython.parallel cimport prange,parallel 9 | cimport cython 10 | 11 | cdef class Jaccard(Base): 12 | 13 | def __init__(self): 14 | Base.__init__(self,0,True) 15 | 16 | 17 | @cython.boundscheck(False) 18 | cpdef np.ndarray compare(self,list listgs, list selected): 
19 | cdef int n = len(listgs) 20 | cdef list new_gs=parsenx2graph(listgs,self.node_attr_key,self.edge_attr_key) 21 | cdef double[:,:] comparison_matrix = np.zeros((n, n)) 22 | cdef long[:] n_nodes = np.array([g.size() for g in new_gs]) 23 | cdef long[:] n_edges = np.array([g.density() for g in new_gs]) 24 | cdef int i,j 25 | 26 | cdef double[:] selected_test = np.array(self.get_selected_array(selected,n)) 27 | 28 | cdef double[:,:] intersect_len_nodes = np.zeros((n, n)) 29 | cdef double[:,:] intersect_len_edges = np.zeros((n, n)) 30 | cdef double[:,:] union_len_nodes = np.zeros((n, n)) 31 | cdef double[:,:] union_len_edges = np.zeros((n, n)) 32 | for i in range(n): 33 | for j in range(i,n): 34 | intersect_len_nodes[i][j]=new_gs[i].size_node_intersect(new_gs[j]) 35 | intersect_len_edges[i][j]=new_gs[i].size_edge_intersect(new_gs[j])#len(set(hash_edges[i]).intersection(hash_edges[j])) 36 | union_len_nodes[i][j]=new_gs[i].size_node_union(new_gs[j]) 37 | union_len_edges[i][j]=new_gs[i].size_edge_union(new_gs[j]) 38 | with nogil, parallel(num_threads=self.cpu_count): 39 | for i in prange(n,schedule='static'): 40 | for j in range(i,n): 41 | if n_nodes[i] > 0 and n_nodes[j] > 0 and selected_test[i] == 1: 42 | if union_len_edges[i][j] >0 and union_len_nodes[i][j] >0: 43 | comparison_matrix[i][j]= \ 44 | (intersect_len_edges[i][j]/union_len_edges[i][j])*\ 45 | (intersect_len_nodes[i][j]/union_len_nodes[i][j]) 46 | 47 | else: 48 | comparison_matrix[i][j] = 0. 49 | 50 | comparison_matrix[j][i] = comparison_matrix[i][j] 51 | 52 | return np.array(comparison_matrix) 53 | -------------------------------------------------------------------------------- /gmatch4py/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # coding = utf-8 -------------------------------------------------------------------------------- /gmatch4py/kernels/adjacency.pyx: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | 4 | def get_adjacency(G1,G2): 5 | """ 6 | Return adjacency matrices of two graph based on nodes present in both of them. 
7 | 
8 |     Parameters
9 |     ----------
10 |     G1 : nx.Graph
11 |         first graph
12 |     G2 : nx.Graph
13 |         second graph
14 | 
15 |     Returns
16 |     -------
17 |     tuple of np.array
18 |         adjacency matrices of G1 and G2
19 |     """
20 | 
21 |     # Extract nodes
22 |     nodes_G1=list(G1.nodes())
23 |     nodes_G2=list(G2.nodes())
24 | 
25 |     # Get Adjacency Matrix for each graph
26 |     adj_original_G1 = nx.convert_matrix.to_numpy_matrix(G1,nodes_G1)
27 |     adj_original_G2 = nx.convert_matrix.to_numpy_matrix(G2,nodes_G2)
28 | 
29 |     # Get old index
30 |     index_node_G1={node: ix for ix,node in enumerate(nodes_G1)}
31 |     index_node_G2={node: ix for ix,node in enumerate(nodes_G2)}
32 | 
33 |     # Building new indices
34 |     nodes_unique = list(set(nodes_G1).union(nodes_G2))
35 |     new_node_index = {node:i for i,node in enumerate(nodes_unique)}
36 | 
37 |     n=len(nodes_unique)
38 | 
39 |     # Generate new adjacency matrices
40 |     new_adj_G1= np.zeros((n,n))
41 |     new_adj_G2= np.zeros((n,n))
42 | 
43 |     # Filling old values
44 |     for n1 in nodes_unique:
45 |         for n2 in nodes_unique:
46 |             if n1 in G1.nodes() and n2 in G1.nodes():
47 |                 new_adj_G1[new_node_index[n1],new_node_index[n2]]=adj_original_G1[index_node_G1[n1],index_node_G1[n2]]
48 |             if n1 in G2.nodes() and n2 in G2.nodes():
49 |                 new_adj_G2[new_node_index[n1],new_node_index[n2]]=adj_original_G2[index_node_G2[n1],index_node_G2[n2]]
50 | 
51 |     return new_adj_G1,new_adj_G2
52 | 
53 | 
--------------------------------------------------------------------------------
/gmatch4py/kernels/random_walk_kernel.pyx:
--------------------------------------------------------------------------------
1 | # coding = utf-8
2 | 
3 | import networkx as nx
4 | import numpy as np
5 | 
6 | class GeometricRandomWalkKernel():
7 |     __type__ = "sim"
8 |     @staticmethod
9 |     def maxDegree(G):
10 |         degree_sequence = sorted(dict(nx.degree(G)).values(), reverse=True)  # degree sequence; dict() keeps this working with networkx 2.x DegreeView objects
11 | 
12 |         # print "Degree sequence", degree_sequence
13 |         dmax = max(degree_sequence)
14 |         return dmax
15 |     @staticmethod
16 |     def compare(listgs):
17 | 
18 |         n = len(listgs)
19 |         comparison_matrix=np.zeros((n,n))
20 |         for i in range(n):
21 |             for j in range(i,n):
22 |                 if len(listgs[i]) <1 or len(listgs[j]) <1:
23 |                     comparison_matrix[i, j] = 0
24 |                     comparison_matrix[j, i] = 0
25 |                     continue
26 |                 direct_product_graph=nx.tensor_product(listgs[i],listgs[j])
27 |                 Ax = nx.adjacency_matrix(direct_product_graph).todense()
28 |                 try:
29 |                     la = 1/ ((GeometricRandomWalkKernel.maxDegree(direct_product_graph)**2)+1) # lambda value
30 |                 except:
31 |                     la = pow(10,-6) # fallback decay value (the original `pow(1,-6)` always evaluated to 1)
32 |                 eps = pow(10,-10)
33 |                 I=np.identity(Ax.shape[0])
34 |                 I_vec=np.ones(Ax.shape[0])
35 |                 x=I_vec.copy()
36 |                 x_pre=np.zeros(Ax.shape[0])
37 |                 c=0
38 | 
39 |                 while (np.linalg.norm(x-x_pre)) > eps:
40 |                     if c > 100:
41 |                         break
42 |                     x_pre=x
43 | 
44 |                     x= I_vec + la*np.dot(Ax,x_pre.T)
45 |                     c+=1
46 |                 comparison_matrix[i,j]=np.sum(x)
47 |                 comparison_matrix[j,i]=comparison_matrix[i,j]
48 |         # print(comparison_matrix)  # debug output, disabled
49 |         for i in range(n):
50 |             for j in range(i,n):
51 |                 comparison_matrix[i,j] = (comparison_matrix[i,j]/np.sqrt(comparison_matrix[i,i]*comparison_matrix[j,j]))
52 |                 comparison_matrix[j,i]=comparison_matrix[i,j]
53 |         return comparison_matrix
54 | 
55 | class KStepRandomWalkKernel():
56 |     __type__ = "sim"
57 |     @staticmethod
58 |     def maxDegree(G):
59 |         degree_sequence = sorted(dict(nx.degree(G)).values(), reverse=True)  # degree sequence
60 |         # print "Degree sequence", degree_sequence
61 |         dmax = max(degree_sequence)
62 |         return dmax
63 |     @staticmethod
64 |     def compare(listgs,lambda_list=[1,1,1]):
65 |         k=len(lambda_list)
66 |         if k == 0: # guard against an empty lambda_list (the original check compared len(lambda_list) to itself and could never fire)
raise AttributeError("lambda_list must not be empty")
68 |         n = len(listgs)
69 |         comparison_matrix=np.zeros((n,n))
70 |         for i in range(n):
71 |             for j in range(i,n):
72 |                 if len(listgs[i]) <1 or len(listgs[j]) <1:
73 |                     comparison_matrix[i, j] = 0
74 |                     comparison_matrix[j, i] = 0
75 |                     continue
76 |                 direct_product_graph=nx.tensor_product(listgs[i],listgs[j])
77 |                 Ax = nx.adjacency_matrix(direct_product_graph).todense()
78 |                 eps = pow(10,-10)
79 |                 I=np.identity(Ax.shape[0])
80 |                 ax_pow = I.copy()
81 |                 sum_ = lambda_list[0] * I
82 |                 for kk in range(1, k):
83 |                     ax_pow *= Ax
84 |                     sum_ += lambda_list[kk] * ax_pow
85 | 
86 |                 comparison_matrix[i, j] = np.sum(sum_)/(len(listgs[i])**2 * len(listgs[j])**2)
87 |                 comparison_matrix[j,i] = comparison_matrix[i,j]
88 | 
89 |         for i in range(n):
90 |             for j in range(i,n):
91 |                 comparison_matrix[i,j] = comparison_matrix[i,j]/np.sqrt(comparison_matrix[i,i]*comparison_matrix[j,j])
92 |                 comparison_matrix[j,i]=comparison_matrix[i,j]
93 |         return comparison_matrix
--------------------------------------------------------------------------------
/gmatch4py/kernels/shortest_path_kernel.pyx:
--------------------------------------------------------------------------------
1 | # coding = utf-8
2 | 
3 | """
4 | Shortest-Path graph kernel.
5 | Python implementation based on: "Shortest-path kernels on graphs", by
6 | Borgwardt, K.M.; Kriegel, H.-P., in Data Mining, Fifth IEEE
7 | International Conference on , vol., no., pp.8 pp.-, 27-30 Nov. 2005
8 | doi: 10.1109/ICDM.2005.132
9 | Author : Sandro Vega-Pons, Emanuele Olivetti
10 | Modified by : Jacques Fize
11 | """
12 | 
13 | import networkx as nx
14 | import numpy as np
15 | cimport numpy as np
16 | from scipy.sparse.csgraph import floyd_warshall
17 | from .adjacency import get_adjacency
18 | from cython.parallel cimport prange,parallel
19 | from ..helpers.general import parsenx2graph
20 | from ..base cimport Base
21 | cimport cython
22 | 
23 | cdef class ShortestPathGraphKernel(Base):
24 |     """
25 |     Shortest path graph kernel.
26 |     """
27 |     def __init__(self):
28 |         Base.__init__(self,0,False)
29 | 
30 |     def compare_two(self,g_1, g_2):
31 |         """Compute the kernel value (similarity) between two graphs.
32 |         Parameters
33 |         ----------
34 |         g1 : networkx.Graph
35 |             First graph.
36 |         g2 : networkx.Graph
37 |             Second graph.
38 |         Returns
39 |         -------
40 |         k : The similarity value between g1 and g2.
41 |         """
42 |         # Diagonal superior matrix of the floyd warshall shortest
43 |         # paths:
44 |         if isinstance(g_1,nx.Graph) and isinstance(g_2,nx.Graph):
45 |             g_1,g_2= get_adjacency(g_1,g_2)
46 | 
47 |         fwm1 = np.array(floyd_warshall(g_1))
48 |         fwm1[np.isinf(fwm1)] = 0
49 |         fwm1[np.isnan(fwm1)] = 0
50 |         fwm1 = np.triu(fwm1, k=1)
51 |         bc1 = np.bincount(fwm1.reshape(-1).astype(int))
52 | 
53 |         fwm2 = np.array(floyd_warshall(g_2))
54 |         fwm2[np.isinf(fwm2)] = 0
55 |         fwm2[np.isnan(fwm2)] = 0
56 |         fwm2 = np.triu(fwm2, k=1)
57 |         bc2 = np.bincount(fwm2.reshape(-1).astype(int))
58 | 
59 |         # Copy the non-zero shortest paths into arrays of the
60 |         # same length:
61 |         v1 = np.zeros(max(len(bc1), len(bc2)) - 1)
62 |         v1[range(0, len(bc1)-1)] = bc1[1:]
63 | 
64 |         v2 = np.zeros(max(len(bc1), len(bc2)) - 1)
65 |         v2[range(0, len(bc2)-1)] = bc2[1:]
66 | 
67 |         return np.sum(v1 * v2)
68 | 
69 |     @cython.boundscheck(False)
70 |     cpdef np.ndarray compare(self,list graph_list, list selected):
71 |         """Compute the all-pairs kernel values for a list of graphs.
72 |         This function can be used to directly compute the kernel
73 |         matrix for a list of graphs.
The direct computation of the
74 |         kernel matrix is faster than the computation of all individual
75 |         pairwise kernel values.
76 |         Parameters
77 |         ----------
78 |         graph_list: list
79 |             A list of graphs (list of networkx graphs)
80 |         Return
81 |         ------
82 |         K: numpy.array, shape = (len(graph_list), len(graph_list))
83 |             The similarity matrix of all graphs in graph_list.
84 |         """
85 |         cdef int n = len(graph_list)
86 |         cdef double[:,:] k = np.zeros((n, n))
87 |         cdef int cpu_count = self.cpu_count
88 |         cdef int i,j
89 |         cdef list adjacency_matrices = [[None for i in range(n)]for j in range(n)]
90 | 
91 |         for i in range(n):
92 |             for j in range(i, n):
93 |                 adjacency_matrices[i][j] = get_adjacency(graph_list[i],graph_list[j])
94 |                 adjacency_matrices[j][i] = adjacency_matrices[i][j]
95 | 
96 |         with nogil, parallel(num_threads=cpu_count):
97 |             for i in prange(n,schedule='static'):
98 |                 for j in range(i, n):
99 |                     with gil:
100 |                         if len(graph_list[i]) > 0 and len(graph_list[j]) >0:
101 |                             a,b=adjacency_matrices[i][j]
102 |                             k[i][j] = self.compare_two(a,b)
103 |                             k[j][i] = k[i][j]
104 | 
105 |         k_norm = np.zeros((n,n))
106 |         for i in range(n):
107 |             for j in range(i,n):
108 |                 k_norm[i, j] = k[i][j] / np.sqrt(k[i][i] * k[j][j])
109 |                 k_norm[j, i] = k_norm[i, j]
110 | 
111 |         return np.nan_to_num(k_norm)
112 | 
113 | 
114 | 
115 | cdef class ShortestPathGraphKernelDotCostMatrix(ShortestPathGraphKernel):
116 |     """
117 |     Instead of just multiplying the counts of distance values found between the nodes of each graph, this version multiplies the node distance matrices generated from each graph.
118 |     """
119 |     def __init__(self):
120 |         ShortestPathGraphKernel.__init__(self)
121 | 
122 |     def compare_two(self,g_1, g_2):
123 |         """Compute the kernel value (similarity) between two graphs.
124 |         Parameters
125 |         ----------
126 |         g1 : networkx.Graph
127 |             First graph.
128 |         g2 : networkx.Graph
129 |             Second graph.
130 |         Returns
131 |         -------
132 |         k : The similarity value between g1 and g2.
133 |         """
134 |         # Diagonal superior matrix of the floyd warshall shortest
135 |         # paths:
136 |         if isinstance(g_1,nx.Graph) and isinstance(g_2,nx.Graph):
137 |             g_1,g_2= get_adjacency(g_1,g_2)
138 | 
139 |         fwm1 = np.array(floyd_warshall(g_1))
140 |         fwm1[np.isinf(fwm1)] = 0
141 |         fwm1[np.isnan(fwm1)] = 0
142 |         fwm1 = np.triu(fwm1, k=1)
143 | 
144 |         fwm2 = np.array(floyd_warshall(g_2))
145 |         fwm2[np.isinf(fwm2)] = 0
146 |         fwm2[np.isnan(fwm2)] = 0
147 |         fwm2 = np.triu(fwm2, k=1)
148 | 
149 |         return np.sum(fwm1 * fwm2)
--------------------------------------------------------------------------------
/gmatch4py/kernels/weisfeiler_lehman.pyx:
--------------------------------------------------------------------------------
1 | # coding = utf-8
2 | 
3 | """Weisfeiler-Lehman graph kernel.
4 | 
5 | Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by:
6 | Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt
7 | Mehlhorn, Karsten M. Borgwardt, JMLR, 2012.
8 | http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html 9 | 10 | Author : Sandro Vega-Pons, Emanuele Olivetti 11 | Source : https://github.com/emanuele/jstsp2015/blob/master/gk_weisfeiler_lehman.py 12 | Modified by : Jacques Fize 13 | """ 14 | 15 | import copy 16 | 17 | import networkx as nx 18 | import numpy as np 19 | cimport numpy as np 20 | from ..base cimport Base 21 | from ..base import minmax_scale 22 | from scipy.sparse import csc_matrix,lil_matrix 23 | 24 | cdef class WeisfeleirLehmanKernel(Base): 25 | 26 | cdef int h 27 | 28 | def __init__(self,h=2): 29 | Base.__init__(self,0,True) 30 | self.h=h 31 | 32 | 33 | cpdef np.ndarray compare(self,list graph_list, list selected): 34 | """Compute the all-pairs kernel values for a list of graphs. 35 | This function can be used to directly compute the kernel 36 | matrix for a list of graphs. The direct computation of the 37 | kernel matrix is faster than the computation of all individual 38 | pairwise kernel values. 39 | Parameters 40 | ---------- 41 | graph_list: list 42 | A list of graphs (list of networkx graphs) 43 | h : integer 44 | Number of iterations. 45 | node_label : boolean 46 | Whether to use original node labels. True for using node labels 47 | saved in the attribute 'node_label'. False for using the node 48 | degree of each node as node attribute. 49 | Return 50 | ------ 51 | K: numpy.array, shape = (len(graph_list), len(graph_list)) 52 | The similarity matrix of all graphs in graph_list. 53 | """ 54 | 55 | cdef int n = len(graph_list) 56 | cdef int n_nodes = 0 57 | cdef int n_max = 0 58 | cdef int i,j 59 | # Compute adjacency lists and n_nodes, the total number of 60 | # nodes in the dataset. 61 | for i in range(n): 62 | n_nodes += graph_list[i].number_of_nodes() 63 | 64 | # Computing the maximum number of nodes in the graphs. It 65 | # will be used in the computation of vectorial 66 | # representation. 67 | if n_max < graph_list[i].number_of_nodes(): 68 | n_max = graph_list[i].number_of_nodes() 69 | 70 | phi = np.zeros((n_nodes, n), dtype=np.uint64) 71 | phi=lil_matrix(phi) 72 | 73 | # INITIALIZATION: initialize the nodes labels for each graph 74 | # with their labels or with degrees (for unlabeled graphs) 75 | 76 | cdef list labels = [0] * n 77 | cdef dict label_lookup = {} 78 | cdef int label_counter = 0 79 | 80 | 81 | # label_lookup is an associative array, which will contain the 82 | # mapping from multiset labels (strings) to short labels 83 | # (integers) 84 | 85 | cdef list nodes 86 | for i in range(n): 87 | nodes = list(graph_list[i].nodes) 88 | # Node identifiers themselves are used as the 89 | # initial labels 90 | labels[i] = np.zeros(len(nodes), dtype=np.int32) 91 | 92 | for j in range(len(nodes)): 93 | if not (nodes[j] in label_lookup): 94 | label_lookup[nodes[j]] = str(label_counter) 95 | labels[i][j] = label_counter 96 | label_counter += 1 97 | else: 98 | labels[i][j] = label_lookup[nodes[j]] 99 | # labels are associated to a natural number 100 | # starting with 0. 101 | 102 | phi[labels[i][j], i] += 1 103 | 104 | graph_list[i]=nx.relabel_nodes(graph_list[i],label_lookup) 105 | 106 | # cdef np.ndarray[np.float64_t] k 107 | k = np.dot(phi.transpose(), phi) 108 | # MAIN LOOP 109 | cdef int it = 0 110 | 111 | new_labels = copy.deepcopy(labels) # deep copy needed: the label arrays are rewritten in place in the main loop
112 | 113 | while it < self.h: 114 | # create an empty lookup table 115 | label_lookup = {} 116 | label_counter = 0 117 | 118 | phi = np.zeros((n_nodes, n)) 119 | for i in range(n): 120 | nodes = list(graph_list[i].nodes) 121 | for v in range(len(nodes)): 122 | # form a multiset label of the node v of the i'th graph 123 | # and convert it to a string 124 | 125 | long_label = [] 126 | long_label.extend(nx.neighbors(graph_list[i],nodes[v])) 127 | long_label = sorted(long_label) # sort so the multiset label is order-invariant 128 | long_label_string = "".join(long_label) 129 | # if the multiset label has not yet occurred, add it to the 130 | # lookup table and assign a number to it 131 | if not (long_label_string in label_lookup): 132 | label_lookup[long_label_string] = str(label_counter) 133 | new_labels[i][v] = label_counter 134 | label_counter += 1 135 | else: 136 | new_labels[i][v] = label_lookup[long_label_string] 137 | # fill the column for i'th graph in phi 138 | aux = np.bincount(new_labels[i]) 139 | phi[new_labels[i], i] += aux[new_labels[i]] 140 | 141 | k += np.dot(phi.transpose(), phi) 142 | it = it + 1 143 | 144 | return np.ma.getdata(minmax_scale(k)) -------------------------------------------------------------------------------- /gmatch4py/mcs.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | import numpy as np 3 | cimport numpy as np 4 | from .graph cimport Graph 5 | from .base cimport Base 6 | from cython.parallel cimport prange,parallel 7 | from .helpers.general import parsenx2graph 8 | cimport cython 9 | 10 | cdef class MCS(Base): 11 | """ 12 | *A graph distance metric based on the maximal common subgraph, H. Bunke and K. Shearer, 13 | Pattern Recognition Letters, 1998* 14 | """ 15 | def __init__(self): 16 | Base.__init__(self,0,True) 17 | 18 | @cython.boundscheck(False) 19 | cpdef np.ndarray compare(self,list listgs, list selected): 20 | cdef int n = len(listgs) 21 | cdef double [:,:] comparison_matrix = np.zeros((n, n)) 22 | cdef double[:] selected_test = np.array(self.get_selected_array(selected,n)) 23 | cdef list new_gs=parsenx2graph(listgs,self.node_attr_key,self.edge_attr_key) 24 | cdef long[:] n_nodes = np.array([g.size() for g in new_gs]) 25 | cdef double [:,:] intersect_len_nodes = np.zeros((n, n)) 26 | cdef int i,j 27 | for i in range(n): 28 | for j in range(i,n): 29 | intersect_len_nodes[i][j]=new_gs[i].size_node_intersect(new_gs[j]) 30 | 31 | with nogil, parallel(num_threads=self.cpu_count): 32 | for i in prange(n,schedule='static'): 33 | for j in range(i, n): 34 | if n_nodes[i] > 0 and n_nodes[j] > 0 and selected_test[i] == 1: 35 | comparison_matrix[i][j] = intersect_len_nodes[i][j]/max(n_nodes[i],n_nodes[j]) 36 | else: 37 | comparison_matrix[i][j] = 0. 38 | if i==j: 39 | comparison_matrix[i][j]=1 40 | comparison_matrix[j][i] = comparison_matrix[i][j] 41 | 42 | 43 | return np.array(comparison_matrix) 44 | 45 | 46 | -------------------------------------------------------------------------------- /gmatch4py/vertex_edge_overlap.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | import numpy as np 4 | cimport numpy as np 5 | 6 | from .graph cimport Graph 7 | from cython.parallel cimport prange,parallel 8 | from .helpers.general import parsenx2graph 9 | cimport cython 10 | from .base cimport Base 11 | 12 | cdef class VertexEdgeOverlap(Base): 13 | 14 | """ 15 | Vertex/Edge Overlap Algorithm 16 | presented in Web graph similarity for anomaly detection, Journal of Internet Services and Applications, 2008 17 | by P.
Papadimitriou, A. Dasdan and H. Garcia-Molina 18 | 19 | Code Author : Jacques Fize 20 | """ 21 | def __init__(self): 22 | Base.__init__(self,0,True) 23 | 24 | @cython.boundscheck(False) 25 | cpdef np.ndarray compare(self,list listgs, list selected): 26 | cdef int n = len(listgs) 27 | cdef list new_gs=parsenx2graph(listgs,self.node_attr_key,self.edge_attr_key) 28 | cdef double[:,:] comparison_matrix = np.zeros((n, n)) 29 | cdef int denom,i,j 30 | cdef long[:] n_nodes = np.array([g.size() for g in new_gs]) 31 | cdef long[:] n_edges = np.array([g.density() for g in new_gs]) 32 | 33 | cdef double[:] selected_test = np.array(self.get_selected_array(selected,n)) 34 | 35 | cdef double[:,:] intersect_len_nodes = np.zeros((n, n)) 36 | cdef double[:,:] intersect_len_edges = np.zeros((n, n)) 37 | for i in range(n): 38 | for j in range(i,n): 39 | intersect_len_nodes[i][j]=new_gs[i].size_node_intersect(new_gs[j]) 40 | intersect_len_edges[i][j]=new_gs[i].size_edge_intersect(new_gs[j])#len(set(hash_edges[i]).intersection(hash_edges[j])) 41 | 42 | with nogil, parallel(num_threads=self.cpu_count): 43 | for i in prange(n,schedule='static'): 44 | for j in range(i,n): 45 | if n_nodes[i] > 0 and n_nodes[j] > 0 and selected_test[i] == 1: 46 | denom=n_nodes[i]+n_nodes[j]+\ 47 | n_edges[i]+n_edges[j] 48 | if denom > 0: 49 | comparison_matrix[i][j]=(2*(intersect_len_nodes[i][j] 50 | +intersect_len_edges[i][j]))/denom # Data = True --> For nx.MultiDiGraph 51 | if i==j: 52 | comparison_matrix[i][j]=1 53 | comparison_matrix[j][i] = comparison_matrix[i][j] 54 | return np.array(comparison_matrix) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /gmatch4py/vertex_ranking.pyx: -------------------------------------------------------------------------------- 1 | # coding = utf-8 2 | 3 | import networkx as nx 4 | import numpy as np 5 | cimport numpy as np 6 | from scipy.stats import spearmanr 7 | 8 | from .base cimport Base 9 | 10 | cdef class VertexRanking(Base): 11 | """ 12 | Vertex Ranking 13 | presented in Web graph similarity for anomaly detection, Journal of Internet Services and Applications, 2008 # Maybe not ?? 14 | by P.
Dasdan and H. Garcia-Molina 15 | 16 | Code Author : Jacques Fize 17 | 18 | """ 19 | def __init__(self): 20 | Base.__init__(self,0,True) 21 | 22 | cpdef np.ndarray compare(self,list listgs, list selected): 23 | cdef int n,i,j # number of graphs 24 | n = len(listgs) 25 | 26 | cdef np.ndarray comparison_matrix = np.zeros((n,n)) #similarity matrix 27 | cdef list X,Y,pager_i,pager_j,page_r,node_intersection #temp data (page rank data for the most part) 28 | page_r=[nx.pagerank(nx.DiGraph(g)) for g in listgs] 29 | for i in range(n): 30 | pager_i=list(page_r[i]) 31 | for j in range(i,n): 32 | g1,g2=listgs[i],listgs[j] 33 | f=self.isAccepted(g1,i,selected) 34 | pager_j=list(page_r[j]) 35 | node_intersection=list(set(pager_i) & set(pager_j)) 36 | X,Y=[],[] 37 | for node in node_intersection: 38 | X.append(page_r[i][node]) 39 | Y.append(page_r[j][node]) 40 | comparison_matrix[i,j] = spearmanr(X,Y)[0] 41 | comparison_matrix[j,i] = comparison_matrix[i,j] 42 | return np.nan_to_num(comparison_matrix) 43 | -------------------------------------------------------------------------------- /logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacquesfize/GMatch4py/4fc0a822514c65c0d8b12d090b5b89c0af50ef2a/logo2.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | decorator 2 | scipy 3 | networkx==2.1 4 | numpy 5 | cython 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #import setuptools 2 | import sys, os, shutil 3 | from distutils.core import setup 4 | from distutils.extension import Extension 5 | import numpy as np 6 | import platform 7 | try: 8 | from Cython.Build import cythonize 9 | from Cython.Distutils import build_ext 10 | except: 11 | print("You don't seem to have Cython installed.
Please get a") 12 | print("copy from www.cython.org and install it") 13 | sys.exit(1) 14 | 15 | is_linux = sys.platform == 'linux' 16 | libs=[] 17 | if is_linux: # Issue #42 18 | libs.append('rt') # -lrt for clock_gettime 19 | 20 | def scandir(dir, files=[]): 21 | for file in os.listdir(dir): 22 | path = os.path.join(dir, file) 23 | if os.path.isfile(path) and path.endswith(".pyx"): 24 | files.append(path.replace(os.path.sep, ".")[:-4]) 25 | elif os.path.isdir(path): 26 | scandir(path, files) 27 | return files 28 | 29 | # generate an Extension object from its dotted name 30 | def makeExtension(extName): 31 | global libs 32 | extPath = extName.replace(".", os.path.sep)+".pyx" 33 | 34 | ## For Mojave Users 35 | if platform.system() == "Darwin": 36 | if "10.14" in platform.mac_ver()[0]: 37 | return Extension( 38 | extName, 39 | [extPath],include_dirs=[np.get_include()],language='c++',libraries=libs, 40 | extra_compile_args=["-stdlib=libc++"] 41 | ) 42 | 43 | return Extension( 44 | extName, 45 | [extPath],include_dirs=[np.get_include()],language='c++',libraries=libs, 46 | #extra_compile_args = ["-O0", "-fopenmp"],extra_link_args=['-fopenmp'] 47 | 48 | ) 49 | 50 | # get the list of extensions 51 | extNames = scandir("gmatch4py") 52 | 53 | # and build up the set of Extension objects 54 | extensions = cythonize([makeExtension(name) for name in extNames]) 55 | 56 | from os import path 57 | this_directory = path.abspath(path.dirname(__file__)) 58 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 59 | long_description = f.read() 60 | 61 | requirements=["numpy","networkx","scipy",'scikit-learn','tqdm','pandas',"joblib","gensim","psutil"] 62 | setup( 63 | name="GMatch4py", 64 | author="Jacques Fize", 65 | description="A python module for graph matching (use Cython)", 66 | long_description=long_description, 67 | long_description_content_type='text/markdown', 68 | url="https://github.com/Jacobe2169/GMatch4py", 69 | packages=["gmatch4py"], 70 | ext_modules=extensions, 71 | cmdclass={'build_ext': build_ext}, 72 | setup_requires=requirements, 73 | install_requires=requirements, 74 | version="0.2.5b", 75 | classifiers=[ 76 | "Programming Language :: Python :: 3", 77 | "License :: OSI Approved :: MIT License", 78 | "Operating System :: OS Independent", 79 | ] 80 | ) 81 | #Clean cpp and compiled file 82 | f=True 83 | if f: 84 | if os.path.exists("build"): 85 | shutil.rmtree("build") 86 | if os.path.exists("dist"): 87 | shutil.rmtree("dist") 88 | os.system("find . -name \*.c -delete ; find . 
-name \*.cpp -delete ;") -------------------------------------------------------------------------------- /test/gmatch4py_performance_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.chdir(os.environ["HOME"]) 3 | 4 | def test_mesure(): 5 | import gmatch4py as gm 6 | import networkx as nx 7 | import time 8 | from tqdm import tqdm 9 | import pandas as pd 10 | 11 | 12 | max_=100 13 | size_g=10 14 | graphs_all=[nx.random_tree(size_g) for i in range(max_)] 15 | result_compiled=[] 16 | for size_ in tqdm(range(50,max_,50)): 17 | graphs=graphs_all[:size_] 18 | comparator=None 19 | for class_ in [gm.BagOfNodes,gm.WeisfeleirLehmanKernel, gm.GraphEditDistance, gm.GreedyEditDistance, gm.HED, gm.BP_2, gm.Jaccard, gm.MCS, gm.VertexEdgeOverlap]: 20 | deb=time.time() 21 | if class_ in (gm.GraphEditDistance, gm.BP_2, gm.GreedyEditDistance, gm.HED): 22 | comparator = class_(1, 1, 1, 1) 23 | elif class_ == gm.WeisfeleirLehmanKernel: 24 | comparator = class_(h=2) 25 | else: 26 | comparator=class_() 27 | matrix = comparator.compare(graphs,None) 28 | print([class_.__name__,size_,time.time()-deb]) 29 | result_compiled.append([class_.__name__,size_,time.time()-deb]) 30 | 31 | df = pd.DataFrame(result_compiled,columns="algorithm size_data time_exec_s".split()) 32 | df.to_csv("new_gmatch4py_res_{0}graphs_{1}size.csv".format(max_,size_g)) -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import networkx as nx 4 | 5 | def __import(): 6 | # Gmatch4py use networkx graph 7 | import networkx as nx 8 | import gmatch4py as gm 9 | 10 | 11 | def test_import(): 12 | os.chdir(os.environ["HOME"] ) 13 | __import() 14 | 15 | def test_graph(): 16 | os.chdir(os.environ["HOME"]) 17 | import networkx as nx 18 | import gmatch4py as gm 19 | 20 | # Simple Graph 21 | G1 = nx.Graph() 22 | G2 = nx.Graph() 23 | G1.add_edge("1","2") 24 | G1.add_edge("1","3") 25 | 26 | gm.graph.Graph(G1) 27 | 28 | # Digraph Graph 29 | G1 = nx.DiGraph() 30 | G1.add_edge("1","2") 31 | G1.add_edge("1","3") 32 | assert list(G1.edges()) == gm.graph.Graph(G1).edges() 33 | 34 | G1 = nx.DiGraph() 35 | G1.add_edge("1","2",color="blue") 36 | G1.add_edge("1","2",color="red") 37 | G1.add_edge("1","3",color="green") 38 | assert gm.graph.Graph(G1,edge_attr_key="color").density() == 2 39 | assert gm.graph.Graph(G1).density() == 2 40 | 41 | # Multi Graph 42 | G1 = nx.MultiGraph() 43 | G1.add_edge("1","2",color="blue") 44 | G1.add_edge("1","3",color="green") 45 | assert list(G1.edges()) == gm.graph.Graph(G1).edges() 46 | G1 = nx.MultiGraph() 47 | G1.add_edge("1","2",color="blue") 48 | G1.add_edge("1","3",color="green") 49 | assert len(set([gm.graph.Graph(G1).hash_edge_attr(ed[0],ed[1],ed[2]["color"]) for ed in list(G1.edges(data=True))]).intersection(gm.graph.Graph(G1,edge_attr_key="color").get_edges_hash())) == 2 50 | 51 | G1 = nx.MultiGraph() 52 | G1.add_edge("1","2",color="blue") 53 | G1.add_edge("1","2",color="red") 54 | G1.add_edge("1","3",color="green") 55 | assert gm.graph.Graph(G1,edge_attr_key="color").density() == len(G1.edges(data=True)) 56 | assert gm.graph.Graph(G1).density() == len(G1.edges(data=True)) 57 | 58 | # Multi DiGraph 59 | G1 = nx.MultiDiGraph() 60 | G1.add_edge("1","2",color="blue") 61 | G1.add_edge("1","2",color="red") 62 | G1.add_edge("1","3",color="green") 63 | assert gm.graph.Graph(G1,edge_attr_key="color").density() == 
len(G1.edges(data=True)) 64 | assert gm.graph.Graph(G1).density() == len(G1.edges(data=True)) 65 | 66 | def test_hash(): 67 | os.chdir(os.environ["HOME"]) 68 | import networkx as nx 69 | import gmatch4py as gm 70 | 71 | # Basic HASH 72 | G1 = nx.Graph() 73 | G_gm = gm.graph.Graph(G1) 74 | assert G_gm.hash_edge("1","2") == "1_2" 75 | assert G_gm.hash_edge("2","1") == "1_2" 76 | 77 | # IF directed 78 | G1 = nx.DiGraph() 79 | G1.add_edge("1","2") 80 | G_gm = gm.graph.Graph(G1) 81 | assert G_gm.hash_edge("3","2") == "3_2" 82 | assert G_gm.hash_edge("2","1") == "2_1" 83 | 84 | # IF color and directed 85 | G1 = nx.DiGraph() 86 | G1.add_edge("1","2",color="blue") 87 | G_gm = gm.graph.Graph(G1,edge_attr_key="color") 88 | assert G_gm.hash_edge_attr("3","2","blue") == "3_2_blue" 89 | assert G_gm.get_edges_hash() == {"1_2_blue"} 90 | 91 | # if color and not directed 92 | G1 = nx.Graph() 93 | G1.add_edge("1","2",color="blue") 94 | G_gm = gm.graph.Graph(G1,edge_attr_key="color") 95 | assert G_gm.hash_edge_attr("3","2","blue") == "2_3_blue" 96 | 97 | def test_intersect_union(): 98 | os.chdir(os.environ["HOME"]) 99 | import networkx as nx 100 | import gmatch4py as gm 101 | 102 | # Basic 103 | G1 = nx.Graph() 104 | G1.add_edge("1","2") 105 | G1.add_edge("1","3") 106 | G2 = G1.copy() 107 | G2.add_edge("3","4") 108 | GM1 = gm.graph.Graph(G1) 109 | GM2 = gm.graph.Graph(G2) 110 | 111 | assert GM1.size_edge_union(GM2) == 3 112 | assert GM1.size_node_union(GM2) == 4 113 | 114 | assert GM1.size_edge_intersect(GM2) == 2 115 | assert GM1.size_node_intersect(GM2) == 3 116 | 117 | # BASIC and noised for hash 118 | G1 = nx.Graph() 119 | G1.add_edge("1","2") 120 | G1.add_edge("1","3") 121 | G2 = nx.Graph() 122 | G2.add_edge("1","2") 123 | G2.add_edge("3","1") # Changing the direction (no impact if working) 124 | G2.add_edge("3","4") 125 | GM1 = gm.graph.Graph(G1) 126 | GM2 = gm.graph.Graph(G2) 127 | 128 | assert GM1.size_edge_union(GM2) == 3 129 | assert GM1.size_node_union(GM2) == 4 130 | 131 | assert GM1.size_edge_intersect(GM2) == 2 132 | assert GM1.size_node_intersect(GM2) == 3 133 | 134 | 135 | # Directed 136 | G1 = nx.DiGraph() 137 | G1.add_edge("1","2") 138 | G1.add_edge("1","3") 139 | G2 = nx.DiGraph() 140 | G2.add_edge("1","2") 141 | G2.add_edge("3","1") # Changing the direction (no impact if working) 142 | G2.add_edge("3","4") 143 | GM1 = gm.graph.Graph(G1) 144 | GM2 = gm.graph.Graph(G2) 145 | 146 | assert GM1.size_edge_union(GM2) == 4 147 | assert GM1.size_node_union(GM2) == 4 148 | 149 | assert GM1.size_edge_intersect(GM2) == 1 150 | assert GM1.size_node_intersect(GM2) == 3 151 | 152 | 153 | # IF COLOR 154 | G1 = nx.DiGraph(); G1.add_node("1",color="blue") 155 | G2 = nx.DiGraph(); G2.add_node("1",color="red") 156 | 157 | GM1,GM2 = gm.graph.Graph(G1),gm.graph.Graph(G2) 158 | assert GM1.size_node_intersect(GM2) == 1 159 | GM1,GM2 = gm.graph.Graph(G1,node_attr_key="color"),gm.graph.Graph(G2,node_attr_key="color") 160 | assert GM1.size_node_intersect(GM2) == 0 161 | 162 | 163 | G1 = nx.DiGraph(); G1.add_edge("1","2",color="blue") 164 | G2 = nx.DiGraph(); G2.add_edge("1","2",color="red") 165 | 166 | GM1,GM2 = gm.graph.Graph(G1),gm.graph.Graph(G2) 167 | assert GM1.size_edge_intersect(GM2) == 1 168 | assert GM1.size_edge_union(GM2) == 1 169 | GM1,GM2 = gm.graph.Graph(G1,edge_attr_key="color"),gm.graph.Graph(G2,edge_attr_key="color") 170 | assert GM1.size_edge_intersect(GM2) == 0 171 | assert GM1.size_edge_union(GM2) == 2 172 | 173 | def test_degree(): 174 | os.chdir(os.environ["HOME"]) 175 | import networkx as nx 
176 | import gmatch4py as gm 177 | 178 | # Not DIRECTED and no attr 179 | G1 = nx.Graph() 180 | G1.add_edge("1","2") 181 | G1.add_edge("1","3") 182 | GM1 = gm.graph.Graph(G1) 183 | assert GM1.degree('1') == 2 184 | 185 | G1 = nx.DiGraph() 186 | G1.add_edge("1","2") 187 | G1.add_edge("3","1") 188 | GM1 = gm.graph.Graph(G1) 189 | assert GM1.degree('1') == 2 190 | assert GM1.in_degree('1') == 1 191 | assert GM1.out_degree('1') == 1 192 | 193 | G1 = nx.MultiGraph() 194 | G1.add_edge("1","2",color="blue") 195 | G1.add_edge("1","2",color="red") 196 | G1.add_edge("1","3",color="blue") 197 | GM1 = gm.graph.Graph(G1,edge_attr_key ="color") 198 | 199 | assert GM1.degree_attr('1',"blue") == 2 200 | assert GM1.degree('1') == 3 201 | 202 | G1 = nx.MultiDiGraph() 203 | G1.add_edge("1","2",color="blue") 204 | G1.add_edge("1","2",color="red") 205 | G1.add_edge("1","3",color="green") 206 | GM1 = gm.graph.Graph(G1,edge_attr_key ="color") 207 | assert GM1.in_degree_attr('2','red') == 1 208 | assert GM1.in_degree('2') == 2 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | --------------------------------------------------------------------------------
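The random-walk kernel at the head of this section (random_walk_kernel.pyx) evaluates a truncated series over the tensor-product graph: it sums, for each walk length k up to a cutoff, a weight lambda_k times the total number of k-step walks, then normalises by the squared graph sizes. Here is a minimal NumPy sketch of that series; the function name and the `lambda_list` weight argument are illustrative stand-ins, not part of the library API:

```python
import networkx as nx
import numpy as np

def truncated_walk_kernel(g1, g2, lambda_list):
    # Walks in the tensor (direct) product graph correspond to
    # simultaneous walks in g1 and g2.
    gp = nx.tensor_product(g1, g2)
    A = np.asarray(nx.adjacency_matrix(gp).todense())
    acc = lambda_list[0] * np.identity(A.shape[0])
    a_pow = np.identity(A.shape[0])
    for lam in lambda_list[1:]:
        a_pow = a_pow @ A  # the k-th matrix power counts walks of length k
        acc += lam * a_pow
    # Size normalisation, mirroring the library code
    return np.sum(acc) / (len(g1) ** 2 * len(g2) ** 2)

g1 = nx.complete_bipartite_graph(3, 2)
g2 = nx.complete_bipartite_graph(3, 3)
print(truncated_walk_kernel(g1, g2, [1.0, 0.5, 0.25]))
```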
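Both shortest-path kernel classes follow the library-wide `compare()` contract: a list of `networkx` graphs in, a normalised similarity matrix out. A short usage sketch, assuming `ShortestPathGraphKernel` is exported at the package top level like the classes exercised in test/gmatch4py_performance_test.py:

```python
import networkx as nx
import gmatch4py as gm

graphs = [nx.random_tree(10) for _ in range(5)]

sp_kernel = gm.ShortestPathGraphKernel()
k_sp = sp_kernel.compare(graphs, None)       # (5, 5) matrix with unit diagonal

wl_kernel = gm.WeisfeleirLehmanKernel(h=2)   # class name as spelled in the source
k_wl = wl_kernel.compare(graphs, None)
```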
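The computation in vertex_edge_overlap.pyx reduces to VEO(G, G') = 2(|V ∩ V'| + |E ∩ E'|) / (|V| + |V'| + |E| + |E'|). A hand-checkable sketch of that arithmetic, with plain Python sets standing in for the library's hashed node and edge sets:

```python
# Toy graphs: G has edges a-b and a-c; G' additionally has c-d.
V1, E1 = {"a", "b", "c"}, {("a", "b"), ("a", "c")}
V2, E2 = {"a", "b", "c", "d"}, {("a", "b"), ("a", "c"), ("c", "d")}

overlap = 2 * (len(V1 & V2) + len(E1 & E2))    # 2 * (3 + 2) = 10
denom = len(V1) + len(V2) + len(E1) + len(E2)  # 3 + 4 + 2 + 3 = 12
print(overlap / denom)                         # ~0.83: strongly overlapping graphs
```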
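`MCS` scores a pair as |V ∩ V'| / max(|V|, |V'|) over hashed node sets, and `VertexRanking` computes Spearman's rank correlation between the PageRank vectors of the two graphs restricted to their shared nodes. A sketch of both, assuming `VertexRanking` is exported at the top level (`MCS` and `VertexEdgeOverlap` are confirmed by test/gmatch4py_performance_test.py):

```python
import networkx as nx
import gmatch4py as gm

g1 = nx.complete_bipartite_graph(4, 3)
g2 = nx.complete_bipartite_graph(5, 3)

for cls in (gm.MCS, gm.VertexEdgeOverlap, gm.VertexRanking):
    matrix = cls().compare([g1, g2], None)  # 2x2 similarity matrix
    print(cls.__name__, matrix[0][1])
```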