├── tests ├── __init__.py ├── .DS_Store ├── test_factory.py ├── test_gaston.py ├── test_embedding.py └── test_search.py ├── gaston_py ├── __init__.py ├── .DS_Store ├── level.py ├── tree.py ├── cycle.py ├── node.py ├── fragment.py ├── path.py ├── gaston.py ├── embedding.py ├── command_line.py ├── graph.py ├── search.py └── factory.py ├── .DS_Store ├── bin └── gaston ├── setup.py ├── test_files ├── small_chemical.txt └── medium_chemical.txt ├── .gitignore └── README.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gaston_py/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ColinConduff/FrequentSubgraphMining/HEAD/.DS_Store -------------------------------------------------------------------------------- /bin/gaston: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from gaston_py.command_line import main 4 | main() -------------------------------------------------------------------------------- /tests/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ColinConduff/FrequentSubgraphMining/HEAD/tests/.DS_Store -------------------------------------------------------------------------------- /gaston_py/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ColinConduff/FrequentSubgraphMining/HEAD/gaston_py/.DS_Store -------------------------------------------------------------------------------- /gaston_py/level.py: 
class Level(IntEnum):
    """ Integer tags naming the search queue used for each kind of subgraph. """
    # Members are numbered in declaration order: NODE=0, PATH=1, TREE=2, CYCLE=3.
    NODE, PATH, TREE, CYCLE = range(4)
"""Packaging script for the gaston_py distribution."""
from setuptools import setup

setup(
    name='gaston_py',
    version='0.1',
    description='Python implementation of the Gaston graph mining algorithm.',
    url='',
    author='Colin Conduff',
    author_email='colin.conduff@mst.edu',
    license='MIT',
    packages=['gaston_py'],
    install_requires=[
        # The package relies on the networkx 1.x API (nodes_iter, edges_iter,
        # neighbors_iter, Graph.node, Graph.edge), which networkx 2.0 removed.
        'networkx>=1.11,<2.0',
        'matplotlib',
    ],
    scripts=['bin/gaston'],
    entry_points={
        # The console script has to call the argparse front end. Pointing it at
        # gaston_py.gaston:gaston would invoke gaston() with no arguments and
        # fail immediately with a TypeError.
        'console_scripts': ['gaston=gaston_py.command_line:main'],
    },
    test_suite='nose.collector',
    tests_require=['nose'],
    include_package_data=True,
    zip_safe=False,
)
class Node(Fragment):
    """ A fragment whose subgraph consists of a single node. """

    def __init__(self, source_node_id, source_graph):
        # The node's label is both the content of the one-node graph and the
        # sole entry of its embedding tuple.
        label = source_graph.node[source_node_id]['label']
        single_node_graph = graph_module.create_nx_node_graph(source_node_id, label)

        super().__init__(source_node_id, single_node_graph, source_graph, (label,))

    def __str__(self):
        return "Node"

    @property
    def queue_level(self):
        """ The search queue that holds node fragments. """
        return Level.NODE
class GastonTestCase(unittest.TestCase):
    """ End-to-end checks of gaston() against the bundled line graph files. """

    ORIG_CHEM_DATASET = 'test_files/Chemical_340.txt'
    SMALL_DATASET = 'test_files/small_chemical.txt'
    MEDIUM_DATASET = 'test_files/medium_chemical.txt'

    def test_gaston_with_small_dataset(self):
        """ Output maps tuple keys to (graph, graph type, frequency) tuples. """
        result = gaston(min_support=6, input_file=self.SMALL_DATASET)

        for key, value in result.items():
            self.assertIsInstance(key, tuple)
            self.assertIsInstance(value, tuple)

        known_types = {'Node', 'Path', 'Tree', 'Cycle'}
        for nx_graph, graph_type, frequency in result.values():
            self.assertIsInstance(nx_graph, nx.Graph)
            self.assertIn(graph_type, known_types)
            self.assertIsInstance(frequency, int)
            self.assertGreaterEqual(frequency, 0)

    def test_does_not_generate_unwanted_graph_types(self):
        """ With both suppression flags set, no Tree or Cycle is reported. """
        result = gaston(min_support=0.95,
                        input_file=self.SMALL_DATASET,
                        dont_generate_trees=True,
                        dont_generate_cycles=True)

        suppressed_types = {'Tree', 'Cycle'}
        for _, graph_type, _ in result.values():
            self.assertNotIn(graph_type, suppressed_types)
| lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # Visual Studio 101 | .vscode/ -------------------------------------------------------------------------------- /tests/test_embedding.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest 3 | import networkx as nx 4 | import gaston_py.embedding as emb_module 5 | 6 | class EmbeddingTestCase(unittest.TestCase): 7 | 8 | def setUp(self): 9 | graph = nx.Graph(id=1) 10 | graph.add_node(1, label=0) 11 | graph.add_node(2, label=0) 12 | graph.add_edge(1, 2, label=0) 13 | graph.add_node(3, label=3) 14 | graph.add_node(4, label=4) 15 | graph.add_edge(1, 3, label=13) 16 | graph.add_edge(3, 4, label=34) 17 | 
class Fragment(object):
    """
    Base class for node, path, tree, and cycle fragments.

    current_graph: a networkx graph representing a subgraph in source_graph
    source_node_id: the id of the source node in the graph
    source_graph: a networkx graph containing current_graph
    embedding_list: a unique tuple representation of current_graph
    hash_value: hash of the sorted (node id, node label) pairs followed by the
        sorted edge labels of current_graph; drives __hash__ and __eq__
    """

    def __init__(self, source_node_id, current_graph, source_graph, embedding_list):
        self.source_node_id = source_node_id
        self.current_graph = current_graph
        self.source_graph = source_graph
        self.embedding_list = embedding_list

        # Ensure correctness of gaston algorithm using unique hash values for each subgraph/fragment
        # Will be removed if correctness of faster methods are proven
        node_keys = sorted(
            (u, data['label']) for u, data in self.current_graph.nodes_iter(data=True))
        edge_keys = sorted(
            data['label'] for _, _, data in self.current_graph.edges_iter(data=True))
        self.hash_value = hash(tuple(node_keys + edge_keys))

    @property
    def frontier_edges(self):
        """
        Yields (node_id, neighbor_id) tuples for every edge of source_graph that
        leaves a node of current_graph and is not yet an edge of current_graph.
        Used to find possible refinements to fragments.
        """
        for node_id in self.current_graph:
            # Adjacency of node_id inside the fragment built so far.
            edges = self.current_graph.edge[node_id]
            if node_id in self.source_graph:
                for neighbor_id in self.source_graph.neighbors_iter(node_id):
                    if neighbor_id not in edges:
                        yield (node_id, neighbor_id)

    def __hash__(self):
        return self.hash_value

    def __eq__(self, other):
        # Fragments compare by hash_value only. Anything that is not a Fragment
        # is handed back to Python's default comparison instead of raising
        # AttributeError on the missing hash_value attribute.
        if not isinstance(other, Fragment):
            return NotImplemented
        return self.hash_value == other.hash_value
class Path(Fragment):
    """ A fragment whose subgraph is a path. """

    def __init__(self, source_node_id, back_node_id, current_graph,
                 source_graph, embedding_list,
                 total_symmetry, front_symmetry, back_symmetry):

        super().__init__(source_node_id, current_graph, source_graph, embedding_list)
        self.back_node_id = back_node_id
        (self.total_symmetry,
         self.front_symmetry,
         self.back_symmetry) = (total_symmetry, front_symmetry, back_symmetry)

    def __str__(self):
        return "Path"

    @property
    def queue_level(self):
        """ The search queue that holds path fragments. """
        return Level.PATH

    @property
    def symmetries(self):
        """ The (total, front, back) symmetry values as one tuple. """
        return (self.total_symmetry, self.front_symmetry, self.back_symmetry)

    @staticmethod
    def compute_symmetry(embedding_list, reversed_list=None):
        """
        Compares embedding_list with reversed_list in O(n).

        Returns 0 when they are equal, 1 when embedding_list sorts before
        reversed_list, and -1 otherwise. When reversed_list is omitted, the
        reversal of embedding_list is used.
        """
        mirrored = tuple(reversed(embedding_list)) if reversed_list is None else reversed_list

        if embedding_list == mirrored:
            return 0
        if embedding_list < mirrored:
            return 1
        return -1

    # TODO: an O(1) update that derives the new symmetries from the previous
    # (total, front, back) values and the first/new edge was sketched but never
    # finished. It is direction dependent and still lacks the new back
    # symmetry, so the slower method below is used to ensure correctness.

    @staticmethod
    def new_path_symmetries(embedding_list):
        """ Recomputes (total, front, back) symmetries from the full embedding. """
        mirrored = tuple(reversed(embedding_list))

        # NOTE(review): mirrored[:-2] is the reversal of embedding_list[2:], not
        # of embedding_list[:-2] (and vice versa for the back value). Confirm
        # against the consumer of these values that this pairing is intended.
        total = Path.compute_symmetry(embedding_list, mirrored)
        front = Path.compute_symmetry(embedding_list[:-2], mirrored[:-2])
        back = Path.compute_symmetry(embedding_list[2:], mirrored[2:])

        return (total, front, back)
It appears that in practice most frequent 9 | graphs are not actually very complex structures; Gaston uses this quickstart observation to organize the search 10 | space efficiently. To determine the frequency of graphs, Gaston employs an occurrence list based approach in 11 | which all occurrences of a small set of graphs are stored in main memory." 12 | - Siegfried Nijssen (Gaston author) 13 | 14 | Software Requirements: 15 | - python 3.3 or later 16 | - networkx 1.11 17 | - matplotlib 18 | 19 | Installation: 20 | `cd gaston_py` 21 | `pip install .` 22 | 23 | Command Line Interface: 24 | For usage instructions, use the following command. 25 | `gaston -h` 26 | 27 | Examples: 28 | `gaston 0.95 test_files/medium_chemical.txt -o output_files/ -c -t` 29 | `gaston 6 test_files/medium_chemical.txt` 30 | `gaston 2 test_files/Chemical_340.txt` 31 | 32 | Notes: 33 | - Support is defined as frequency(subgraph) / count(graphs). See reference [1] below for details. 34 | - If an output directory is provided: 35 | * Frequent subgraphs are drawn using matplotlib and saved under [output folder]/graphs/. 36 | * A `line_graphs.txt` file is generated containing the frequent subgraphs in Line Graph format. 37 | - Test files are available in the `test_files` directory. The Chemical_340 dataset was obtained 38 | from Nijssen and Kok's website. 39 | 40 | Run Tests: 41 | `python setup.py test` 42 | 43 | Performance: 44 | - This Python implementation is significantly slower than the original implementation 45 | in C++ by Nijssen and Kok, as well as the Java implementation by the ParSeMiS library. 46 | - It is also much slower than gSpan implementations in Python. This is likely due 47 | to the use of an occurrence list, which tracks all subgraphs that have been seen to 48 | ensure the correctness of the algorithm. 49 | 50 | References: 51 | [1] Siegfried Nijssen and Joost Kok. A Quickstart in Frequent Structure Mining Can 52 | Make a Difference. Proceedings of the SIGKDD, 2004. 
def gaston(min_support, input_file,
           dont_generate_cycles=False, dont_generate_trees=False,
           should_print_graph_information=False):
    """
    Loads the graphs stored in a line graph file and searches them for
    frequently occurring subgraphs.

    The minimum frequency handed to the search is min_support multiplied by
    the number of input graphs, truncated to an int and never lower than 1.

    Args:
        min_support: a float specifying the minimum support
        input_file: a file path to the input file containing line graphs
        dont_generate_cycles: a flag specifying whether to generate cycles
        dont_generate_trees: a flag specifying whether to generate trees
        should_print_graph_information: a flag specifying whether to print graph info

    Returns:
        a dictionary of the form {embedding_list: (subgraph, graph type, frequency)}
    """
    graphs = graph_module.read_line_graphs(input_file)
    min_frequency = max(1, int(min_support * len(graphs)))

    if should_print_graph_information:
        print_graph_information(graphs, min_frequency)

    initial_fragments = factory.initial_node_fragments(graphs)
    return search.find_frequent_subgraphs(
        initial_fragments, min_frequency, dont_generate_cycles, dont_generate_trees)
def print_statistics(frequent_output):
    """
    Prints frequencies by graph type and embedding list.

    Args:
        frequent_output: a dictionary of the form
            {embedding_list: (subgraph, graph type, frequency)}
    """
    graph_type_frequency = Counter(graph_type for _, graph_type, _ in frequent_output.values())

    print("Frequencies:")
    print("Nodes: {}".format(graph_type_frequency['Node']))
    print("Paths: {}".format(graph_type_frequency['Path']))
    print("Trees: {}".format(graph_type_frequency['Tree']))
    print("Cycles: {}\n".format(graph_type_frequency['Cycle']))

    print("Frequent Subgraphs:")
    for embedding_list, (_, _, frequency) in frequent_output.items():
        # Labels are strings when read from a line graph file but may be ints
        # for graphs built in code; str.join only accepts strings.
        embedding_text = ''.join(map(str, embedding_list))
        print("embedding_list: {}, frequency: {}".format(embedding_text, frequency))
def _embedding_list_with_comparison(graph, source_id, alt_source_id, node_label, alt_node_label):
    """
    Builds the embedding tuple rooted at source_id while comparing it, step by
    step, with the traversal rooted at alt_source_id.

    Args:
        graph: the networkx graph to traverse
        source_id: node id the returned embedding starts from
        alt_source_id: node id of the competing start node
        node_label: label of source_id; first entry of the embedding
        alt_node_label: label of alt_source_id (not read by this function)

    Returns:
        the embedding as a tuple, or None when the traversal from
        alt_source_id compares smaller, or compares equal throughout and has
        the smaller sequence of visited node ids
    """

    embedding_list = [node_label]
    # Node ids reported by each traversal; only consulted as a tie-break when
    # every (edge label, neighbor label) pair matched.
    source_node_ids = [source_id]
    alt_node_ids = [alt_source_id]

    embed_iter = _create_embedding_list(graph, visited=set(), node_id=source_id)
    alt_embed_iter = _create_embedding_list(graph, visited=set(), node_id=alt_source_id)

    # True while both traversals have produced identical pairs so far.
    should_compare = True

    while True:
        try:
            node_id, edge = next(embed_iter)
            embedding_list.extend(edge)

            if should_compare:
                alt_node_id, alt_edge = next(alt_embed_iter)

                source_node_ids.append(node_id)
                alt_node_ids.append(alt_node_id)

                if edge > alt_edge:
                    # The alternative start yields a smaller embedding.
                    return None
                elif edge < alt_edge:
                    # This start already wins; keep building without comparing.
                    should_compare = False
                    del source_node_ids
                    del alt_node_ids

        except StopIteration:
            # Either traversal running out of edges ends the loop.
            break

    # The id lists were deleted when should_compare was cleared; the
    # short-circuit `and` keeps them from being read in that case.
    if should_compare and source_node_ids > alt_node_ids:
        return None

    return tuple(embedding_list)
def main():
    """
    A command line interface for interacting with the gaston python implementation.

    Args (read from the command line):
        min_support: a float or integer
        input_file_path: file path to a text file in line graph format
        output_folder_path: location to output frequent subgraphs in line graph format and drawings
        dont_generate_cycles: a flag to specify that cycles should not be generated
        dont_generate_trees: a flag to specify that trees should not be generated

    Raises:
        argparse.ArgumentTypeError: if min_support is not positive, or the
            input file or the output folder does not exist.

    Examples:
        gaston 0.95 test_files/medium_chemical.txt -o output_files/ -c -t
        gaston 6 test_files/medium_chemical.txt
        gaston 3 test_files/Chemical_340.txt
    """

    # Parse command line input
    parser = argparse.ArgumentParser(description=DESCRIPTION)

    parser.add_argument("min_support", type=float, help='Minimum support for the gaston algorithm.')
    parser.add_argument("input_file_path",
                        help='Input file path containing graphs in line graph format.')
    parser.add_argument("-o", "--output_folder_path", help='Ouput location for frequent subgraphs.')
    parser.add_argument("-c", "--dont_generate_cycles", default=False,
                        help='Do not generate cyclic subgraphs.', action="store_true")
    parser.add_argument("-t", "--dont_generate_trees", default=False,
                        help='Do not generate tree subgraphs.', action="store_true")

    args = parser.parse_args()

    # Validate input
    if args.min_support <= 0:
        raise argparse.ArgumentTypeError("\n\n\t Minimum support must be greater than 0.\n")
    if not os.path.exists(args.input_file_path):
        raise argparse.ArgumentTypeError(
            "\n\n\t The input file path '{}' does not exist.\n".format(args.input_file_path))
    if args.output_folder_path is not None and not os.path.exists(args.output_folder_path):
        raise argparse.ArgumentTypeError(
            "\n\n\t The output folder path '{}' does not exist.\n".format(args.output_folder_path))

    print("\nMinimum Support:{}".format(args.min_support))

    if args.dont_generate_cycles:
        print("Cycles will not be generated.")

    if args.dont_generate_trees:
        print("Trees will not be generated.")

    frequent_output = gaston_alg.gaston(args.min_support, args.input_file_path,
                                        args.dont_generate_cycles, args.dont_generate_trees,
                                        should_print_graph_information=True)

    gaston_alg.print_statistics(frequent_output)

    # If a file path to an output folder is provided,
    # write frequently occurring subgraphs to 'line_graphs.txt' in line graph format.
    # Also, create a graphs folder if necessary, draw graphs, and save them to a 'graphs' folder.
    if args.output_folder_path is not None:
        print("\nProcessing output...")

        # Use the folder exactly as given. os.path.dirname() would drop the
        # last component whenever the path has no trailing separator
        # ('-o output_files' became '' and 'out/run1' became 'out').
        output_dir = args.output_folder_path
        output_file_path = os.path.join(output_dir, "line_graphs.txt")
        # The trailing empty component keeps a path separator at the end,
        # because draw_nx_graphs appends the file name directly to this prefix.
        graph_drawings_file_path = os.path.join(output_dir, "graphs", "")

        os.makedirs(graph_drawings_file_path, exist_ok=True)

        gaston_alg.write_frequent_subgraphs_to_file_path(output_file_path, frequent_output)
        graph_module.draw_nx_graphs(graph_drawings_file_path, frequent_output)

    print("Completed execution of program.\n")
""" 7 | 8 | for graph_id, (embedding, (graph, graph_type, frequency)) in enumerate(frequent_output.items()): 9 | 10 | figure = plt.figure(graph_id) 11 | 12 | node_labels = {id: data['label'] for (id, data) in graph.nodes_iter(data=True)} 13 | edge_labels = {(u, v): data['label'] for (u, v, data) in graph.edges_iter(data=True)} 14 | 15 | pos = nx.spring_layout(graph, k=0.8) 16 | nx.draw_networkx(graph, pos, node_color='b', alpha=0.5, labels=node_labels) 17 | nx.draw_networkx_edge_labels(graph, pos, edge_color='b', alpha=0.3, edge_labels=edge_labels) 18 | 19 | plt.axis('off') 20 | plt.title("{}\nGraph Type: {}, Frequency: {}".format( 21 | ''.join(embedding), graph_type, frequency)) 22 | plt.savefig("{}graph{}.png".format(output_file_path, graph_id)) 23 | 24 | plt.close(figure) 25 | 26 | def create_nx_node_graph(source_node_id, node_label): 27 | current_graph = nx.Graph() 28 | current_graph.add_node(source_node_id, label=node_label) 29 | return nx.freeze(current_graph) 30 | 31 | def create_nx_graph(origin_id, origin_label, target_id, target_label, edge_label): 32 | graph = nx.Graph() 33 | graph.add_node(origin_id, label=origin_label) 34 | graph.add_node(target_id, label=target_label) 35 | graph.add_edge(origin_id, target_id, label=edge_label) 36 | return nx.freeze(graph) 37 | 38 | def extend_nx_graph(prev_graph, origin_node_id, target_node_id, node_label, edge_label): 39 | graph = nx.Graph(prev_graph) 40 | graph.add_node(target_node_id, label=node_label) 41 | graph.add_edge(origin_node_id, target_node_id, label=edge_label) 42 | return nx.freeze(graph) 43 | 44 | def read_line_graphs(file_path): 45 | """ 46 | Returns a list of NetworkX graph objects read from a line graph file. 
def read_line_graphs(file_path):
    """
    Returns a list of NetworkX graph objects read from a line graph file.

    Recognised line prefixes:
        "t #"  starts a new graph; graphs are numbered 1, 2, ... in file order
               (the number written after "t #" is not used).
        "v"    "v <node_id> <label...>" adds a node to the current graph.
        "e"    "e <source_id> <target_id> <label...>" adds an edge.
        "#=>"  "#=> <embedding>" appends to the graph's 'embeddings' attribute.

    Node ids are kept as the strings found in the file, and single quotes
    surrounding a label are stripped.

    Raises:
        KeyError: if a "v", "e" or "#=>" line appears before the first "t #".
    """
    graph_map = {}

    with open(file_path, "r") as f:
        graph_id = 0

        for line in f:
            line = line.strip()
            characters = line.split(" ")

            if line.startswith("t #"):
                graph_id += 1
                graph_map[graph_id] = nx.Graph(id=graph_id, embeddings=[])

            elif line.startswith("v"):
                label = " ".join(characters[2:]).strip('\'')
                graph_map[graph_id].add_node(characters[1], label=label)

            elif line.startswith("e"):
                label = " ".join(characters[3:]).strip('\'')
                graph_map[graph_id].add_edge(characters[1], characters[2], label=label)

            elif line.startswith("#=>"):
                graph_map[graph_id].graph['embeddings'].append(characters[1])

    # Materialise the view so the result really is the documented list.
    return list(graph_map.values())


def write_line_graphs(graphs, file_path):
    """
    Write line graphs to file path.

    Each graph is written as a "t # <id>" header followed by its "v" and "e"
    lines. The header uses the graph's 'id' attribute when present and the
    graph's position in `graphs` otherwise. Nodes are renumbered 0..n-1 in
    iteration order, and edges refer to those new numbers.
    """
    with open(file_path, "w") as f:
        for g_id, graph in enumerate(graphs):
            f.write("t # {}\n".format(graph.graph.get('id', g_id)))

            # Maps original node id -> index written to the file.
            node_dict = {}

            # nodes(data=True) / edges(data=True) replace nodes_iter /
            # edges_iter, which were removed in NetworkX 2.0.
            for index, (node_id, data) in enumerate(graph.nodes(data=True)):
                node_dict[node_id] = index
                f.write("v {} {}\n".format(index, data['label']))

            for source, target, data in graph.edges(data=True):
                f.write("e {} {} {}\n".format(node_dict[source], node_dict[target], data['label']))


def count_total_nodes(graphs):
    """ Return the number of nodes summed over all graphs. """
    return sum(graph.number_of_nodes() for graph in graphs)


def count_total_edges(graphs):
    """ Return the number of edges summed over all graphs. """
    return sum(graph.number_of_edges() for graph in graphs)


def count_unique_nodes(graphs):
    """ Return the number of distinct node labels found across all graphs. """
    return len({data['label'] for graph in graphs for _, data in graph.nodes(data=True)})


def count_unique_edges(graphs):
    """ Return the number of distinct edge labels found across all graphs. """
    return len({edge['label'] for graph in graphs for _, _, edge in graph.edges(data=True)})
def find_frequent_subgraphs(initial_node_fragments, min_freq,
                            dont_generate_cycles=False, dont_generate_trees=False):
    """
    Perform a level-order search for frequently occurring subgraphs.
    An iterative approach is used rather than the recursive approach used by
    the original Gaston algorithm.

    Levels:
        level 0: nodes
        level 1: paths
        level 2: trees
        level 3: cycles

    The frequency of an embedding is incremented once per dequeued fragment
    carrying that embedding. As soon as an embedding reaches min_freq, the
    fragments collected for it so far are refined and the resulting fragments
    are queued on the level given by their `queue_level`.

    Args:
        initial_node_fragments: Iterable of single-node fragments that seed level 0.
        min_freq: Minimum frequency for an embedding to be considered frequent.
        dont_generate_cycles: Stop before the cycle level.
        dont_generate_trees: Stop before the tree level. Because the level loop
            is left with `break`, the cycle level is skipped as well.

    Returns:
        a dictionary of the form {embedding_list: (subgraph, graph_type, frequency)}
    """

    frequencies = {}         # {embedding_list: frequency}
    frequent_fragments = {}  # {embedding_list: fragment}
    visited_fragments = {}   # {graph_id: set(fragments)}

    levels = (Level.NODE, Level.PATH, Level.TREE, Level.CYCLE)
    queues = (deque(initial_node_fragments), deque(), deque(), deque())

    for level in levels:

        if (dont_generate_trees and level == Level.TREE) or \
                (dont_generate_cycles and level == Level.CYCLE):
            break

        pending = queues[level]
        fragments_dict = {}  # {embedding_list: [fragments]}

        while pending:

            fragment = pending.popleft()
            embedding_list = fragment.embedding_list

            _update_frequencies(frequencies, embedding_list)
            _update_fragments_dict(fragments_dict, embedding_list, fragment)

            if _is_frequent(frequencies, embedding_list, min_freq):
                frequent_fragments[embedding_list] = fragment

                # Generate the next fragments and append each one to the
                # queue that corresponds to its graph type (possibly the
                # queue currently being drained).
                for next_fragment in _next_fragments(fragments_dict[embedding_list],
                                                     dont_generate_cycles, dont_generate_trees,
                                                     visited_fragments):
                    queues[next_fragment.queue_level].append(next_fragment)

                # Already refined; later fragments with this embedding start a new list.
                del fragments_dict[embedding_list]

        # The remaining fragments in fragments_dict are infrequent
        _remove_from_source_graphs(fragments_dict.values())

    return dict(_output(frequent_fragments, frequencies))


def _output(frequent_fragments, frequencies):
    """ Yield (embedding, (subgraph, graph_type, frequency)) for every frequent fragment. """
    for embedding, fragment in frequent_fragments.items():
        yield embedding, (fragment.current_graph, str(fragment), frequencies[embedding])


def _update_frequencies(frequencies, embedding_list):
    """ Increment the counter stored for embedding_list, starting from zero. """
    frequencies[embedding_list] = frequencies.get(embedding_list, 0) + 1


def _update_fragments_dict(fragments_dict, embedding_list, fragment):
    """ Append fragment to the list of fragments sharing embedding_list. """
    fragments_dict.setdefault(embedding_list, []).append(fragment)


def _is_frequent(frequencies, embedding_list, min_freq):
    """ Return True when embedding_list has been counted at least min_freq times. """
    return frequencies[embedding_list] >= min_freq
def _next_fragments(fragments, dont_generate_cycles, dont_generate_trees, visited_fragments):
    """
    Returns fragments that can be produced by refining previously created frequent fragments.

    Every frontier edge of every fragment is passed to factory.apply_refinement.
    Refinements that return None are dropped, as are fragments already recorded
    in visited_fragments for the same source graph. Each fragment that is
    yielded is recorded in visited_fragments once the generator is resumed.
    """
    for prev_fragment in fragments:
        for edge in prev_fragment.frontier_edges:
            next_fragment = factory.apply_refinement(prev_fragment, edge,
                                                     dont_generate_cycles, dont_generate_trees)
            if next_fragment is None:
                continue

            if _fragment_was_not_already_visited(visited_fragments, next_fragment):
                yield next_fragment
                _update_visited_fragments(visited_fragments, next_fragment)


def _fragment_was_not_already_visited(visited_fragments, fragment):
    """ Return True unless fragment is already recorded for its source graph. """
    graph_id = fragment.source_graph.graph['id']
    return fragment not in visited_fragments.get(graph_id, ())


def _update_visited_fragments(visited_fragments, fragment):
    """ Record fragment in the visited set kept for its source graph's id. """
    graph_id = fragment.source_graph.graph['id']
    visited_fragments.setdefault(graph_id, set()).add(fragment)


def _remove_from_source_graphs(nested_fragments):
    """
    Remove the nodes of the given fragments from their source graphs.

    nested_fragments is an iterable of fragment lists. The source graphs are
    modified in place; nodes that are no longer present are skipped.
    """
    for fragments in nested_fragments:
        for fragment in fragments:
            source_graph = fragment.source_graph
            for node_id in fragment.current_graph:
                if node_id in source_graph:
                    source_graph.remove_node(node_id)
17 20 0 51 | e 13 21 1 52 | e 10 22 1 53 | e 16 23 0 54 | e 23 24 0 55 | e 23 25 0 56 | t # 1 57 | v 0 0 58 | v 1 0 59 | v 2 0 60 | v 3 0 61 | v 4 0 62 | v 5 0 63 | v 6 1 64 | v 7 1 65 | v 8 1 66 | v 9 1 67 | v 10 1 68 | v 11 4 69 | v 12 4 70 | v 13 6 71 | v 14 5 72 | v 15 3 73 | e 0 1 3 74 | e 1 2 3 75 | e 2 3 3 76 | e 3 4 3 77 | e 4 5 3 78 | e 5 0 3 79 | e 0 6 0 80 | e 1 7 0 81 | e 2 8 0 82 | e 3 9 0 83 | e 5 10 0 84 | e 4 11 0 85 | e 11 12 0 86 | e 11 13 0 87 | e 13 14 0 88 | e 12 15 1 89 | t # 2 90 | v 0 0 91 | v 1 0 92 | v 2 0 93 | v 3 0 94 | v 4 0 95 | v 5 0 96 | v 6 1 97 | v 7 1 98 | v 8 1 99 | v 9 1 100 | v 10 0 101 | v 11 0 102 | v 12 0 103 | v 13 0 104 | v 14 0 105 | v 15 0 106 | v 16 1 107 | v 17 1 108 | v 18 1 109 | v 19 1 110 | v 20 7 111 | v 21 7 112 | v 22 8 113 | v 23 8 114 | v 24 8 115 | v 25 8 116 | e 0 1 3 117 | e 1 2 3 118 | e 2 3 3 119 | e 3 4 3 120 | e 4 5 3 121 | e 5 0 3 122 | e 1 6 0 123 | e 2 7 0 124 | e 4 8 0 125 | e 5 9 0 126 | e 10 11 3 127 | e 11 12 3 128 | e 12 13 3 129 | e 13 14 3 130 | e 14 15 3 131 | e 15 10 3 132 | e 10 16 0 133 | e 11 17 0 134 | e 13 18 0 135 | e 14 19 0 136 | e 3 20 0 137 | e 20 15 0 138 | e 20 21 1 139 | e 21 22 0 140 | e 21 23 0 141 | e 0 24 0 142 | e 12 25 0 143 | t # 3 144 | v 0 9 145 | v 1 9 146 | v 2 7 147 | v 3 7 148 | v 4 9 149 | v 5 9 150 | v 6 9 151 | v 7 7 152 | v 8 7 153 | v 9 1 154 | v 10 1 155 | v 11 1 156 | v 12 1 157 | v 13 9 158 | v 14 8 159 | v 15 8 160 | v 16 8 161 | v 17 1 162 | v 18 8 163 | v 19 8 164 | v 20 8 165 | v 21 8 166 | e 0 1 0 167 | e 1 2 0 168 | e 2 3 1 169 | e 3 4 0 170 | e 4 5 0 171 | e 5 0 0 172 | e 5 6 0 173 | e 6 7 0 174 | e 7 8 1 175 | e 8 0 0 176 | e 0 9 0 177 | e 5 10 0 178 | e 7 11 0 179 | e 8 12 0 180 | e 4 13 0 181 | e 13 1 0 182 | e 13 14 0 183 | e 13 15 0 184 | e 6 16 0 185 | e 6 17 0 186 | e 4 18 0 187 | e 3 19 0 188 | e 2 20 0 189 | e 1 21 0 190 | t # 4 191 | v 0 9 192 | v 1 9 193 | v 2 8 194 | v 3 8 195 | v 4 8 196 | v 5 8 197 | v 6 8 198 | v 7 1 199 | e 0 1 0 200 | 
e 0 2 0 201 | e 0 3 0 202 | e 0 4 0 203 | e 1 5 0 204 | e 1 6 0 205 | e 1 7 0 206 | t # 5 207 | v 0 9 208 | v 1 9 209 | v 2 8 210 | v 3 1 211 | v 4 1 212 | v 5 8 213 | v 6 8 214 | v 7 8 215 | e 0 1 0 216 | e 1 2 0 217 | e 1 3 0 218 | e 1 4 0 219 | e 0 5 0 220 | e 0 6 0 221 | e 0 7 0 222 | t # 6 223 | v 0 9 224 | v 1 9 225 | v 2 8 226 | v 3 8 227 | v 4 1 228 | v 5 8 229 | v 6 8 230 | v 7 1 231 | e 0 1 0 232 | e 0 2 0 233 | e 0 3 0 234 | e 0 4 0 235 | e 1 5 0 236 | e 1 6 0 237 | e 1 7 0 238 | t # 7 239 | v 0 7 240 | v 1 7 241 | v 2 8 242 | v 3 8 243 | v 4 8 244 | v 5 1 245 | e 0 1 1 246 | e 0 2 0 247 | e 0 3 0 248 | e 1 4 0 249 | e 1 5 0 250 | t # 8 251 | v 0 9 252 | v 1 9 253 | v 2 7 254 | v 3 7 255 | v 4 9 256 | v 5 9 257 | v 6 9 258 | v 7 7 259 | v 8 7 260 | v 9 9 261 | v 10 1 262 | v 11 1 263 | v 12 1 264 | v 13 1 265 | v 14 9 266 | v 15 9 267 | v 16 1 268 | v 17 1 269 | v 18 8 270 | v 19 8 271 | v 20 8 272 | v 21 8 273 | v 22 8 274 | v 23 8 275 | v 24 1 276 | v 25 1 277 | e 0 1 0 278 | e 1 2 0 279 | e 2 3 1 280 | e 3 4 0 281 | e 4 5 0 282 | e 5 0 0 283 | e 5 6 0 284 | e 6 7 0 285 | e 7 8 1 286 | e 8 9 0 287 | e 9 0 0 288 | e 0 10 0 289 | e 5 11 0 290 | e 7 12 0 291 | e 8 13 0 292 | e 4 14 0 293 | e 14 1 0 294 | e 6 15 0 295 | e 6 16 0 296 | e 15 9 0 297 | e 9 17 0 298 | e 14 18 0 299 | e 14 19 0 300 | e 4 20 0 301 | e 1 21 0 302 | e 3 22 0 303 | e 2 23 0 304 | e 15 24 0 305 | e 15 25 0 306 | t # 9 307 | v 0 9 308 | v 1 9 309 | v 2 9 310 | v 3 1 311 | v 4 1 312 | v 5 9 313 | v 6 1 314 | v 7 1 315 | v 8 9 316 | v 9 9 317 | v 10 1 318 | v 11 1 319 | v 12 9 320 | v 13 1 321 | v 14 1 322 | v 15 9 323 | v 16 1 324 | v 17 1 325 | v 18 9 326 | v 19 9 327 | v 20 9 328 | v 21 1 329 | v 22 1 330 | v 23 9 331 | v 24 9 332 | v 25 1 333 | v 26 1 334 | v 27 9 335 | v 28 1 336 | v 29 1 337 | v 30 9 338 | v 31 9 339 | v 32 1 340 | v 33 1 341 | v 34 9 342 | v 35 9 343 | v 36 1 344 | v 37 1 345 | v 38 9 346 | v 39 1 347 | v 40 1 348 | v 41 9 349 | v 42 9 350 | v 43 9 351 | v 44 1 
352 | v 45 1 353 | v 46 9 354 | v 47 1 355 | v 48 1 356 | v 49 1 357 | v 50 1 358 | v 51 1 359 | v 52 8 360 | v 53 1 361 | v 54 1 362 | v 55 8 363 | v 56 8 364 | v 57 8 365 | v 58 1 366 | v 59 8 367 | v 60 1 368 | v 61 8 369 | v 62 1 370 | v 63 8 371 | v 64 1 372 | v 65 8 373 | v 66 1 374 | v 67 8 375 | v 68 1 376 | v 69 8 377 | v 70 1 378 | v 71 9 379 | v 72 9 380 | v 73 9 381 | v 74 9 382 | v 75 9 383 | v 76 9 384 | v 77 9 385 | v 78 9 386 | v 79 9 387 | v 80 1 388 | v 81 1 389 | v 82 1 390 | v 83 9 391 | v 84 1 392 | v 85 1 393 | v 86 1 394 | v 87 9 395 | v 88 9 396 | v 89 1 397 | v 90 1 398 | v 91 1 399 | v 92 9 400 | v 93 1 401 | v 94 1 402 | v 95 1 403 | v 96 9 404 | v 97 1 405 | v 98 1 406 | v 99 1 407 | v 100 9 408 | v 101 1 409 | v 102 1 410 | v 103 1 411 | v 104 9 412 | v 105 9 413 | v 106 1 414 | v 107 1 415 | v 108 1 416 | v 109 9 417 | v 110 1 418 | v 111 1 419 | v 112 1 420 | v 113 9 421 | v 114 9 422 | v 115 9 423 | v 116 9 424 | v 117 1 425 | v 118 1 426 | v 119 1 427 | v 120 1 428 | v 121 9 429 | v 122 1 430 | v 123 1 431 | v 124 8 432 | v 125 8 433 | v 126 8 434 | v 127 8 435 | v 128 8 436 | v 129 8 437 | v 130 8 438 | v 131 1 439 | v 132 1 440 | v 133 8 441 | v 134 1 442 | v 135 1 443 | v 136 8 444 | v 137 1 445 | v 138 1 446 | v 139 8 447 | v 140 1 448 | v 141 1 449 | e 0 1 0 450 | e 1 2 0 451 | e 1 3 0 452 | e 1 4 0 453 | e 2 5 0 454 | e 2 6 0 455 | e 2 7 0 456 | e 5 8 0 457 | e 8 9 0 458 | e 8 10 0 459 | e 8 11 0 460 | e 9 12 0 461 | e 9 13 0 462 | e 9 14 0 463 | e 12 15 0 464 | e 12 16 0 465 | e 12 17 0 466 | e 15 18 0 467 | e 18 19 0 468 | e 19 20 0 469 | e 19 21 0 470 | e 19 22 0 471 | e 20 23 0 472 | e 23 24 0 473 | e 23 25 0 474 | e 23 26 0 475 | e 24 27 0 476 | e 24 28 0 477 | e 24 29 0 478 | e 27 30 0 479 | e 30 31 0 480 | e 30 32 0 481 | e 30 33 0 482 | e 31 34 0 483 | e 34 35 0 484 | e 34 36 0 485 | e 34 37 0 486 | e 35 38 0 487 | e 35 39 0 488 | e 35 40 0 489 | e 38 41 0 490 | e 41 42 0 491 | e 42 43 0 492 | e 42 44 0 493 | e 42 45 0 
494 | e 43 46 0 495 | e 43 47 0 496 | e 43 48 0 497 | e 46 49 0 498 | e 46 50 0 499 | e 46 51 0 500 | e 0 52 0 501 | e 0 53 0 502 | e 0 54 0 503 | e 5 55 0 504 | e 5 56 0 505 | e 15 57 0 506 | e 15 58 0 507 | e 18 59 0 508 | e 18 60 0 509 | e 20 61 0 510 | e 20 62 0 511 | e 27 63 0 512 | e 27 64 0 513 | e 31 65 0 514 | e 31 66 0 515 | e 38 67 0 516 | e 38 68 0 517 | e 41 69 0 518 | e 41 70 0 519 | e 71 72 0 520 | e 72 73 0 521 | e 72 74 0 522 | e 72 75 0 523 | e 73 76 0 524 | e 73 77 0 525 | e 73 78 0 526 | e 75 79 0 527 | e 79 80 0 528 | e 79 81 0 529 | e 79 82 0 530 | e 75 83 0 531 | e 83 84 0 532 | e 83 85 0 533 | e 83 86 0 534 | e 74 87 0 535 | e 74 88 0 536 | e 88 89 0 537 | e 88 90 0 538 | e 88 91 0 539 | e 71 92 0 540 | e 92 93 0 541 | e 92 94 0 542 | e 92 95 0 543 | e 71 96 0 544 | e 96 97 0 545 | e 96 98 0 546 | e 96 99 0 547 | e 75 100 0 548 | e 100 101 0 549 | e 100 102 0 550 | e 100 103 0 551 | e 74 104 0 552 | e 71 105 0 553 | e 105 106 0 554 | e 105 107 0 555 | e 105 108 0 556 | e 77 109 0 557 | e 109 110 0 558 | e 109 111 0 559 | e 109 112 0 560 | e 77 113 0 561 | e 77 114 0 562 | e 78 115 0 563 | e 78 116 0 564 | e 78 117 0 565 | e 116 118 0 566 | e 116 119 0 567 | e 116 120 0 568 | e 76 121 0 569 | e 76 122 0 570 | e 76 123 0 571 | e 87 124 0 572 | e 87 125 0 573 | e 87 126 0 574 | e 104 127 0 575 | e 104 128 0 576 | e 104 129 0 577 | e 113 130 0 578 | e 113 131 0 579 | e 113 132 0 580 | e 115 133 0 581 | e 115 134 0 582 | e 115 135 0 583 | e 114 136 0 584 | e 114 137 0 585 | e 114 138 0 586 | e 121 139 0 587 | e 121 140 0 588 | e 121 141 0 -------------------------------------------------------------------------------- /gaston_py/factory.py: -------------------------------------------------------------------------------- 1 | 2 | from gaston_py.node import Node 3 | from gaston_py.path import Path 4 | from gaston_py.tree import Tree 5 | from gaston_py.cycle import Cycle 6 | import gaston_py.graph as graph_module 7 | import gaston_py.embedding as 
def initial_node_fragments(graphs):
    """ Creates initial node fragments from networkx graphs. """
    # One Node fragment per (graph, node) pair, produced lazily.
    return (Node(node_id, source_graph)
            for source_graph in graphs
            for node_id in source_graph)


def apply_refinement(prev_fragment, edge, dont_generate_cycles, dont_generate_trees):
    """
    Create a new fragment by applying a refinement to a frequently occurring fragment.

    `edge` is an (origin_id, target_id) pair. The kind of fragment produced
    depends on the previous fragment and on the target node:

        * Node fragment                          -> path
        * target already in the current graph    -> cycle (unless disabled)
        * Path, origin is the path's back node   -> longer path
        * Path, any other origin                 -> tree (unless disabled)
        * Tree                                   -> tree (unless disabled)

    Returns None when no rule applies, when the applicable rule is disabled,
    or when the helper that builds the fragment returns None.
    Otherwise, returns a new fragment.
    """
    origin_id, target_id = edge

    # A single node can only grow into a path.
    if isinstance(prev_fragment, Node):
        return _create_path_from_node(prev_fragment, target_id)

    # Connecting two nodes that both belong to the fragment closes a cycle.
    if target_id in prev_fragment.current_graph:
        if dont_generate_cycles:
            return None
        return _create_cycle(prev_fragment, origin_id, target_id)

    if isinstance(prev_fragment, Path):
        # Extending from the back node keeps the fragment a path.
        # (Prepending at the source node is currently not attempted.)
        if origin_id == prev_fragment.back_node_id:
            return _append_to_path(prev_fragment, target_id)

        # Branching off anywhere else turns the path into a tree.
        if dont_generate_trees:
            return None
        return _create_tree(prev_fragment, origin_id, target_id)

    # Growing a tree yields another tree.
    if isinstance(prev_fragment, Tree) and not dont_generate_trees:
        return _create_tree(prev_fragment, origin_id, target_id)

    return None


def _create_path_from_node(node_fragment, appending_node_id):
    """
    Build a two-node Path fragment from a Node fragment and a neighbouring node.

    Returns None when the neighbour's label sorts before the start node's label,
    or when embedding.create_embedding_list_if_unique returns None
    (presumably because the same path is produced starting from the other
    end -- see gaston_py.embedding).
    """
    source_node_id, source_graph = node_fragment.source_node_id, node_fragment.source_graph

    start_node_label = source_graph.node[source_node_id]['label']
    appending_node_label = source_graph.node[appending_node_id]['label']
    edge_label = source_graph.edge[source_node_id][appending_node_id]['label']

    # Refinement is only allowed towards an equal or greater node label.
    if appending_node_label < start_node_label:
        return None

    path_graph = graph_module.create_nx_graph(source_node_id, start_node_label,
                                              appending_node_id, appending_node_label,
                                              edge_label)

    # The embedding is read from the original node; the alternative reading
    # starts at the newly appended node.
    embedding_list = embedding.create_embedding_list_if_unique(path_graph,
                                                               source_id=source_node_id,
                                                               alt_source_id=appending_node_id)
    if embedding_list is not None:
        # Path symmetries are not computed; all three are fixed at 0.
        return Path(source_node_id, appending_node_id, path_graph,
                    source_graph, embedding_list,
                    total_symmetry=0, front_symmetry=0, back_symmetry=0)

    return None
def _append_to_path(prev_path, new_back_id):
    """
    Extend a Path fragment by one node attached to its back node.

    Returns the longer Path, or None when
    embedding.create_embedding_list_if_unique returns None.

    NOTE: symmetry-based pruning of path refinements is not implemented here;
    the symmetry values of the new Path are all fixed at 0.
    """
    source_graph = prev_path.source_graph
    old_back_id = prev_path.back_node_id

    target_node_label = source_graph.node[new_back_id]['label']
    target_edge_label = source_graph.edge[old_back_id][new_back_id]['label']

    longer_graph = graph_module.extend_nx_graph(prev_path.current_graph,
                                                old_back_id, new_back_id,
                                                target_node_label, target_edge_label)

    # The embedding is read from the path's source node; the alternative
    # reading starts at the new back node.
    embedding_list = embedding.create_embedding_list_if_unique(longer_graph,
                                                               source_id=prev_path.source_node_id,
                                                               alt_source_id=new_back_id)
    if embedding_list is not None:
        return Path(prev_path.source_node_id, new_back_id, longer_graph,
                    source_graph, embedding_list,
                    total_symmetry=0, front_symmetry=0, back_symmetry=0)

    return None
def _prepend_node_to_path(prev_path, new_node_id):
    """
    Extend a Path fragment by one node attached to its source node.

    The source and back node ids of the returned Path are those of prev_path.
    Returns None when embedding.create_embedding_list_if_unique returns None
    or when the previous path's embedding compares greater than the new one.

    NOTE: symmetry-based pruning is not implemented; the symmetry values of
    the new Path are all fixed at 0.
    """
    source_graph = prev_path.source_graph
    front_id = prev_path.source_node_id

    new_node_label = source_graph.node[new_node_id]['label']
    new_edge_label = source_graph.edge[new_node_id][front_id]['label']

    # The edge is added as (front, new node); this order would be wrong for
    # a directed graph.
    longer_graph = graph_module.extend_nx_graph(prev_path.current_graph, front_id,
                                                new_node_id, new_node_label, new_edge_label)

    # The embedding is read from the previous start node; the alternative
    # reading starts at the newly attached node.
    embedding_list = embedding.create_embedding_list_if_unique(longer_graph,
                                                               source_id=front_id,
                                                               alt_source_id=new_node_id)

    if embedding_list is not None and not (prev_path.embedding_list > embedding_list):
        return Path(front_id, prev_path.back_node_id, longer_graph,
                    source_graph, embedding_list,
                    total_symmetry=0, front_symmetry=0, back_symmetry=0)

    return None


def _create_tree(prev_fragment, origin_id, target_id):
    """
    Build a Tree fragment by attaching target_id to origin_id of prev_fragment.

    Returns None when embedding.create_embedding_list_if_unique returns None.
    """
    source_graph = prev_fragment.source_graph

    new_node_label = source_graph.node[target_id]['label']
    new_edge_label = source_graph.edge[origin_id][target_id]['label']

    tree_graph = graph_module.extend_nx_graph(prev_fragment.current_graph, origin_id, target_id,
                                              new_node_label, new_edge_label)

    root_id = prev_fragment.source_node_id
    embedding_list = embedding.create_embedding_list_if_unique(tree_graph,
                                                               source_id=root_id,
                                                               alt_source_id=target_id)
    if embedding_list is not None:
        return Tree(root_id, tree_graph, source_graph, embedding_list)

    return None
def _create_cycle(prev_fragment, origin_id, target_id):
    """
    Build a Cycle fragment by adding the edge (origin_id, target_id) to prev_fragment.

    apply_refinement calls this when target_id is already part of the
    fragment's current graph. Returns None when
    embedding.create_embedding_list_if_unique returns None.
    """
    source_graph = prev_fragment.source_graph

    new_node_label = source_graph.node[target_id]['label']
    new_edge_label = source_graph.edge[origin_id][target_id]['label']

    # extend_nx_graph also (re-)adds the target node with its label before
    # adding the closing edge.
    cyclic_graph = graph_module.extend_nx_graph(prev_fragment.current_graph, origin_id, target_id,
                                                new_node_label, new_edge_label)

    root_id = prev_fragment.source_node_id
    embedding_list = embedding.create_embedding_list_if_unique(cyclic_graph,
                                                               source_id=root_id,
                                                               alt_source_id=target_id)
    if embedding_list is not None:
        return Cycle(root_id, cyclic_graph, source_graph, embedding_list)

    return None